In [3]:
import os
from dotenv import load_dotenv
from huggingface_hub import login

# Load environment variables from hf_token.env file
load_dotenv('hf_token.env')

# Get token from environment variable
token = os.getenv('HUGGINGFACE_TOKEN')
if token:
    login(token=token)
    print("Successfully logged in to Hugging Face Hub")
else:
    print("Please set HUGGINGFACE_TOKEN in hf_token.env file")

Successfully logged in to Hugging Face Hub


#### Raw Transformers API

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-4-mini-instruct", torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-4-mini-instruct")

In [None]:
prompt = "What is the capital of France?"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

#### Pipeline

In [None]:
from transformers import pipeline

pipeline = pipeline("text-generation", model="microsoft/Phi-4-mini-instruct", device="cpu")

prompt = "What is the capital of France?"
outputs = pipeline(prompt, max_new_tokens=100)
print(outputs[0]["generated_text"])

#### Trainer

Complete training and evaluation loop for Pytorch models.  
You only need a model, dataset, a preprocessor, and a data collator to build batches of data from the dataset.

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset

model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-4-mini-instruct", torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-4-mini-instruct")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
dataset = load_dataset("rotten_tomatoes")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})


In [5]:
# Print top 10 values from train dataset
print("Top 10 values from train dataset:")
top_10 = dataset["train"][:10]
for i, (txt, lbl) in enumerate(zip(top_10["text"], top_10["label"])):
    print (f"{i}: label={lbl} text={txt}")

Top 10 values from train dataset:
0: label=1 text=the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .
1: label=1 text=the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth .
2: label=1 text=effective but too-tepid biopic
3: label=1 text=if you sometimes like to go to the movies to have fun , wasabi is a good place to start .
4: label=1 text=emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one .
5: label=1 text=the film provides some great insight into the neurotic mindset of all comics -- even those who have reached the absolute top of the game .
6: label=1 text=offers that rare combination of entertainment and education .
7: label=1 text

In [6]:
# Print top 10 values from test dataset
print("Top 10 values from test dataset:")
top_10 = dataset["test"][:10]
for i, (txt, lbl) in enumerate(zip(top_10["text"], top_10["label"])):
    print (f"{i}: label={lbl} text={txt}")

Top 10 values from test dataset:
0: label=1 text=lovingly photographed in the manner of a golden book sprung to life , stuart little 2 manages sweetness largely without stickiness .
1: label=1 text=consistently clever and suspenseful .
2: label=1 text=it's like a " big chill " reunion of the baader-meinhof gang , only these guys are more harmless pranksters than political activists .
3: label=1 text=the story gives ample opportunity for large-scale action and suspense , which director shekhar kapur supplies with tremendous skill .
4: label=1 text=red dragon " never cuts corners .
5: label=1 text=fresnadillo has something serious to say about the ways in which extravagant chance can distort our perspective and throw us off the path of good sense .
6: label=1 text=throws in enough clever and unexpected twists to make the formula feel fresh .
7: label=1 text=weighty and ponderous but every bit as filling as the treat of the title .
8: label=1 text=a real audience-pleaser that will strike 

In [12]:
# Create a function to tokenize the text and convert it into PyTorch tensors.

def tokenize_dataset(dataset):
    return tokenizer(dataset["text"])
dataset = dataset.map(tokenize_dataset, batched=True)

Map:   0%|          | 0/8530 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [13]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1066
    })
})


In [15]:
# Print top 10 values from train pytorch dataset
print("Top 10 values from train pytorch dataset:")
top_10 = dataset["train"][:10]
for i, (txt, lbl, input_ids, attention_mask) in enumerate(zip(top_10["text"], top_10["label"], top_10["input_ids"], top_10["attention_mask"])):
    print (f"{i}: label={lbl} text={txt} input_ids={input_ids} attention_mask={attention_mask}")

Top 10 values from train pytorch dataset:
0: label=1 text=the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . input_ids=[3086, 8707, 382, 96084, 316, 413, 290, 220, 2040, 302, 14015, 885, 620, 392, 406, 270, 392, 326, 484, 19016, 2966, 316, 1520, 261, 58030, 1952, 10740, 1572, 170183, 1280, 165960, 180310, 1366, 137348, 12, 16530, 527, 1164, 4818, 1047, 503, 2310, 1066, 3055, 280, 887] attention_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
1: label=1 text=the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth . input_ids=[3086, 131574, 10701, 65742, 74139, 328, 392, 290, 68243, 328, 290, 35188, 392

In [16]:
# Load a data collator to create batches of data and pass the tokenizer to it.

from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [17]:
# Next, set up TrainingArguments with the training features and hyperparameters.

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="phi4-rotten-tomatoes",
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    push_to_hub=True,
)

In [18]:
# pass all these separate components to Trainer and call train() to start.

from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

  trainer = Trainer(


ValueError: Expected input batch_size (312) to match target batch_size (8).

In [None]:
# Share your model and tokenizer to the Hub with push_to_hub().
trainer.push_to_hub()