In [3]:
import os
from dotenv import load_dotenv
from huggingface_hub import login

# Load environment variables from hf_token.env file
load_dotenv('hf_token.env')

# Get token from environment variable
token = os.getenv('HUGGINGFACE_TOKEN')
if token:
    login(token=token)
    print("Successfully logged in to Hugging Face Hub")
else:
    print("Please set HUGGINGFACE_TOKEN in hf_token.env file")

Successfully logged in to Hugging Face Hub


#### Raw Transformers API

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-4-mini-instruct", torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-4-mini-instruct")

In [None]:
prompt = "What is the capital of France?"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

#### Pipeline

In [None]:
from transformers import pipeline

pipeline = pipeline("text-generation", model="microsoft/Phi-4-mini-instruct", device="cpu")

prompt = "What is the capital of France?"
outputs = pipeline(prompt, max_new_tokens=100)
print(outputs[0]["generated_text"])

#### Trainer

Complete training and evaluation loop for Pytorch models.  
You only need a model, dataset, a preprocessor, and a data collator to build batches of data from the dataset.

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset

model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-4-mini-instruct", torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-4-mini-instruct")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
dataset = load_dataset("rotten_tomatoes")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})


In [None]:
# Print top 10 values from train dataset
print("Top 10 values from train dataset:")
print(dataset["train"].select(range(min(10, len(dataset["train"]))))

# print("\nTop 10 values from test dataset:")
# print(dataset["test"].select(range(min(10, len(dataset["test"]))))

SyntaxError: invalid syntax. Perhaps you forgot a comma? (3408154360.py, line 3)

In [None]:
# Create a function to tokenize the text and convert it into PyTorch tensors.

def tokenize_dataset(dataset):
    return tokenizer(dataset["text"])
dataset = dataset.map(tokenize_dataset, batched=True)

In [None]:
# Load a data collator to create batches of data and pass the tokenizer to it.

from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Next, set up TrainingArguments with the training features and hyperparameters.

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="distilbert-rotten-tomatoes",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    push_to_hub=True,
)

In [None]:
# pass all these separate components to Trainer and call train() to start.

from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

In [None]:
# Share your model and tokenizer to the Hub with push_to_hub().
trainer.push_to_hub()