# Preparing for fine-tunning

## Tokenizing text

In [6]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from datasets import load_dataset 

train_data = load_dataset("imdb", split="train") 
train_data = train_data.shard(num_shards=4, index=0) 
test_data = load_dataset("imdb", split="test") 
test_data = test_data.shard(num_shards=4, index=0)

# Load the model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased") 
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the data
tokenized_training_data = tokenizer(str(train_data["text"]), return_tensors="pt", padding=True, truncation=True, max_length=64)
tokenized_test_data = tokenizer(str(test_data["text"]), return_tensors="pt", padding=True, truncation=True, max_length=64)

print(tokenized_training_data)
print(tokenized_test_data)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'input_ids': tensor([[  101,  5930,  1006,  1031,  1005,  1045, 12524,  1045,  2572,  8025,
          1011,  3756,  2013,  2026,  2678,  3573,  2138,  1997,  2035,  1996,
          6704,  2008,  5129,  2009,  2043,  2009,  2001,  2034,  2207,  1999,
          3476,  1012,  1045,  2036,  2657,  2008,  2012,  2034,  2009,  2001,
          8243,  2011,  1057,  1012,  1055,  1012,  8205,  2065,  2009,  2412,
          2699,  2000,  4607,  2023,  2406,  1010,  3568,  2108,  1037,  5470,
          1997,  3152,  2641,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

## Mapping tokenization

In [7]:
# Complete the function
def tokenize_function(data):
    return tokenizer(data["text"], 
                     return_tensors="pt", 
                     padding=True, 
                     truncation=True, 
                     max_length=64)

tokenized_in_batches = train_data.map(tokenize_function, batched=True)

print(tokenized_in_batches)

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 6250
})


In [8]:
# Complete the function
def tokenize_function(data):
    return tokenizer(data["text"], 
                     return_tensors="pt", 
                     padding=True, 
                     truncation=True, 
                     max_length=64)

# Tokenize row by row
tokenized_by_row = train_data.map(tokenize_function, batched=False)

print(tokenized_by_row)

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 6250
})


# Fine-tunning through training

## Setting up training arguments

In [9]:
from transformers import Trainer,  TrainingArguments

# Set up an instance of TrainingArguments
training_args = TrainingArguments(   
  output_dir="./finetuned",
  eval_strategy="epoch",
  num_train_epochs=3,   
  learning_rate=2e-5,     
  per_device_train_batch_size=8,   
  per_device_eval_batch_size=8,
  weight_decay=0.01, 
)

## Setting up the trainer

In [10]:
# Prepare the datasets properly
def tokenize_and_format(examples):
    # Tokenize the texts
    tokenized = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=64,
        return_tensors=None  # We don't want PT tensors yet
    )
    
    # Add labels
    tokenized["labels"] = examples["label"]
    
    return tokenized

# Apply the tokenization to our datasets
tokenized_train = train_data.map(tokenize_and_format, batched=True)
tokenized_test = test_data.map(tokenize_and_format, batched=True)

# Remove unnecessary columns and set format
tokenized_train = tokenized_train.remove_columns(["text"])
tokenized_test = tokenized_test.remove_columns(["text"])

# Set the format of our datasets to PyTorch
tokenized_train.set_format("torch")
tokenized_test.set_format("torch")

Map:   0%|          | 0/6250 [00:00<?, ? examples/s]

Map:   0%|          | 0/6250 [00:00<?, ? examples/s]

In [11]:
# Set up the trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=lambda pred: {
        "accuracy": (pred.predictions.argmax(-1) == pred.label_ids).mean().item()
    }
)

# Train the model
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0067,2.9e-05,1.0
2,0.0,1.4e-05,1.0
3,0.0,1.1e-05,1.0


  trainer = Trainer(


Epoch,Training Loss,Validation Loss




  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0067,2.9e-05,1.0
2,0.0,1.4e-05,1.0
3,0.0,1.1e-05,1.0




TrainOutput(global_step=2346, training_loss=0.001443281663559498, metrics={'train_runtime': 2134.137, 'train_samples_per_second': 8.786, 'train_steps_per_second': 1.099, 'total_flos': 616666536000000.0, 'train_loss': 0.001443281663559498, 'epoch': 3.0})

## Using the fine-tuned model

In [14]:
import torch

input_text = ["I'd just like to say, I love the product! Thank you!"]

# Tokenize the new data
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

# Pass the tokenized inputs through the model
with torch.no_grad():
    outputs = model(**inputs)

# Extract the new predictions
predicted_labels = torch.argmax(outputs.logits, dim=1).tolist()

label_map = {0: "Low risk", 1: "High risk"}
for i, predicted_label in enumerate(predicted_labels):
    churn_label = label_map[predicted_label]
    print(f"\n Input Text {i + 1}: {input_text[i]}")
    print(f"Predicted Label: {predicted_label}")


 Input Text 1: I'd just like to say, I love the product! Thank you!
Predicted Label: 0


# Fine-tuning approaches

## Transfer learning with one-shot learning

In [16]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load GPT-2 model and tokenizer for text generation
model_name = "gpt2"  # you can also use "gpt2-medium" or "gpt2-large" for better results
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# We need to set the pad token to the eos token for GPT-2
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

In [18]:
# Include an example in the input text
input_text = """
Text: "The dinner we had was great and the service too."
Classify the sentiment of this sentence as either positive or negative.
Example:
Text: "The food was delicious"
Sentiment: Positive
Text: "The dinner we had was great and the service too."
Sentiment:"""

# Tokenize the input
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

# Generate the completion
outputs = model.generate(
    inputs["input_ids"],
    max_length=len(inputs["input_ids"][0]) + 20,  # Allow space for the answer
    temperature=0.7,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
    num_return_sequences=1
)

# Decode and print the result
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Complete response:")
print(generated_text)

# Extract just the sentiment part (after the last "Sentiment:")
sentiment = generated_text.split("Sentiment:")[-1].strip()
print("\nExtracted sentiment:", sentiment)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Complete response:

Text: "The dinner we had was great and the service too."
Classify the sentiment of this sentence as either positive or negative.
Example:
Text: "The food was delicious"
Sentiment: Positive
Text: "The dinner we had was great and the service too."
Sentiment: Negative
Text: "The dinner we had was great and the service too."
Subject: "

Extracted sentiment: Negative
Text: "The dinner we had was great and the service too."
Subject: "
