# Sentiment Analysis on IMDb with Soft Prompting [PEFT + T5]

This project fine-tunes a T5-small model on the IMDb dataset using Soft Prompting from the [PEFT library](https://github.com/huggingface/peft).  
The goal is to perform sentiment classification [positive/negative] while training only a small set of soft prompt parameters, making it efficient and lightweight.


# Imports and Device Setup

In [1]:
import subprocess
import sys

result = subprocess.run([sys.executable, '-m', 'pip', 'list'], 
                       capture_output=True, text=True)

print("ML/NLP Related Packages:\n")
keywords = ['transformers', 'torch', 'tensorflow', 'datasets', 'tokenizers', 
            'adapters', 'accelerate', 'evaluate', 'sentencepiece', 'sacremoses']

for line in result.stdout.split('\n'):
    if any(keyword in line.lower() for keyword in keywords):
        print(line)

ML/NLP Related Packages:

accelerate                1.12.0
datasets                  4.4.1
evaluate                  0.4.6
tokenizers                0.22.1
torch                     2.9.1
transformers              4.57.3


In [2]:
# General
import torch

# Dataset
import datasets
from datasets import load_dataset

# Transformers
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
)

# PEFT
from peft import get_peft_model, PromptEncoderConfig

# Evaluation
import evaluate


In [3]:
#Use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


# Load Pre-trained Model

In [4]:
#load the pre-trained t5 model
model = AutoModelForSeq2SeqLM.from_pretrained(
    "google-t5/t5-small",
    torch_dtype=torch.float16
    )

`torch_dtype` is deprecated! Use `dtype` instead!


# Load IMDb Dataset

In [5]:
# load the IMDB splits: train, test and unsupervised (we do not use unsupervised)
dataset = load_dataset("imdb", split=['train', 'test', 'unsupervised'])

# wrap into a DatasetDict for convenience
dataset = datasets.DatasetDict({ "train" : dataset[0], "test" : dataset[1], "unsupervised" : dataset[2] })

# # split the original training set into train and validation
# dataset["train"], dataset["validation"] = dataset["train"].train_test_split(test_size=0.2, seed=42).values()

# Tokenization

In [6]:
# load T5-small tokenizer
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")

# if pad token is missing, use eos token as pad
if tokenizer.pad_token_id is None:
  tokenizer.pad_token_id = tokenizer.eos_token_id



In [7]:
# tokenize a batch of IMDB examples for T5.
def tokenize(batch):

  # tokenize input texts
  model_inputs = tokenizer(batch["text"], padding=False, truncation=True)

  # convert numeric labels to text labels
  label = ["negative" if label == 0 else "positive" for label in batch["label"]]

  # tokenize labels as targets
  labels = tokenizer(text_target=label, truncation=True)
  model_inputs["labels"] = labels["input_ids"]

  return model_inputs

In [8]:
# apply tokenization to all splits
tokenized_ds = dataset.map(
    tokenize,
    batched=True,
    num_proc=4,
    remove_columns=dataset["train"].column_names,
)

In [9]:
train_ds = tokenized_ds["train"] # train dataset
# eval_ds = tokenized_ds["validation"]
test_ds = tokenized_ds["test"]

# Prompt Encoder Setup

In [10]:
num_virtual_tokens = 50

In [11]:
#configure prompt encoder
prompt_config = PromptEncoderConfig(task_type="SEQ_2_SEQ_LM", num_virtual_tokens=num_virtual_tokens, encoder_hidden_size=64)#,intially num_virtual_tokens=20,encoder_hidden_size=128
prompt_model = get_peft_model(model, prompt_config) #integrate model with prompt configuration
prompt_model.print_trainable_parameters() #print parameter statistics

trainable params: 121,472 || all params: 60,628,096 || trainable%: 0.2004


In [12]:
# Data collator and metric
data_collator = DataCollatorForSeq2Seq(tokenizer, model=prompt_model)
from evaluate import load

accuracy = load("accuracy")


# compute accuracy from generated predictions and labels
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # Replace -100 so labels can be decoded
    labels = [[token if token != -100 else tokenizer.pad_token_id for token in label]
              for label in labels ]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    pred_labels = [1 if pred.strip() == "positive" else 0 for pred in decoded_preds]
    true_labels = [1 if lab.strip() == "positive" else 0 for lab in decoded_labels]
    return accuracy.compute(predictions=pred_labels, references=true_labels)



In [14]:
batch_size=128
training_args = Seq2SeqTrainingArguments(
      output_dir="./soft_prompt_imdb_results",
      learning_rate=5e-3,
      per_device_train_batch_size=batch_size,
      gradient_accumulation_steps=4, #accumulate gradients over 4 steps, effective batch size = batch_size * 4
      gradient_checkpointing_kwargs={"use_reentrant": False},
      per_device_eval_batch_size=4,
      fp16=True, #half precision training
      num_train_epochs=5,
      logging_strategy="epoch",
      label_names=["labels"],
      gradient_checkpointing=True, #new #saves memory by recomputing gradients during back prop
      save_strategy="epoch", #checkpoints
      eval_strategy="no",
      # save_steps=50,
      
      # predict_with_generate=True,
        
    )

    
# Initialize Trainer
prompt_trainer = Seq2SeqTrainer(
    model=prompt_model, # Pass the PEFT model to the trainer
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  prompt_trainer = Seq2SeqTrainer(


# Train the Model

In [15]:
prompt_trainer.train()

# Evaluation

In [None]:
results = prompt_trainer.evaluate()


# Conclusion

The model achieves ~87% accuracy on IMDB sentiment classification while training only 0.2% of parameters.

