# LLM Model for Sentiment Analysis - T5 (Text-to-Text Transfer Transformer)

# Library Imports

In [1]:
# pip install transformers
# pip install datasets
# pip install sentencepiece
# pip install evaluate

import pandas as pd
import numpy as np

from datasets import Dataset

from transformers import (
    AutoTokenizer,
    T5ForSequenceClassification, # T5 for specifically classification task
    TrainingArguments,
    Trainer
)

import evaluate

import torch
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# check device type
if torch.cuda.is_available():
    device = torch.device("cuda")   # NVIDIA GPU
elif torch.backends.mps.is_available():
    device = torch.device("mps")    # Apple Silicon GPU
else:
    device = torch.device("cpu")    # fallback

print("Using device:", device)

Using device: mps


# Data Loading and Train-Test-Split

In [3]:
df = pd.read_csv("DisneylandReviews.csv",
                 encoding = 'latin-1')
df.head()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong


In [4]:
# check for any label imbalance
df['Rating'].value_counts()

Rating
5    23146
4    10775
3     5109
2     2127
1     1499
Name: count, dtype: int64

In [5]:
##### USE THIS TO TEST SMALLER SAMPLE #######

# sample size
sample_size = 1000

# 5 labels, so divide sample size for equal label proportions
df = df.groupby('Rating').sample(n = int(sample_size / 5), 
               random_state=42)

In [6]:
# keep the text and label
df = dict(df[['Review_Text', 'Rating']])

In [7]:
# convert to Hugging Face dataset
dataset = Dataset.from_dict(df)
dataset

Dataset({
    features: ['Review_Text', 'Rating'],
    num_rows: 1000
})

# Preprocessing

In [8]:
model_name = 'google-t5/t5-small'

In [9]:
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [10]:
def preprocess_function(reviews):

    # task
    inputs = [f"classify sentiment: {review}" for review in reviews['Review_Text']]

    # tokenize inputs
    model_inputs = tokenizer(inputs,
                             padding = True,
                             truncation = True,
                             max_length = 128)
    
    # append labels
    model_inputs['labels'] = reviews['Rating']

    return model_inputs

# apply preprocessing
tokenized_datasets = dataset.map(preprocess_function,
                                 batched = True)

Map: 100%|██████████| 1000/1000 [00:00<00:00, 1358.33 examples/s]


In [11]:
# train test split
tokenized_datasets = tokenized_datasets.train_test_split(test_size = 0.2)
train_dataset = tokenized_datasets['train']
eval_dataset = tokenized_datasets['test']

# T5 (Text-to-Text Transfer Transformer) Model

In [12]:
t5_model = T5ForSequenceClassification.from_pretrained(model_name, 
                                                      num_labels = 5).to(device)

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at google-t5/t5-small and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Evaluation Metrics

In [13]:
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

In [14]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred

    # Ensure logits are a clean numpy array
    if isinstance(logits, tuple):   # sometimes logits is a tuple
        logits = logits[0]

    logits = np.array(logits)       # force into ndarray
    labels = np.array(labels)

    preds = np.argmax(logits, axis=-1)

    acc = accuracy.compute(predictions=preds, references=labels)
    prec = precision.compute(predictions=preds, references=labels, average="weighted")
    rec = recall.compute(predictions=preds, references=labels, average="weighted")
    f1_score = f1.compute(predictions=preds, references=labels, average="weighted")

    return {
        "accuracy": acc["accuracy"],
        "precision": prec["precision"],
        "recall": rec["recall"],
        "f1": f1_score["f1"]
    }

## Training Arguments

In [15]:
training_args = TrainingArguments(
    output_dir = "./results",
    eval_strategy = "steps",
    eval_steps = 10,
    save_steps = 20,
    save_total_limit = 2,
    logging_steps = 10,
    learning_rate = 2e-5,
    per_device_train_batch_size = 8, # small batch to fit GPU
    per_device_eval_batch_size = 8,
    gradient_accumulation_steps = 4, # effective batch size = train batch size x gradient accumulation steps (8 x 4 = 32)
    num_train_epochs = 1,
    weight_decay = 0.01,
    report_to = 'none' # avoids unnecessary logging
 )

## Trainer

In [16]:
trainer = Trainer(
    model = t5_model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    tokenizer = tokenizer,
    compute_metrics = compute_metrics
)


  trainer = Trainer(


## Model Train

In [17]:
%%time
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
10,1.3473,1.272316,0.22,0.078352,0.22,0.09777
20,1.2865,1.26375,0.22,0.083465,0.22,0.107596


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


CPU times: user 1min 15s, sys: 22.7 s, total: 1min 38s
Wall time: 1min 27s


TrainOutput(global_step=25, training_loss=1.2878399276733399, metrics={'train_runtime': 87.7101, 'train_samples_per_second': 9.121, 'train_steps_per_second': 0.285, 'total_flos': 27231312076800.0, 'train_loss': 1.2878399276733399, 'epoch': 1.0})

## Results

In [18]:
trainer.evaluate()

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


{'eval_loss': 1.2623703479766846,
 'eval_accuracy': 0.23,
 'eval_precision': 0.09126470588235294,
 'eval_recall': 0.23,
 'eval_f1': 0.11565266316579147,
 'eval_runtime': 6.1236,
 'eval_samples_per_second': 32.661,
 'eval_steps_per_second': 4.083,
 'epoch': 1.0}

In [19]:
trainer.save_model("./saved_t5_model")
tokenizer.save_pretrained("./saved_t5_model")

('./saved_t5_model/tokenizer_config.json',
 './saved_t5_model/special_tokens_map.json',
 './saved_t5_model/spiece.model',
 './saved_t5_model/added_tokens.json',
 './saved_t5_model/tokenizer.json')

## Model Inference

In [20]:
test_sentence = "I hate this amusement park."

# tokenize input
inputs = tokenizer("Classify sentiment: " + test_sentence, 
                   return_tensors="pt").to(device)

# get outputs and predicted label (highest probability)
with torch.no_grad():
    outputs = t5_model(**inputs)
    pred = outputs.logits.argmax(dim = -1).item()

print("Predicted label:", pred)

Predicted label: 2
