# LLM Model for Sentiment Analysis - T5 (Text-to-Text Transfer Transformer)

# Library Imports

In [1]:
# pip install transformers
# pip install datasets
# pip install sentencepiece
# pip install evaluate

import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    T5ForSequenceClassification, # T5 for specifically classification task
    TrainingArguments,
    Trainer
)
import evaluate
import torch
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# check device type
if torch.cuda.is_available():
    device = torch.device("cuda")   # NVIDIA GPU
elif torch.backends.mps.is_available():
    device = torch.device("mps")    # Apple Silicon GPU
else:
    device = torch.device("cpu")    # fallback

print("Using device:", device)

Using device: mps


# Data Loading and Train-Test-Split

In [3]:
df = pd.read_csv("DisneylandReviews.csv",
                 encoding = 'latin-1')
df.head()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong


In [4]:
##### USE THIS TO TEST SMALLER SAMPLE #######
df = df.sample(n = 10000, 
               random_state=42)

In [5]:
# keep the text and label
df = dict(df[['Review_Text', 'Rating']])

In [6]:
# convert to Hugging Face dataset
dataset = Dataset.from_dict(df)
dataset

Dataset({
    features: ['Review_Text', 'Rating'],
    num_rows: 10000
})

# Preprocessing

In [7]:
model_name = 'google-t5/t5-small'

In [8]:
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [9]:
def preprocess_function(reviews):

    # task
    inputs = [f"classify sentiment: {review}" for review in reviews['Review_Text']]

    # tokenize inputs
    model_inputs = tokenizer(inputs,
                             padding = True,
                             truncation = True,
                             max_length = 128)
    
    # append labels
    model_inputs['labels'] = reviews['Rating']

    return model_inputs

# apply preprocessing
tokenized_datasets = dataset.map(preprocess_function,
                                 batched = True)

Map: 100%|██████████| 10000/10000 [00:01<00:00, 5114.50 examples/s]


In [10]:
# train test split
tokenized_datasets = tokenized_datasets.train_test_split(test_size = 0.2)
train_dataset = tokenized_datasets['train']
eval_dataset = tokenized_datasets['test']

# T5 (Text-to-Text Transfer Transformer) Model

In [11]:
t5_model = T5ForSequenceClassification.from_pretrained(model_name, 
                                                      num_labels = 5).to(device)

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at google-t5/t5-small and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Training Arguments

In [15]:
training_args = TrainingArguments(
    output_dir = "./results",
    eval_strategy = "steps",
    eval_steps = 50,
    save_steps = 100,
    save_total_limit = 2,
    logging_steps = 10,
    learning_rate = 1e-4,
    per_device_train_batch_size = 8, # small batch to fit GPU
    per_device_eval_batch_size = 8,
    gradient_accumulation_steps = 4, # effective batch size = train batch size x gradient accumulation steps (8 x 4 = 32)
    num_train_epochs = 1,
    weight_decay = 0.01,
    report_to = 'none' # avoids unnecessary logging
 )

## Trainer

In [16]:
trainer = Trainer(
    model = t5_model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    tokenizer = tokenizer
)


  trainer = Trainer(


## Model Train

In [17]:
%%time
trainer.train()

Step,Training Loss,Validation Loss
50,0.5665,0.515965
100,0.454,0.510038
150,0.4065,0.514912
200,0.4883,0.50311
250,0.4865,0.502424


CPU times: user 9min 40s, sys: 2min 20s, total: 12min
Wall time: 10min 45s


TrainOutput(global_step=250, training_loss=0.49530706405639646, metrics={'train_runtime': 645.7282, 'train_samples_per_second': 12.389, 'train_steps_per_second': 0.387, 'total_flos': 272313120768000.0, 'train_loss': 0.49530706405639646, 'epoch': 1.0})

In [18]:
trainer.save_model("./saved_t5_model")
tokenizer.save_pretrained("./saved_t5_model")

('./saved_t5_model/tokenizer_config.json',
 './saved_t5_model/special_tokens_map.json',
 './saved_t5_model/spiece.model',
 './saved_t5_model/added_tokens.json',
 './saved_t5_model/tokenizer.json')

## Model Inference

In [19]:
test_sentence = "I hate this amusement park."

# tokenize input
inputs = tokenizer("Classify sentiment: " + test_sentence, 
                   return_tensors="pt").to(device)

# get outputs and predicted label (highest probability)
with torch.no_grad():
    outputs = t5_model(**inputs)
    pred = outputs.logits.argmax(dim=-1).item()

print("Predicted label:", pred)

Predicted label: 4
