# LLM Model for Sentiment Analysis - T5 (Text-to-Text Transfer Transformer)

# Library Imports

In [28]:
# pip install transformers
# pip install datasets
# pip install sentencepiece
# pip install evaluate

import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import T5Tokenizer # tokenizer
from transformers import T5ForConditionalGeneration # model
from transformers import Trainer, TrainingArguments # training
import evaluate
import torch

# Data Loading and Train-Test-Split

In [12]:
df = pd.read_csv("DisneylandReviews.csv",
                 encoding = 'latin-1')
df.head()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong


In [None]:
# convert to Hugging Face dataset
dataset = Dataset.from_pandas(df)

In [21]:
# train-test split

# 80-20 split train-test
train_valid = dataset.train_test_split(test_size = 0.2,
                                                seed = 42)

# 50-50 split validation-test
test_valid = train_valid['test'].train_test_split(test_size = 0.5,
                                            seed = 42)

# full dataset
full_data = DatasetDict({
    'train': train_valid['train'], # training dataset (80%)
    'validation': test_valid['train'], # validation dataset (10%)
    'test': test_valid['test'] # testing set
})

In [22]:
# inspect
full_data

DatasetDict({
    train: Dataset({
        features: ['Review_ID', 'Rating', 'Year_Month', 'Reviewer_Location', 'Review_Text', 'Branch'],
        num_rows: 34124
    })
    validation: Dataset({
        features: ['Review_ID', 'Rating', 'Year_Month', 'Reviewer_Location', 'Review_Text', 'Branch'],
        num_rows: 4266
    })
    test: Dataset({
        features: ['Review_ID', 'Rating', 'Year_Month', 'Reviewer_Location', 'Review_Text', 'Branch'],
        num_rows: 4266
    })
})

# Preprocessing

In [19]:
# load tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small')

In [23]:
# define preprocessing function
def preprocess_data(df):

    # task
    inputs = ['Classify sentiment: ' + text for text in df["Review_Text"]]

    targets = [str(label) for label in df['Rating']] # convert labels to string

    # tokenize inputs
    model_inputs = tokenizer(inputs,
                             max_length = 128,
                             truncation = True,
                             padding = 'max_length')
    
    # tokenize outputs
    labels = tokenizer(targets,
                       max_length = 10,
                       truncation = True,
                       padding = 'max_length')
    
    model_inputs['labels'] = labels['input_ids']

    return model_inputs

# get preprocessed dataset
tokenized_datasets = full_data.map(preprocess_data,
                                   batched = True)

Map: 100%|██████████| 34124/34124 [00:23<00:00, 1462.59 examples/s]
Map: 100%|██████████| 4266/4266 [00:02<00:00, 1648.58 examples/s]
Map: 100%|██████████| 4266/4266 [00:02<00:00, 1579.36 examples/s]


# T5 (Text-to-Text Transfer Transformer) Model

In [24]:
t5_model = T5ForConditionalGeneration.from_pretrained('t5-small')

## Evaluation Metrics

In [25]:
accuracy_metric = evaluate.load('accuracy')

Downloading builder script: 4.20kB [00:00, 3.64MB/s]


In [26]:
def compute_metrics(eval_pred):
    preds, labels = eval_pred
    
    # decode predictions
    decoded_preds = tokenizer.batch_decode(preds,
                                          skip_special_tokens = True)
    decoded_labels = tokenizer.batch_decode(labels,
                                           skip_special_tokens = True)
    
    # convert to integers
    decoded_preds = [int(p) for p in decoded_preds]
    decoded_labels = [int(l) for l in decoded_labels]

    return accuracy_metrics.compute(predictions = decoded_preds,
                                    references = decoded_labels)

## Training Arguments

In [30]:
training_args = TrainingArguments(
    output_dir = "./t5-sentiment",
    eval_strategy = "epoch",
    learning_rate = 3e-4,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    weight_decay = 0.01,
    save_total_limit = 2,
    num_train_epochs = 3,
    predict_with_generate = True,
    logging_dir = "./logs",
    fp16 = torch.cuda.is_available()  # enable mixed precision if GPU is available
)

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'predict_with_generate'

## Trainer

In [None]:
trainer = Trainer(
    model = t5_model,
    args = training_args,
    train_dataset = tokenized_datasets["train"],
    eval_dataset = tokenized_datasets["validation"],
    tokenizer = tokenizer,
    compute_metrics = compute_metrics
)

## Model Train

In [None]:
%%time
trainer.train()

## Model Evaluate

In [None]:
results = trainer.evaluate(tokenized_datasets["test"])
results