In [1]:
! pip install torch



#### General library importation and variable initialization

In [2]:
import torch
from torch.utils.data import DataLoader, Dataset
import numpy as np
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
import json
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from itertools import zip_longest

In [3]:
EPOCH = 32
BATCH_SIZE = 64
MAX_INPUT_LENGTH = 65
MAX_LABEL_LENGTH = 8
MODEL_LINK = "google/flan-t5-small"
SEED = 0

In [4]:
def set_seed():
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
set_seed()
tokenizer = T5Tokenizer.from_pretrained(MODEL_LINK, legacy=False)
model = T5ForConditionalGeneration.from_pretrained(MODEL_LINK).to('cuda')

tokenizer_config.json: 0.00B [00:00, ?B/s]

(…)a5b18a05535c9e14c7a355904270e15b0945ea86:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

#### Reading data from datasets and Data cleaning tasks

In [5]:
mapping = {
    2: 'very good',
    1: 'good',
    0: 'neutral',
    -1: 'bad',
    -2: 'very bad'
}

In [6]:
def dataStructured(ds):
    ds.columns = ['review', 'category']
    ds['category'] = ds['category'].replace(mapping).astype(str)
    return ds

In [7]:
original_ds  = dataStructured(pd.read_csv('/kaggle/input/original-sentipers/original.csv'))
test_ds      = dataStructured(pd.read_csv('/kaggle/input/test-sentipers/test.csv'))
balanced_ds  = dataStructured(pd.read_csv('/kaggle/input/balanced-sentipers/balanced.csv'))
translated_ds= dataStructured(pd.read_csv('/kaggle/input/translation-sentipers/translation.csv'))

#### Fine-Tuning requirements for the specified model

In [8]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length4text, max_length4label):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length4text = max_length4text
        self.max_length4label = max_length4label

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        review = self.data['review'].iloc[idx]
        category = self.data['category'].iloc[idx]
        
        # tokenizing input review
        review_encoding = self.tokenizer(review, truncation=True, padding='max_length', max_length=self.max_length4text, return_tensors='pt').to("cuda")
        
        # tokenizing category
        category_encoding = self.tokenizer(category, truncation=True, padding='max_length', max_length=self.max_length4label, return_tensors='pt').to("cuda")
        return {
            # agent review
            'review_input_id': review_encoding['input_ids'].squeeze(),
            'review_attention_mask': review_encoding['attention_mask'].squeeze(),
            # category
            'category_id': category_encoding['input_ids'].squeeze(),
            'category_mask': category_encoding['attention_mask'].squeeze()
        }

In [9]:
# Defining the loss function and optimizer
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

In [10]:
def calculateF1(prediction_list, actual_list):
    # Flatten lists if needed (if you've accidentally appended lists instead of values)
    if isinstance(prediction_list[0], list):
        prediction_list = [item for sublist in prediction_list for item in sublist]
    if isinstance(actual_list[0], list):
        actual_list = [item for sublist in actual_list for item in sublist]
    
    assert len(prediction_list) == len(actual_list), "Length mismatch between predictions and actuals"

    try:
        f1 = f1_score(actual_list, prediction_list, average='macro')  # or 'weighted' depending on class balance
    except Exception as e:
        print(f"Error calculating F1: {e}")
        f1 = 0.0
    
    return f1

def evaluateModel(model, dataLoader, tokenizer):
    model.eval()
    actual_list, prediction_list = [], []

    with torch.no_grad():
        for batch in dataLoader:
            ids = batch['review_input_id']
            mask = batch['review_attention_mask']
            output_id = batch['category_id']

            actuals = tokenizer.batch_decode(output_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            generated_output = model.generate(input_ids=ids, attention_mask=mask, max_length=64)
            preds = tokenizer.batch_decode(generated_output, skip_special_tokens=True, clean_up_tokenization_spaces=True)

            actual_list.extend(actuals)  # Make sure clear_data returns a list of strings/labels
            prediction_list.extend(preds)

    return calculateF1(prediction_list, actual_list)

def trainModel(model, data_loader, vDataLoader, tokenizer, optimizer, EPOCH):
    for epoch in range(EPOCH):
        model.train()
        losses = []

        for batch in data_loader:
            review_input = batch['review_input_id']
            review_attention_mask = batch['review_attention_mask']
            category_id = batch['category_id']

            optimizer.zero_grad()

            output = model(input_ids=review_input, attention_mask=review_attention_mask, labels=category_id)
            loss = output.loss
            losses.append(loss.item())

            loss.backward()
            optimizer.step()
        
        f1 = evaluateModel(model, vDataLoader, tokenizer)
        print(f"Epoch {epoch + 1} - F1 Score: {f1:.4f} - Avg Loss: {sum(losses)/len(losses):.4f}")

In [11]:
# Shuffle and split
original_ds = original_ds.sample(frac = 1, random_state = SEED).reset_index(drop=True)

val_size = int(len(original_ds) * 0.15)
validation_ds = original_ds.iloc[:val_size]
train_ds = original_ds.iloc[val_size:]


In [12]:
# Create a data loader for TRAIN dataframe
train_dataset = CustomDataset(train_ds, tokenizer, max_length4text= MAX_INPUT_LENGTH, max_length4label= MAX_LABEL_LENGTH)
train_data_loader = DataLoader(train_dataset, batch_size= BATCH_SIZE, shuffle=True)

# Create a data loader for EVALUATION dataframe
val_dataset = CustomDataset(validation_ds, tokenizer, max_length4text= MAX_INPUT_LENGTH, max_length4label= MAX_LABEL_LENGTH)
val_data_loader = DataLoader(val_dataset, batch_size= BATCH_SIZE, shuffle=False)

# Create a data loader for TEST dataframe
test_dataset = CustomDataset(test_ds, tokenizer, max_length4text= MAX_INPUT_LENGTH, max_length4label= MAX_LABEL_LENGTH)
test_data_loader = DataLoader(test_dataset, batch_size= BATCH_SIZE, shuffle=False)

In [13]:
trainModel(model, train_data_loader, val_data_loader, tokenizer, optimizer, EPOCH)

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1 - F1 Score: 0.1472 - Avg Loss: 1.1682
Epoch 2 - F1 Score: 0.1677 - Avg Loss: 0.1678
Epoch 3 - F1 Score: 0.1981 - Avg Loss: 0.1648
Epoch 4 - F1 Score: 0.1723 - Avg Loss: 0.1639
Epoch 5 - F1 Score: 0.1725 - Avg Loss: 0.1617
Epoch 6 - F1 Score: 0.1970 - Avg Loss: 0.1612
Epoch 7 - F1 Score: 0.1894 - Avg Loss: 0.1591
Epoch 8 - F1 Score: 0.1799 - Avg Loss: 0.1596
Epoch 9 - F1 Score: 0.1968 - Avg Loss: 0.1582
Epoch 10 - F1 Score: 0.1748 - Avg Loss: 0.1586
Epoch 11 - F1 Score: 0.1802 - Avg Loss: 0.1580
Epoch 12 - F1 Score: 0.1851 - Avg Loss: 0.1553
Epoch 13 - F1 Score: 0.1490 - Avg Loss: 0.1554
Epoch 14 - F1 Score: 0.1997 - Avg Loss: 0.1566
Epoch 15 - F1 Score: 0.2004 - Avg Loss: 0.1577
Epoch 16 - F1 Score: 0.1804 - Avg Loss: 0.1610
Epoch 17 - F1 Score: 0.1229 - Avg Loss: 0.1606
Epoch 18 - F1 Score: 0.1229 - Avg Loss: 0.1589
Epoch 19 - F1 Score: 0.1229 - Avg Loss: 0.1593
Epoch 20 - F1 Score: 0.1229 - Avg Loss: 0.1596
Epoch 21 - F1 Score: 0.1227 - Avg Loss: 0.1596
Epoch 22 - F1 Score: 0