# **Transformer approach (HuggingFace API)**

**Import necessary libraries**

In [2]:
import evaluate
from evaluate import load

import numpy as np
import os
import pandas as pd
import seaborn as sns
import torch

from openai import OpenAI

from transformers import T5Tokenizer, T5ForConditionalGeneration, pipeline
from transformers import TrainingArguments, Trainer

from peft import LoraConfig, get_peft_model, PeftModel

from datasets import load_dataset, Dataset


# Text summarization is primarily evaluated through Rouge score

2025-01-23 16:35:19.669623: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-23 16:35:19.825197: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1737664519.901993 1000629 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1737664519.923354 1000629 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-23 16:35:20.094694: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

**Load Datasets**

In [4]:
kaggle_df = pd.read_csv('Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products.csv') # Load Dataset

**Clean Nulls**

In [5]:
print(kaggle_df.isnull().sum()) # No NULLs found in categories, reviews.text or reviews.rating

id                        0
dateAdded                 0
dateUpdated               0
name                      0
asins                     0
brand                     0
categories                0
primaryCategories         0
imageURLs                 0
keys                      0
manufacturer              0
manufacturerNumber        0
reviews.date              0
reviews.dateAdded      3948
reviews.dateSeen          0
reviews.doRecommend       0
reviews.id             4971
reviews.numHelpful        0
reviews.rating            0
reviews.sourceURLs        0
reviews.text              0
reviews.title            13
reviews.username          1
sourceURLs                0
dtype: int64


**Separate dataframe into product categories, reviews and ratings**

In [6]:
kaggle_df_categories = kaggle_df['categories'] # Categories
kaggle_df_reviews = kaggle_df['reviews.text'] # Reviews
kaggle_df_ratings = kaggle_df['reviews.rating'] # Rating

all_categories = list(kaggle_df['categories'].unique())

In [7]:
star1_summaries = kaggle_df[kaggle_df['reviews.rating'] == 1]
star1_summaries = star1_summaries[['categories', 'reviews.text', 'reviews.rating']]

star2_summaries = kaggle_df[kaggle_df['reviews.rating'] == 2]
star2_summaries = star2_summaries[['categories', 'reviews.text', 'reviews.rating']]

star3_summaries = kaggle_df[kaggle_df['reviews.rating'] == 3]
star3_summaries = star3_summaries[['categories', 'reviews.text', 'reviews.rating']]

star4_summaries = kaggle_df[kaggle_df['reviews.rating'] == 4]
star4_summaries = star4_summaries[['categories', 'reviews.text', 'reviews.rating']]

star5_summaries = kaggle_df[kaggle_df['reviews.rating'] == 5]
star5_summaries = star5_summaries[['categories', 'reviews.text', 'reviews.rating']]

display(star1_summaries)

Unnamed: 0,categories,reviews.text,reviews.rating
20,"Computers,Electronics Features,Tablets,Electro...",I was looking for a kindle whitepaper. I saw o...,1
70,"Computers,Electronics Features,Tablets,Electro...",Looking at the picture and seeing it was 8th g...,1
265,"Computers,Amazon Echo,Virtual Assistant Speake...",Purchased this device at launch (2 pack for $3...,1
361,"Computers,Amazon Echo,Virtual Assistant Speake...",I waited a couple months to review giving Amaz...,1
504,"Computers,Amazon Echo,Virtual Assistant Speake...",qc is really bad on this product and does not ...,1
...,...,...,...
4761,"Tablets,Fire Tablets,Computers & Tablets,All T...",The last 2 models of Kindle HDX 8 have been te...,1
4795,"Kindle E-readers,Electronics Features,Computer...",This is not an upgrade by any means! My three ...,1
4823,"Fire Tablets,Tablets,Computers/Tablets & Netwo...",Bought this mostly as a backup.and to read a f...,1
4865,"Fire Tablets,Tablets,Computers/Tablets & Netwo...",The last 2 models of Kindle HDX 8 have been te...,1


**Summarize the list into one summary**

In [8]:
# Summarize the list into one, function

def summaries_into_one(dataframe):
    # Initialize the model and tokenizer
    model_name = "t5-small" # model
    model = T5ForConditionalGeneration.from_pretrained(model_name, device_map={"": 0})
    
    tokenizer = T5Tokenizer.from_pretrained(model_name)

    # Join reviews into a single string
    text = "\n\n".join(dataframe)

    # Tokenize and summarize the input text. inputs is a pytorch tensor, torch.Tensor
    inputs = tokenizer.encode("summarize: " + text, return_tensors = "pt", truncation = True).to("cuda:0") # max_length = 2048 excluded

    # summary_ids is a pytorch tensor, torch.Tensor
    summary_ids = model.generate(inputs, max_length = 250, min_length = 50, length_penalty = 2.0, num_beams = 4, early_stopping = True)

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens = True)

    return(summary)


**Dictionary of {categories: list of summaries of reviews by stars} and ROUGE evaluation**

In [9]:
# Initialize evaluation variable
rouge = evaluate.load('rouge')

# Counters to calculate average
sumRouge1 = 0
sumRouge2 = 0
sumRougeL = 0
sumRougeLSum = 0

sum_len = 0

# Each category has a list of 5 summaries of all reviews, 1 summary per star
category_dict = {}

# Build a dictionary 
for key in all_categories:
    category_dict.setdefault(key, [])

grouped = kaggle_df.groupby(['categories', 'reviews.rating'])

# Assign the summaries to the lists
for category in all_categories: # for each product category 
    for rating in range(1, 6): # from 1 stars to 5 stars, rating
        try:
            all_reviews_same_rating = grouped.get_group((category, rating))['reviews.text']
            summ1 = summaries_into_one( all_reviews_same_rating )
            category_dict[category].append( summ1 )

            results = rouge.compute(predictions = "\n\n".join(all_reviews_same_rating)[:len(summ1)], references = summ1) # len(summ1) = 247 to avoid mismatch
            print(results)

            sumRouge1 += results['rouge1']
            sumRouge2 += results['rouge2']
            sumRougeL += results['rougeL']
            sumRougeLSum += results['rougeLsum']

            sum_len += 1

        except:
            category_dict[category].append('NULL')

print("rouge1 average:", sumRouge1 / sum_len , "- rouge2 average:", sumRouge2 / sum_len , "- rougeL average:", sumRougeL / sum_len ,"- rougeLsum average:", sumRougeLSum / sum_len)

# rouge1 average: 0.0873561269402484 - rouge2 average: 0.0 - rougeL average: 0.08728537224163033 - rougeLsum average: 0.0873561269402484

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


{'rouge1': 0.0392156862745098, 'rouge2': 0.0, 'rougeL': 0.0392156862745098, 'rougeLsum': 0.0392156862745098}
{'rouge1': 0.046511627906976744, 'rouge2': 0.0, 'rougeL': 0.046511627906976744, 'rougeLsum': 0.046511627906976744}
{'rouge1': 0.04938271604938271, 'rouge2': 0.0, 'rougeL': 0.04938271604938271, 'rougeLsum': 0.04938271604938271}
{'rouge1': 0.2184873949579832, 'rouge2': 0.0, 'rougeL': 0.2184873949579832, 'rougeLsum': 0.2184873949579832}
{'rouge1': 0.038461538461538464, 'rouge2': 0.0, 'rougeL': 0.038461538461538464, 'rougeLsum': 0.038461538461538464}
{'rouge1': 0.05533596837944664, 'rouge2': 0.0, 'rougeL': 0.05533596837944664, 'rougeLsum': 0.05533596837944664}
{'rouge1': 0.05555555555555555, 'rouge2': 0.0, 'rougeL': 0.05555555555555555, 'rougeLsum': 0.05555555555555555}
{'rouge1': 0.026415094339622643, 'rouge2': 0.0, 'rougeL': 0.026415094339622643, 'rougeLsum': 0.026415094339622643}
{'rouge1': 0.06349206349206349, 'rouge2': 0.0, 'rougeL': 0.06349206349206349, 'rougeLsum': 0.06349206

In [10]:
category_dict_df = pd.DataFrame(columns = ['Product Category', 'Rating', 'Summary of reviews'])

for key in category_dict: # for each product category 
    for rating in range(1, 6): # from 1 stars to 5 stars, rating
        category_dict_df.loc[len(category_dict_df)] = [key, rating, category_dict[key][rating - 1]]

display(category_dict_df)

Unnamed: 0,Product Category,Rating,Summary of reviews
0,"Computers,Electronics Features,Tablets,Electro...",1,the whitepaper looks Identical to the $120 mod...
1,"Computers,Electronics Features,Tablets,Electro...",2,"screen too dark The screen is too dark, and ca..."
2,"Computers,Electronics Features,Tablets,Electro...",3,
3,"Computers,Electronics Features,Tablets,Electro...",4,the kindle is good to download apps for books ...
4,"Computers,Electronics Features,Tablets,Electro...",5,the amazon Kindle is light weight and easy to ...
...,...,...,...
110,"Tablets,Fire Tablets,Electronics,iPad & Tablet...",1,very cheap and was not impressed at all never ...
111,"Tablets,Fire Tablets,Electronics,iPad & Tablet...",2,
112,"Tablets,Fire Tablets,Electronics,iPad & Tablet...",3,the battery is having more and more trouble ho...
113,"Tablets,Fire Tablets,Electronics,iPad & Tablet...",4,my daughter has had this tablet for almost 2 m...


In [11]:
# -------------------------- Evaluation example ---------------------------
rouge = evaluate.load('rouge')
candidates = ["Summarization is cool","I love Machine Learning","Good night"]

references = [
["Summarization is beneficial and cool","Summarization saves time"],
["People are getting used to Machine Learning","I think i love Machine Learning"],
["Good night everyone!","Night!"]
             ]

results = rouge.compute(predictions = candidates, references = references)
print(results)

print(results['rouge1'])
# -------------------------- Evaluation example ---------------------------

{'rouge1': 0.7833333333333332, 'rouge2': 0.5833333333333334, 'rougeL': 0.7833333333333332, 'rougeLsum': 0.7833333333333332}
0.7833333333333332


# Fine-Tuning

In [12]:
dataset = load_dataset('gopalkalpande/bbc-news-summary', split = 'train')
full_dataset = dataset.train_test_split(test_size = 0.2, shuffle = True)

text = grouped.get_group((all_categories[0], 1))['reviews.text'] # text = grouped.get_group((all_categories[0], 5))['reviews.text']
#text = "\n\n".join(text)
print(text)
print(type(text))

'''dataset_train = Dataset.from_pandas(kaggle_df)
dataset_valid = Dataset.from_pandas(category_dict_df)'''

dataset_train = full_dataset['train'] # text? # full_dataset['train'] 
dataset_valid = full_dataset['test'] # cambiar por category_dict[all_categories[0]][0]? # full_dataset['test']

print(dataset_train)
print(dataset_valid)

print(type(dataset_train))
print(type(dataset_valid))

20    I was looking for a kindle whitepaper. I saw o...
70    Looking at the picture and seeing it was 8th g...
Name: reviews.text, dtype: object
<class 'pandas.core.series.Series'>
Dataset({
    features: ['File_path', 'Articles', 'Summaries'],
    num_rows: 1779
})
Dataset({
    features: ['File_path', 'Articles', 'Summaries'],
    num_rows: 445
})
<class 'datasets.arrow_dataset.Dataset'>
<class 'datasets.arrow_dataset.Dataset'>


In [13]:
MODEL = 't5-small'
BATCH_SIZE = 4
NUM_PROCS = 4
EPOCHS = 10
OUT_DIR = 'results_t5small'
MAX_LENGTH = 512 # Maximum context length to consider while preparing dataset.

In [16]:
tokenizer = T5Tokenizer.from_pretrained(MODEL)
 
# Function to convert text data into model inputs and targets
def preprocess_function(examples):
    
    inputs = [f"summarize: {article}" for article in examples['Articles']]

    model_inputs = tokenizer(
        inputs,
        max_length = MAX_LENGTH,
        truncation = True,
        padding = 'max_length'
    )

    # Set up the tokenizer for targets
    targets = [summary for summary in examples['Summaries']]

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length = MAX_LENGTH,
            truncation = True,
            padding = 'max_length'
        )
 
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs
#-------------------------------------

# Apply the function to the whole dataset
tokenized_train = dataset_train.map(
    preprocess_function,
    batched = True,
    num_proc = NUM_PROCS
)
tokenized_valid = dataset_valid.map(
    preprocess_function,
    batched = True,
    num_proc = NUM_PROCS
)


Map (num_proc=4):   0%|          | 0/1779 [00:00<?, ? examples/s]



Map (num_proc=4):   0%|          | 0/445 [00:00<?, ? examples/s]



In [17]:
model = T5ForConditionalGeneration.from_pretrained(MODEL)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())

print(f"{total_params:,} total parameters.")

total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)

print(f"{total_trainable_params:,} training parameters.")

60,506,624 total parameters.
60,506,624 training parameters.


**Rouge**

In [18]:

rouge = evaluate.load("rouge")
 
def compute_metrics(eval_pred):
    predictions, labels = eval_pred.predictions[0], eval_pred.label_ids
 
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens = True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens = True)
 
    result = rouge.compute(
        predictions = decoded_preds,
        references = decoded_labels,
        use_stemmer = True,
        rouge_types = [
            'rouge1',
            'rouge2',
            'rougeL'
        ]
    )
 
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
 
    return {k: round(v, 4) for k, v in result.items()}


In [19]:
def preprocess_logits_for_metrics(logits, labels):
    """
    Original Trainer may have a memory leak.
    This is a workaround to avoid storing too many tensors that are not needed.
    """
    pred_ids = torch.argmax(logits[0], dim = -1)
    return pred_ids, labels

In [20]:
training_args = TrainingArguments(
    output_dir = OUT_DIR,
    num_train_epochs = EPOCHS, # number of epochs

    per_device_train_batch_size = BATCH_SIZE,
    per_device_eval_batch_size = BATCH_SIZE,

    warmup_steps = 500,
    
    weight_decay = 0.01,

    evaluation_strategy = 'steps', # how often will evaluation be during training, each 200 steps
    eval_steps = 200,

    save_strategy = 'epoch', # how often will saving be during training, each 2 epochs
    save_total_limit = 2,

    learning_rate = 0.001,
    # dataloader_num_workers = 4 # Number of subprocesses to use for data loading
)


trainer = Trainer(
    model = model,
    
    args = training_args,

    train_dataset = tokenized_train,
    eval_dataset = tokenized_valid,

    preprocess_logits_for_metrics = preprocess_logits_for_metrics, # This is a workaround to avoid storing too many tensors that are not needed.
    compute_metrics = compute_metrics # The function that will be used to compute metrics at evaluation
)
 
history = trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Gen Len
200,No log,0.360115,0.9043,0.8358,0.8877,225.7865
400,No log,0.345974,0.9063,0.8383,0.8908,225.809
600,1.005700,0.32987,0.9088,0.8445,0.8944,225.8067
800,1.005700,0.331684,0.9111,0.846,0.8956,225.8067
1000,0.412100,0.326174,0.9128,0.8477,0.8968,225.8067
1200,0.412100,0.31594,0.9134,0.8503,0.8985,225.8067
1400,0.412100,0.319488,0.9147,0.8513,0.8997,225.8067
1600,0.340600,0.311932,0.9143,0.8516,0.899,225.8067
1800,0.340600,0.315086,0.9156,0.8533,0.9005,225.8067
2000,0.321400,0.311092,0.9151,0.8527,0.9003,225.8067


**Training Loss: 0.200300** 
**Validation Loss: 0.392833**	
**Rouge1: 0.910000	Rouge2: 0.847000**
**Rougel:0.893600**
**Gen Len: 233.831500**

In [None]:
model_path = f"{OUT_DIR}/checkpoint-4450"  # the path where you saved your model

model = T5ForConditionalGeneration.from_pretrained(model_path)

tokenizer = T5Tokenizer.from_pretrained(OUT_DIR)

In [22]:
display(history)
display(type(history))

TrainOutput(global_step=4450, training_loss=0.36820893191219717, metrics={'train_runtime': 1494.1962, 'train_samples_per_second': 11.906, 'train_steps_per_second': 2.978, 'total_flos': 2407730648186880.0, 'train_loss': 0.36820893191219717, 'epoch': 10.0})

transformers.trainer_utils.TrainOutput