In [24]:
!pip install sentencepiece -q
!pip install transformers -q
!pip install torch torchvision torchaudio
!pip install rouge



In [46]:
import torch
import random
import re
import pandas as pd
import numpy as np
import os
import pickle
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW
from torch.optim import SGD
from sklearn.model_selection import train_test_split
from rouge import Rouge
import torch.optim as optim

In [27]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
%env PYTORCH_NO_CUDA_MEMORY_CACHING=1
!cat /proc/sys/vm/overcommit_memory
%env PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True


env: PYTORCH_NO_CUDA_MEMORY_CACHING=1
1
env: PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True


In [29]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [30]:
reviews=pd.read_csv('/content/drive/MyDrive/Reviews.csv')

In [31]:
print(reviews.isna().sum())
reviews.dropna(inplace=True)
reviews['training'] = reviews['Text'].str.lower()  + 'TL;DR' + reviews['Summary'].str.lower()

Id                         0
ProductId                  0
UserId                     0
ProfileName               26
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
dtype: int64


In [32]:
reviews = reviews[['Summary','Text','training']][:5000]

In [33]:
reviews['training'][0]

'i have bought several of the vitality canned dog food products and have found them all to be of good quality. the product looks more like a stew than a processed meat and it smells better. my labrador is finicky and she appreciates this product better than  most.TL;DRgood quality dog food'

In [34]:
reviews['model_input'] = reviews['Text'] + " TL;DR " + reviews['Summary']


In [35]:
reviews['model_input'][0]

'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most. TL;DR Good Quality Dog Food'

In [36]:
avg_length = sum([len(review.split()) for review in reviews.model_input.values])/len(reviews)
avg_length

80.1608

In [37]:
max_length = 100

In [38]:
reviews = reviews['model_input'].values.tolist()

In [39]:
len(reviews)

5000

In [40]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

In [41]:
tokenizer.encode(" TL;DR ")

[24811, 26, 7707, 220]

In [42]:
extra_length = len(tokenizer.encode(" TL;DR "))

In [43]:
train_reviews, test_reviews = train_test_split(reviews, test_size=0.25, random_state=42)

In [44]:
class ReviewDataset(Dataset):
    def __init__(self, tokenizer, reviews, max_len):
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.eos = self.tokenizer.eos_token
        self.eos_id = self.tokenizer.eos_token_id
        self.reviews = reviews
        self.result = []

        for review in self.reviews:
            tokenized = self.tokenizer.encode(review + self.eos)

            padded = self.pad_truncate(tokenized)

            self.result.append(torch.tensor(padded))

    def __len__(self):
        return len(self.result)


    def __getitem__(self, item):
        return self.result[item]

    def pad_truncate(self, name):
        name_length = len(name) - extra_length
        if name_length < self.max_len:
            difference = self.max_len - name_length
            result = name + [self.eos_id] * difference
        elif name_length > self.max_len:
            result = name[:self.max_len + 3]+[self.eos_id]
        else:
            result = name
        return result


In [47]:
from torch.utils.data import DataLoader

def train_and_evaluate_grid_search(model, tokenizer, train_data, learning_rate, batch_size, epochs):
    train_reviews, val_reviews = train_test_split(train_data, test_size=0.2, random_state=42)
    train_dataset = ReviewDataset(tokenizer, train_reviews, max_length)
    val_dataset = ReviewDataset(tokenizer, val_reviews, max_length)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    optimizer = SGD(model.parameters(), lr=learning_rate)

    model.train()
    for epoch in range(epochs):
        for step, batch in enumerate(train_loader):
            batch = batch.to(device)
            outputs = model(input_ids=batch, labels=batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        # Validation
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                batch = batch.to(device)
                outputs = model(input_ids=batch, labels=batch)
                loss = outputs.loss
                total_val_loss += loss.item()

        average_val_loss = total_val_loss / len(val_loader)

        print(f"Epoch {epoch + 1}/{epochs}, Validation Loss: {average_val_loss}")

    return average_val_loss


In [None]:
learning_rates = [3e-4, 1e-4]
batch_sizes = [16, 32]
num_epochs = [5, 10]

best_loss = float('inf')
best_hyperparams = {}
best_model = None

for lr in learning_rates:
    for bs in batch_sizes:
        for epochs in num_epochs:
            average_val_loss = train_and_evaluate_grid_search(model, tokenizer, train_reviews, lr, bs, epochs)
            print(f"Validation Loss for LR={lr}, BS={bs}, Epochs={epochs}: {average_val_loss}")
            if average_val_loss < best_loss:
                best_loss = average_val_loss
                best_hyperparams = {'learning_rate': lr, 'batch_size': bs, 'num_epochs': epochs}
                best_model = model.state_dict()

print("Best Hyperparameters:", best_hyperparams)

with open('best_model_new.pkl', 'wb') as f:
    pickle.dump(best_model, f)

print("Best model saved to best_model_new.pkl")


Token indices sequence length is longer than the specified maximum sequence length for this model (1087 > 1024). Running this sequence through the model will result in indexing errors


Epoch 1/5, Validation Loss: 2.7017658416261066
Epoch 2/5, Validation Loss: 2.6513028347745853
Epoch 3/5, Validation Loss: 2.622822213680186
Epoch 4/5, Validation Loss: 2.6022404559115144
Epoch 5/5, Validation Loss: 2.58846164764242
Validation Loss for LR=0.0003, BS=16, Epochs=5: 2.58846164764242
Epoch 1/10, Validation Loss: 2.5864162647977786
Epoch 2/10, Validation Loss: 2.5714586998553988
Epoch 3/10, Validation Loss: 2.562268911524022
Epoch 4/10, Validation Loss: 2.5549485429804375
Epoch 5/10, Validation Loss: 2.548440360008402
Epoch 6/10, Validation Loss: 2.542212151466532
Epoch 7/10, Validation Loss: 2.537845474608401
Epoch 8/10, Validation Loss: 2.533649053979427
Epoch 9/10, Validation Loss: 2.5297666052554515
Epoch 10/10, Validation Loss: 2.5256807804107666
Validation Loss for LR=0.0003, BS=16, Epochs=10: 2.5256807804107666
Epoch 1/5, Validation Loss: 2.5203411678473153
Epoch 2/5, Validation Loss: 2.515917291243871
Epoch 3/5, Validation Loss: 2.5133501092592874
Epoch 4/5, Validati

In [None]:
best_model_file = 'best_model_new.pkl'
if os.path.exists(best_model_file):
    with open(best_model_file, 'rb') as f:
        best_model_state_dict = pickle.load(f)
else:
    raise FileNotFoundError("Best model file not found. Please train a model first.")

model = GPT2LMHeadModel.from_pretrained("gpt2")
model.load_state_dict(best_model_state_dict)


new_learning_rate = 1e-3
new_batch_size = 64
new_num_epochs = 10


optimizer = SGD(model.parameters(), lr=new_learning_rate)

train_and_evaluate_grid_search(model, tokenizer, train_reviews, new_learning_rate, new_batch_size, new_num_epochs)

with open(best_model_file, 'wb') as f:
    pickle.dump(model.state_dict(), f)

print("Training resumed and updated best model saved.")


In [None]:
def topk(probs, n=15):
    probs = torch.softmax(probs, dim= -1)
    tokensProb, topIx = torch.topk(probs, k=n)
    tokensProb = tokensProb / torch.sum(tokensProb)
    tokensProb = tokensProb.cpu().detach().numpy()
    choice = np.random.choice(n, 1, p = tokensProb)
    tokenId = topIx[choice][0]
    return int(tokenId)

In [None]:
def model_infer(model, tokenizer, review, max_length=15):
    review_encoded = tokenizer.encode(review)
    result = review_encoded
    initial_input = torch.tensor(review_encoded).unsqueeze(0).to(device)
    with torch.set_grad_enabled(False):
        output = model(initial_input)
        logits = output.logits[0,-1]
        result.append(topk(logits))
        for _ in range(max_length):
            input = torch.tensor(result).unsqueeze(0).to(device)
            output = model(input)
            logits = output.logits[0,-1]
            res_id = topk(logits)
            if res_id == tokenizer.eos_token_id:
                return tokenizer.decode(result)
            else:
                result.append(res_id)

    return tokenizer.decode(result)

In [None]:
print(best_model.keys())

odict_keys(['transformer.wte.weight', 'transformer.wpe.weight', 'transformer.h.0.ln_1.weight', 'transformer.h.0.ln_1.bias', 'transformer.h.0.attn.c_attn.weight', 'transformer.h.0.attn.c_attn.bias', 'transformer.h.0.attn.c_proj.weight', 'transformer.h.0.attn.c_proj.bias', 'transformer.h.0.ln_2.weight', 'transformer.h.0.ln_2.bias', 'transformer.h.0.mlp.c_fc.weight', 'transformer.h.0.mlp.c_fc.bias', 'transformer.h.0.mlp.c_proj.weight', 'transformer.h.0.mlp.c_proj.bias', 'transformer.h.1.ln_1.weight', 'transformer.h.1.ln_1.bias', 'transformer.h.1.attn.c_attn.weight', 'transformer.h.1.attn.c_attn.bias', 'transformer.h.1.attn.c_proj.weight', 'transformer.h.1.attn.c_proj.bias', 'transformer.h.1.ln_2.weight', 'transformer.h.1.ln_2.bias', 'transformer.h.1.mlp.c_fc.weight', 'transformer.h.1.mlp.c_fc.bias', 'transformer.h.1.mlp.c_proj.weight', 'transformer.h.1.mlp.c_proj.bias', 'transformer.h.2.ln_1.weight', 'transformer.h.2.ln_1.bias', 'transformer.h.2.attn.c_attn.weight', 'transformer.h.2.attn.

In [None]:
with open('/content/drive/MyDrive/best_model.pkl', 'rb') as f:
    best_model = pickle.load(f)

In [None]:
model.load_state_dict(best_model)


<All keys matched successfully>

In [None]:
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.to(device)
model.load_state_dict(best_model)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [None]:
for review in test_reviews:
    print("Original Review: ", review)
    summary = model_infer(model, tokenizer, review + " TL;DR ").split(" TL;DR ")[1].strip()
    print("Generated Summary: ", summary)
    break

Original Review:  After reading a previous review I carefully inspected the cans when they arrived. The cans were immaculate and very clean so I felt relieved. After cooling in the fridge I ate the first can. The fruit was very mushy and not attractive to look at and frankly a disapointment but did not taste rotten per say. Only minutes later though my mouth felt like acid was eating at it and was very painful for hours even after washing out my mouth. I eat much canned and fresh fruit and this was a first for me. Only days later my teeth and gums were infected and I had to go to the dentist for the pain was so bad. He found infection and I had to undergo a treatment of antibiotics and painkillers before I was finally ok again after about 2 weeks. SHEESH! Needless to say I threw out the remaining cans of fruit! The problem must be at the source as the cans and packaging were 1st rate and shining clean. Quality control must really suck. I'm all for natural but great care must be taken s

In [None]:
while True:
    print("\nEnter your review (type 'exit' to quit):")
    review = input()
    if review.lower() == 'exit':
        break

    print("Enter your summary prompt (press Enter for default ' TL;DR '):")
    summary_prompt = input().strip()
    if not summary_prompt:
        summary_prompt = " TL;DR "

    generated_summary = model_infer(model, tokenizer, review + summary_prompt +" TL;DR ").split(" TL;DR ")[1].strip()
    print("\nGenerated Summary:", generated_summary)


Enter your review (type 'exit' to quit):
"The Fender CD-60S Dreadnought Acoustic Guitar is a great instrument for beginners. It has a solid construction, produces a rich sound, and feels comfortable to play. However, some users have reported issues with the tuning stability."
Enter your summary prompt (press Enter for default ' TL;DR '):
"Good for beginners but has tuning stability issues."

Generated Summary: an

Enter your review (type 'exit' to quit):
exit


In [None]:
rouge = Rouge()

scores = rouge.get_scores(generated_summary, summary_prompt)
rouge1_precision = scores[0]['rouge-1']['p']
rouge1_recall = scores[0]['rouge-1']['r']
rouge1_f1 = scores[0]['rouge-1']['f']

rouge2_precision = scores[0]['rouge-2']['p']
rouge2_recall = scores[0]['rouge-2']['r']
rouge2_f1 = scores[0]['rouge-2']['f']

rougeL_precision = scores[0]['rouge-l']['p']
rougeL_recall = scores[0]['rouge-l']['r']
rougeL_f1 = scores[0]['rouge-l']['f']

print("ROUGE-1 Precision:", rouge1_precision)
print("ROUGE-1 Recall:", rouge1_recall)
print("ROUGE-1 F1 Score:", rouge1_f1)

print("\nROUGE-2 Precision:", rouge2_precision)
print("ROUGE-2 Recall:", rouge2_recall)
print("ROUGE-2 F1 Score:", rouge2_f1)

print("\nROUGE-L Precision:", rougeL_precision)
print("ROUGE-L Recall:", rougeL_recall)
print("ROUGE-L F1 Score:", rougeL_f1)