In [1]:
!pip install transformers
!pip install spacy
!pip install tqdm





In [2]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re
import string
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [3]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
from transformers import Trainer, TrainingArguments, get_linear_schedule_with_warmup
import spacy

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kashy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kashy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kashy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [38]:
reviews_df = pd.read_csv('Reviews.csv')

In [39]:
reviews_df = reviews_df.dropna(subset=['Summary'])

In [40]:
reviews_df.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

In [41]:
reviews_df.shape

(568427, 10)

In [42]:
reviews_df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [43]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [44]:
def remove_html_tags(text):
    if isinstance(text, str):
        clean = re.compile('<.*?>')
        return re.sub(clean, '', text)
    else:
        return ''

In [45]:
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text, max_length, tokenizer):
    if text is None:
        return [tokenizer.pad_token_id] * max_length

    # Remove HTML tags and punctuation
    text = remove_html_tags(text)
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenize the text using SpaCy
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Convert tokens to IDs using GPT2 tokenizer
    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Replace None values with padding token ID
    input_ids = [token_id if token_id is not None else tokenizer.pad_token_id for token_id in input_ids]

    # Pad sequences to ensure equal length
    input_ids = input_ids[:max_length] + [tokenizer.pad_token_id] * (max_length - len(input_ids))

    return input_ids

In [46]:
reviews_df = reviews_df[['Text', 'Summary']]

In [47]:
data_size = int(0.01 * len(reviews_df))
reviews_df = reviews_df[:data_size]

In [48]:
reviews_df.shape

(11368, 2)

# Preprocesing two columns

In [49]:
reviews_df = reviews_df.dropna(subset=['Text', 'Summary']).reset_index(drop=True)

max_length = 128  # Define the maximum sequence length
reviews_df['Text'] = reviews_df['Text'].apply(lambda x: preprocess_text(x, max_length, tokenizer))
reviews_df['Summary'] = reviews_df['Summary'].apply(lambda x: preprocess_text(x, max_length, tokenizer))

In [50]:
reviews_df['Text'] = reviews_df['Text'].apply(lambda vector: [token if token is not None else 0 for token in vector])
reviews_df['Summary'] = reviews_df['Summary'].apply(lambda vector: [token if token is not None else 0 for token in vector])

In [51]:
reviews_df.head()

Unnamed: 0,Text,Summary
0,"[40, 17846, 50256, 50256, 9703, 19425, 11167, ...","[10248, 35013, 32942, 24602, 0, 0, 0, 0, 0, 0,..."
1,"[11167, 50256, 18242, 50256, 50256, 50256, 502...","[50256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
2,"[50256, 14145, 14792, 50256, 2971, 50256, 5025...","[50256, 16706, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
3,"[5460, 21078, 50256, 50256, 40, 50256, 40, 197...","[50256, 50256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
4,"[18223, 50256, 18223, 20888, 50256, 4421, 5025...","[18223, 50256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."


# Train-Test Split

In [52]:
reviews_df = reviews_df.dropna()

In [53]:
epochs = 10
learning_rate = 5e-5
batch_size = 8

# Split the dataset into train and test sets
train_size = int(0.75 * len(reviews_df))
train_df = reviews_df[:train_size]
test_df = reviews_df[train_size:]

# Building Custom Dataset

In [54]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

In [55]:
class CustomDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        input_ids = torch.tensor(self.df.iloc[idx]['Text'])
        target_ids = torch.tensor(self.df.iloc[idx]['Summary'])

        return {'input_ids': input_ids, 'labels': target_ids}

In [56]:
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Initialize Datasets & Dataloaders

In [57]:
train_dataset = CustomDataset(train_df)
test_dataset = CustomDataset(test_df)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=8)

# Fine-Tuning

In [58]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [60]:
for epoch in range(epochs):
    model.train()
    total_loss = 0

    # Training
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs}"):
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f'Average training loss for Epoch {epoch+1}: {avg_train_loss}')

    model.eval()
    total_val_loss = 0
    for batch in tqdm(test_dataloader, desc=f"Validation for Epoch {epoch+1}/{epochs}"):
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        with torch.no_grad():
            outputs = model(input_ids, labels=labels)
            val_loss = outputs.loss

        total_val_loss += val_loss.item()

    avg_val_loss = total_val_loss / len(test_dataloader)
    print(f'Average validation loss for Epoch {epoch+1}: {avg_val_loss}')

    scheduler.step()

Epoch 1/10:   0%|          | 0/1066 [00:00<?, ?it/s]

Epoch 1/10:   0%|          | 1/1066 [00:23<6:50:58, 23.15s/it]


KeyboardInterrupt: 

# Save the Model

In [None]:
torch.save(model.state_dict(), 'fine_tuned_model.pth')