<a href="https://colab.research.google.com/github/imrohitkr/urbandelights/blob/main/Backend/summarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install --quiet pytorch_lightning
!pip install --quiet  transformers
!pip install --quiet  seaborn
!pip install --quiet  wget

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m801.6/801.6 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m841.5/841.5 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for wget (setup.py) ... [?25l[?25hdone


In [1]:
import pytorch_lightning as pl
import json
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import train_test_split
from termcolor import colored
import textwrap


from transformers import AdamW, T5ForConditionalGeneration, T5TokenizerFast as T5Tokenizer
from tqdm.auto import tqdm

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

sns.set(style='whitegrid',palette='muted',font_scale=1.2)
rcParams['figure.figsize'] = 16, 6

In [2]:
pl.seed_everything(42)

INFO:lightning_fabric.utilities.seed:Seed set to 42


42

In [3]:
url = 'Review_for_sum.csv'

In [4]:
df = pd.read_csv(url,encoding='utf-8')

In [None]:
df.columns

In [None]:
df = df[["Text","Summary"]]
df.head()

In [None]:
df.columns

In [None]:
df.columns = ["text", "summary"]
df = df.dropna()
df.head()

In [None]:
df.shape

In [None]:
train_df, test_df = train_test_split(df,test_size=0.1)
train_df.shape,test_df.shape

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

# Download NLTK resources (if not already downloaded)
nltk.download('stopwords')
nltk.download('punkt')

# Initialize the Porter stemmer and stopwords list
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Function to preprocess text (remove stopwords and perform stemming)
def preprocess_text(text):
    # Tokenize the text
    words = nltk.word_tokenize(text)

    # Remove stopwords and perform stemming
    filtered_words = [stemmer.stem(word) for word in words if word.lower() not in stop_words]

    # Join the filtered words back into a single string
    preprocessed_text = ' '.join(filtered_words)

    return preprocessed_text

# Apply preprocessing to the text column in the dataframe
train_df['preprocessed_text'] = train_df['text'].apply(preprocess_text)
test_df['preprocessed_text'] = test_df['text'].apply(preprocess_text)

In [13]:
class ResSummaryDataset(Dataset):
    def __init__(
      self,
      data : pd.DataFrame,
      tokennizer : T5Tokenizer,
      text_max_token_len : 512,
      summary_max_token_len : 128):

      self.tokennizer = tokennizer
      self.data = data,
      self.text_max_token_len = text_max_token_len
      self.summary_max_token_len = summary_max_token_len

    def __len__(self):
      return len(self.data)

    def __getitem__(self, index : int):
      data_row = self.data[0].iloc[index]
      text = data_row["text"]

      text_encoding = self.tokennizer(
          text,
          max_length = self.text_max_token_len,
          padding = "max_length",
          truncation = True,
          return_attention_mask = True,
          return_tensors = "pt"
      )

      summary_encoding = self.tokennizer(
          data_row["summary"],
          max_length = self.summary_max_token_len,
          padding = "max_length",
          truncation = True,
          return_attention_mask = True,
          return_tensors = "pt"   # Return PyTorch tensors
      )

      labels = summary_encoding["input_ids"]
      labels[labels==0] = -100

      return dict(
          text = text,
          summary = data_row["summary"],
          text_input_ids = text_encoding["input_ids"].flatten(),
          text_attention_mask = text_encoding["attention_mask"].flatten(),
          labels = labels.flatten(),
          labels_attention_mask = summary_encoding["attention_mask"].flatten()
      )

In [14]:
class ResSummaryDataModule(pl.LightningDataModule):

    def __init__(
        self,
        train_df : pd.DataFrame,
        test_df : pd.DataFrame,
        tokenizer: T5Tokenizer,
        batch_size : int  = 8,
        text_max_token_len : int = 512,
        summary_max_token_len :int = 128
    ):

        super().__init__()

        self.train_df = train_df
        self.test_df = test_df

        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len

    # LightningModule.setup(stage=None)
    # Called at the beginning of fit (train + validate), validate, test, or predict.

    def setup(self, stage=None):

        self.train_dataset = ResSummaryDataset(
            self.train_df,
            self.tokenizer,
            self.text_max_token_len,
            self.summary_max_token_len)

        self.test_dataset = ResSummaryDataset(
            self.test_df,
            self.tokenizer,
            self.text_max_token_len,
            self.summary_max_token_len)

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle= True,
            num_workers=2
        )

    def val_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            shuffle= False,
            num_workers=2
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            shuffle= False,
            num_workers=2
        )

In [None]:
MODEL_NAME = "t5-base"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

In [16]:
text_token_counts = []
summary_token_counts = []

for _, row in train_df.iterrows():

    text_token_count = len(tokenizer.encode(row["text"]))
    text_token_counts.append(text_token_count)

    summary_token_count = len(tokenizer.encode(row["summary"]))
    summary_token_counts.append(summary_token_count)

In [17]:
N_EPOCHS = 10
BATCH_SIZE = 16

data_module = ResSummaryDataModule(train_df,test_df,tokenizer,batch_size=BATCH_SIZE)

In [18]:
class ResSummaryModel(pl.LightningModule):

    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME,return_dict=True)

    def forward(self,input_ids,attention_mask,decoder_attention_mask, labels=None):

        output = self.model(
            input_ids,
            attention_mask = attention_mask,
            labels = labels,
            decoder_attention_mask = decoder_attention_mask
        )

        return output.loss, output.logits

    def training_step(self, batch, batch_idx):

        input_ids = batch["text_input_ids"]
        attention_mask = batch["text_attention_mask"]
        labels = batch["labels"]
        labels_attention_mask = batch["labels_attention_mask"]

        loss, outputs = self(
            input_ids = input_ids,
            attention_mask = attention_mask,
            decoder_attention_mask = labels_attention_mask,
            labels = labels
        )

        self.log("train_loss",loss,prog_bar=True,logger=True)

        return loss


    def validation_step(self, batch, batch_idx):

        input_ids = batch["text_input_ids"]
        attention_mask = batch["text_attention_mask"]
        labels = batch["labels"]
        labels_attention_mask = batch["labels_attention_mask"]

        loss, outputs = self(
            input_ids = input_ids,
            attention_mask = attention_mask,
            decoder_attention_mask = labels_attention_mask,
            labels = labels
        )

        self.log("val_loss",loss,prog_bar=True,logger=True)

        return loss

    def test_step(self, batch, batch_idx):

        input_ids = batch["text_input_ids"]
        attention_mask = batch["text_attention_mask"]
        labels = batch["labels"]
        labels_attention_mask = batch["labels_attention_mask"]

        loss, outputs = self(
            input_ids = input_ids,
            attention_mask = attention_mask,
            decoder_attention_mask = labels_attention_mask,
            labels = labels
        )

        self.log("test_loss",loss,prog_bar=True,logger=True)

        return loss


    def configure_optimizers(self):
        return AdamW(self.parameters(),lr = 0.0001)

In [19]:
model = ResSummaryModel()

In [5]:
!nvidia-smi

Thu Apr 11 10:00:25 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [20]:
import torch
torch.cuda.is_available()

True

In [None]:
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min"
)

logger = TensorBoardLogger("lightning_loss",name="res-summary")



trainer = pl.Trainer(
    logger=logger,
    callbacks=[checkpoint_callback],
    max_epochs=N_EPOCHS,
    accelerator="auto",
    enable_progress_bar=True
)

In [None]:
trainer.fit(model,datamodule = data_module)

In [23]:
trained_model = ResSummaryModel.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)

trained_model.freeze()

In [24]:
def summarize_text(text):
    device = trained_model.device  # Get the device of the trained model

    text_encoding = tokenizer(
        text,
        max_length=512,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt"
    )

    # Move input tensors to the same device as the trained model
    text_encoding = {key: value.to(device) for key, value in text_encoding.items()}

    generated_ids = trained_model.model.generate(
        input_ids=text_encoding["input_ids"],
        attention_mask=text_encoding["attention_mask"],
        max_length=150,
        num_beams=2,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True
    )

    # Move generated_ids back to CPU if it was on GPU
    generated_ids = generated_ids.cpu() if device.type == 'cuda' else generated_ids

    preds = [tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
             for gen_id in generated_ids]

    return " ".join(preds)

In [40]:
text

"I've ordered these before and I am still thrilled with the product.  They are fresh, soft and very fragrant.  I use a lot of vanilla bean because my family loves vanilla sauce and baked goods made with the fresh bean.  You must seal these very tightly to keep them soft (I use the seal-a-meal), but with care, they will stay nice for months and you can't beat the price!  McCormick sells vanilla beans in a jar and you get two or three beans for anywhere up to $6! And the quality isn't anywhere near what these are.  I will continue to buy this product."

In [41]:
summarize_text(text)

"i've ordered these before and I am still thrilled with the product. They are fresh, soft and very fragrant."

In [43]:
text = '''I have been to your delightful restaurant twice in the past two weeks. The dishes were beautiful, and every bite was heavenly.
I had the seasonal curried chicken crepe on both occasions, and it is honestly the best tasting dish I have ever had. The complexity of the
flavor profile was impeccable!  I paired it with the triple cream brie and pear salad, and she was also divine. The greens were so crisp and vibrant,
as if they were pulled from a Cezanne painting and plated. The watermelon radish and cucumber scalloped edges did not go unnoticed and the whisper of
sweetness from the candied walnuts and port gastrique was the perfect balance to the savory lardons. I even asked our lovely server, Angie, if the chef
could prepare a side of the nightly greens, and both times they were delivered without hesitation and delectable. I cannot recommend this jewel of
Columbia enough. An exceptional culinary experience is waiting for you just over the bridge. Merci Beaucoup for your talent and hospitality'''

In [44]:
summarize_text(text)

'the seasonal curried chicken crepe is honestly the best tasting dish I have ever had. the greens were crisp and vibrant, as if they were pulled from a Cezanne painting and plated.'