In [None]:
import pandas as pd
import re
import nltk
import torch
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
from torch.nn import Embedding
from gensim.models import KeyedVectors
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import random
import torch.optim as optim
from nltk import pos_tag
import torch.nn.functional as F

In [None]:
# Colab-only: mount Google Drive so the dataset, GloVe vectors and checkpoints
# under /content/drive are readable by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Dataset Import

In [None]:
# Load the English summarization CSV from the mounted Drive folder.
english_data = pd.read_csv('/content/drive/MyDrive/CSE400 Dataset/summarization_english.csv')

In [None]:
english_data.head()

Unnamed: 0,URL,Headline,Summary,Article
0,https://bdnews24.com/economy/bcosycngjj,India markets regulator ups scrutiny of IPO do...,India markets regulator ups scrutiny of IPO do...,India's market regulator is increasing scrutin...
1,https://bdnews24.com/economy/0eexs2yh1d,IMF's Georgieva 'very confident' on soft landi...,IMF's Georgieva 'very confident' on soft landi...,"The International Monetary Fund is now ""very c..."
2,https://bdnews24.com/economy/y3tg67qjvd,UK employers plan smaller pay rises for 2024: ...,UK employers plan smaller pay rises for 2024: ...,British employers plan smaller pay rises over ...
3,https://bdnews24.com/economy/37nhpihijs,"EU agrees on looser fiscal rules to cut debt, ...","EU agrees on looser fiscal rules to cut debt, ...",EU member states and MEPs struck a preliminary...
4,https://bdnews24.com/economy/6k15xxeo84,US imposes sanctions for violations of Russia ...,US imposes sanctions for violations of Russia ...,The US Treasury Department said on Thursday it...


In [None]:
english_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45485 entries, 0 to 45484
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   URL       45485 non-null  object
 1   Headline  45485 non-null  object
 2   Summary   45485 non-null  object
 3   Article   45485 non-null  object
dtypes: object(4)
memory usage: 1.4+ MB


## Data Preprocessing

Text Cleaning

In [None]:
def clean_text(text):
  """Normalise a raw string for tokenization.

  Steps, in order: drop anything that looks like an HTML tag (``<...>``),
  drop every character that is neither a word character nor whitespace,
  lowercase, and collapse all whitespace runs to single spaces.

  Args:
    text: the raw string to clean.

  Returns:
    The cleaned, lowercased, single-spaced string.
  """
  without_tags = re.sub(r"<.*?>", "", text)
  without_punctuation = re.sub(r"[^\w\s]", "", without_tags)
  # split() with no argument also strips leading/trailing whitespace.
  words = without_punctuation.lower().split()
  return " ".join(words)

# Clean both text columns in place; Headline and URL are left untouched.
for column in ("Article", "Summary"):
    english_data[column] = english_data[column].apply(clean_text)

english_data.head()

Unnamed: 0,URL,Headline,Summary,Article
0,https://bdnews24.com/economy/bcosycngjj,India markets regulator ups scrutiny of IPO do...,india markets regulator ups scrutiny of ipo do...,indias market regulator is increasing scrutiny...
1,https://bdnews24.com/economy/0eexs2yh1d,IMF's Georgieva 'very confident' on soft landi...,imfs georgieva very confident on soft landing ...,the international monetary fund is now very co...
2,https://bdnews24.com/economy/y3tg67qjvd,UK employers plan smaller pay rises for 2024: ...,uk employers plan smaller pay rises for 2024 c...,british employers plan smaller pay rises over ...
3,https://bdnews24.com/economy/37nhpihijs,"EU agrees on looser fiscal rules to cut debt, ...",eu agrees on looser fiscal rules to cut debt b...,eu member states and meps struck a preliminary...
4,https://bdnews24.com/economy/6k15xxeo84,US imposes sanctions for violations of Russia ...,us imposes sanctions for violations of russia ...,the us treasury department said on thursday it...


Tokenization

In [None]:
# word_tokenize needs the 'punkt' tokenizer models.
nltk.download('punkt')

# Replace each cleaned string with its list of word tokens.
for column in ("Article", "Summary"):
    english_data[column] = english_data[column].apply(word_tokenize)

english_data.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0,URL,Headline,Summary,Article
0,https://bdnews24.com/economy/bcosycngjj,India markets regulator ups scrutiny of IPO do...,"[india, markets, regulator, ups, scrutiny, of,...","[indias, market, regulator, is, increasing, sc..."
1,https://bdnews24.com/economy/0eexs2yh1d,IMF's Georgieva 'very confident' on soft landi...,"[imfs, georgieva, very, confident, on, soft, l...","[the, international, monetary, fund, is, now, ..."
2,https://bdnews24.com/economy/y3tg67qjvd,UK employers plan smaller pay rises for 2024: ...,"[uk, employers, plan, smaller, pay, rises, for...","[british, employers, plan, smaller, pay, rises..."
3,https://bdnews24.com/economy/37nhpihijs,"EU agrees on looser fiscal rules to cut debt, ...","[eu, agrees, on, looser, fiscal, rules, to, cu...","[eu, member, states, and, meps, struck, a, pre..."
4,https://bdnews24.com/economy/6k15xxeo84,US imposes sanctions for violations of Russia ...,"[us, imposes, sanctions, for, violations, of, ...","[the, us, treasury, department, said, on, thur..."


Removing Stopwords

In [None]:
nltk.download('stopwords')

# A set gives constant-time membership tests; the original list made every
# token lookup a linear scan over all stop words. `token not in stop_words`
# behaves the same for callers elsewhere in the notebook.
stop_words = set(stopwords.words("english"))
english_data["Article"] = english_data["Article"].apply(lambda x: [token for token in x if token not in stop_words])
english_data["Summary"] = english_data["Summary"].apply(lambda x: [token for token in x if token not in stop_words])

english_data.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,URL,Headline,Summary,Article
0,https://bdnews24.com/economy/bcosycngjj,India markets regulator ups scrutiny of IPO do...,"[india, markets, regulator, ups, scrutiny, ipo...","[indias, market, regulator, increasing, scruti..."
1,https://bdnews24.com/economy/0eexs2yh1d,IMF's Georgieva 'very confident' on soft landi...,"[imfs, georgieva, confident, soft, landing, se...","[international, monetary, fund, confident, glo..."
2,https://bdnews24.com/economy/y3tg67qjvd,UK employers plan smaller pay rises for 2024: ...,"[uk, employers, plan, smaller, pay, rises, 202...","[british, employers, plan, smaller, pay, rises..."
3,https://bdnews24.com/economy/37nhpihijs,"EU agrees on looser fiscal rules to cut debt, ...","[eu, agrees, looser, fiscal, rules, cut, debt,...","[eu, member, states, meps, struck, preliminary..."
4,https://bdnews24.com/economy/6k15xxeo84,US imposes sanctions for violations of Russia ...,"[us, imposes, sanctions, violations, russia, o...","[us, treasury, department, said, thursday, put..."


Lemmatization

In [None]:
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens):
    """Return the lemma of every token, preserving order.

    Each token is passed to ``lemmatizer.lemmatize`` with its default
    arguments (no part-of-speech tag is supplied).

    NOTE(review): the recorded output of this cell shows "us" becoming "u";
    confirm that this side effect is acceptable.
    """
    return list(map(lemmatizer.lemmatize, tokens))

# Lemmatize both token-list columns in place.
for column in ("Article", "Summary"):
    english_data[column] = english_data[column].apply(lemmatize_tokens)

english_data.head()

[nltk_data] Downloading package wordnet to /root/nltk_data...


Unnamed: 0,URL,Headline,Summary,Article
0,https://bdnews24.com/economy/bcosycngjj,India markets regulator ups scrutiny of IPO do...,"[india, market, regulator, ups, scrutiny, ipo,...","[india, market, regulator, increasing, scrutin..."
1,https://bdnews24.com/economy/0eexs2yh1d,IMF's Georgieva 'very confident' on soft landi...,"[imf, georgieva, confident, soft, landing, see...","[international, monetary, fund, confident, glo..."
2,https://bdnews24.com/economy/y3tg67qjvd,UK employers plan smaller pay rises for 2024: ...,"[uk, employer, plan, smaller, pay, rise, 2024,...","[british, employer, plan, smaller, pay, rise, ..."
3,https://bdnews24.com/economy/37nhpihijs,"EU agrees on looser fiscal rules to cut debt, ...","[eu, agrees, looser, fiscal, rule, cut, debt, ...","[eu, member, state, meps, struck, preliminary,..."
4,https://bdnews24.com/economy/6k15xxeo84,US imposes sanctions for violations of Russia ...,"[u, imposes, sanction, violation, russia, oil,...","[u, treasury, department, said, thursday, put,..."


Vocabulary Creation

In [None]:
# Special tokens, in the order in which they are handed to both vocabularies.
SPECIAL_TOKENS = ("<pad>", "<sos>", "<eos>", "<unk>")

article_vocab = build_vocab_from_iterator(english_data["Article"], specials=list(SPECIAL_TOKENS))
summary_vocab = build_vocab_from_iterator(english_data["Summary"], specials=list(SPECIAL_TOKENS))

# index -> token lists and token -> index dicts used by the padding,
# embedding and decoding cells below.
article_itos, article_stoi = article_vocab.get_itos(), article_vocab.get_stoi()
summary_itos, summary_stoi = summary_vocab.get_itos(), summary_vocab.get_stoi()

In [None]:
# Token-count quantiles used to judge a reasonable padded length.
# The labels are derived from the quantile values so they cannot drift apart
# (the original label said "50th percentile" while the code used 0.55).
ARTICLE_LEN_QUANTILE = 0.55
SUMMARY_LEN_QUANTILE = 0.60

article_max_len = int(english_data['Article'].str.len().quantile(ARTICLE_LEN_QUANTILE))
summary_max_len = int(english_data['Summary'].str.len().quantile(SUMMARY_LEN_QUANTILE))

print(f"{round(ARTICLE_LEN_QUANTILE * 100)}th percentile length for Article_tokens:", article_max_len)
print(f"{round(SUMMARY_LEN_QUANTILE * 100)}th percentile length for Summary_tokens:", summary_max_len)

50th percentile length for Article_tokens: 311
60th percentile length for Summary_tokens: 16


Padding

In [None]:
def pad_tokens(tokens, vocab_stoi, max_len):
    """Convert tokens to a fixed-length list of vocabulary indices.

    Args:
        tokens: list of string tokens.
        vocab_stoi: dict mapping token -> index; must contain "<pad>" and is
            consulted for "<unk>" whenever a token is looked up.
        max_len: length of the returned list.

    Returns:
        A list of ``max_len`` indices: the first ``max_len`` tokens mapped
        through ``vocab_stoi`` (unknown tokens map to "<unk>"), right-padded
        with the "<pad>" index.
    """
    pad_index = vocab_stoi["<pad>"]
    sequence = []
    for token in tokens[:max_len]:
        sequence.append(vocab_stoi.get(token, vocab_stoi["<unk>"]))
    # Multiplying a list by a non-positive count yields [], so full-length
    # inputs receive no padding.
    sequence.extend([pad_index] * (max_len - len(sequence)))
    return sequence

# Fixed sequence lengths used for padding/truncation.
# NOTE(review): these differ from article_max_len / summary_max_len computed in
# the previous cell (311 and 16 in its recorded output) — confirm which values
# are intended.
ARTICLE_PAD_LEN = 288
SUMMARY_PAD_LEN = 15

# NOTE(review): pad_tokens only maps and pads tokens, so no "<eos>" marker is
# ever appended to the summaries — confirm the decoder is not expected to
# learn an end-of-sequence token.
article_sequences = [pad_tokens(tokens, article_stoi, ARTICLE_PAD_LEN) for tokens in english_data["Article"]]
summary_sequences = [pad_tokens(tokens, summary_stoi, SUMMARY_PAD_LEN) for tokens in english_data["Summary"]]

# Index tensors of shape (num_examples, pad_len); long dtype is what
# nn.Embedding and CrossEntropyLoss expect.
article_tensor = torch.tensor(article_sequences, dtype=torch.long)
summary_tensor = torch.tensor(summary_sequences, dtype=torch.long)

## Word Vectorization

In [None]:
# Load the 300-d GloVe vectors (plain text file without a word2vec header line).
glove_path = "/content/drive/MyDrive/CSE400 Dataset/glove.6B.300d.txt"
glove_vectors = KeyedVectors.load_word2vec_format(glove_path, no_header=True)
vocab_size = len(article_vocab)
embedding_dim = 300

# One row per article-vocabulary index: the GloVe vector when the word is
# known, otherwise a standard-normal random vector.
pretrained_embeddings = torch.zeros(vocab_size, embedding_dim)
article_token_to_index = article_vocab.get_stoi()
for word, i in article_token_to_index.items():
    has_glove_vector = word in glove_vectors.key_to_index
    pretrained_embeddings[i] = torch.tensor(glove_vectors[word]) if has_glove_vector else torch.randn(embedding_dim)

# The padding row is forced back to all zeros after the loop.
pad_idx = article_token_to_index["<pad>"]
pretrained_embeddings[pad_idx] = torch.zeros(embedding_dim)
pretrained_embeddings.requires_grad = False

# freeze=True keeps the embedding weights fixed during training.
embedding_layer = Embedding.from_pretrained(pretrained_embeddings, freeze=True)

## Train Test Split and Dataset Loader

In [None]:
X_train, X_test, y_train, y_test = train_test_split(article_tensor, summary_tensor, test_size=0.1, random_state=42)

In [None]:
class CustomDataset(Dataset):
    """Pairs each article with the summary stored at the same index."""

    def __init__(self, articles, summaries):
        """Keep references to the two parallel, indexable collections."""
        self.articles, self.summaries = articles, summaries

    def __len__(self):
        """Number of examples, taken from the article collection."""
        return len(self.articles)

    def __getitem__(self, idx):
        """Return the ``(article, summary)`` pair at position ``idx``."""
        article = self.articles[idx]
        summary = self.summaries[idx]
        return article, summary

BATCH_SIZE = 64

train_dataset, test_dataset = CustomDataset(X_train, y_train), CustomDataset(X_test, y_test)

# Only the training loader shuffles; the test loader keeps a fixed order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

## Seq2Seq with Bahdanau Attention

In [None]:
# Union of the tokens known to either vocabulary (reported for information).
article_words = set(article_vocab.get_stoi().keys())
summary_words = set(summary_vocab.get_stoi().keys())
combined_vocab = article_words | summary_words
print("Combined vocabulary size:", len(combined_vocab))
print("Summary Vocabulary Size:", len(summary_vocab))

# Indices of the special tokens in the *summary* vocabulary.
SOS_token, PAD_token, EOS_token, UNK_token = (
    summary_stoi[special] for special in ("<sos>", "<pad>", "<eos>", "<unk>")
)
print("SOS Token: ", SOS_token)
print("PAD Token: ", PAD_token)

Combined vocabulary size: 184352
Summary Vocabulary Size: 39133
SOS Token:  1
PAD Token:  0


In [None]:
# Hyperparameters passed to Seq2SeqModel when the model is instantiated below.
embedding_dim = 300  # same value as assigned in the Word Vectorization cell
hidden_dim = 512  # LSTM hidden size
output_dim = len(summary_vocab)  # size of the model's final linear layer
n_layers = 2  # number of stacked LSTM layers
dropout = 0.30  # dropout argument given to the LSTMs
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Unidirectional

In [None]:
class Seq2SeqModel(nn.Module):
  """Unidirectional LSTM encoder-decoder with additive (Bahdanau-style) attention.

  NOTE(review): the same ``embedding_layer`` embeds both the source tokens and
  the decoder input tokens, which is only meaningful when both use the same
  index space — confirm against the caller.
  NOTE(review): a later cell defines another class with this same name;
  whichever cell ran last is the one that gets instantiated.

  Args:
    embedding_dim: size of the vectors produced by ``embedding_layer``.
    hidden_dim: LSTM hidden size for encoder and decoder.
    output_dim: number of classes produced per decoding step.
    n_layers: number of stacked LSTM layers.
    dropout: dropout passed to both LSTMs.
    embedding_layer: module mapping index tensors to embeddings.
    sos_token: index fed to the decoder at the first step. When ``None``
      (default) the module-level ``SOS_token`` is used, as before.
  """

  def __init__(self, embedding_dim, hidden_dim, output_dim, n_layers, dropout, embedding_layer, sos_token=None):
    super().__init__()

    self.output_dim = output_dim
    self.sos_token = sos_token
    self.embedding = embedding_layer
    self.encoder = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, dropout=dropout)
    self.decoder = nn.LSTM(embedding_dim + hidden_dim, hidden_dim, num_layers=n_layers, dropout=dropout)
    self.W1 = nn.Linear(hidden_dim, hidden_dim)
    self.W2 = nn.Linear(hidden_dim, hidden_dim)
    self.V = nn.Linear(hidden_dim, 1)
    self.fc = nn.Linear(hidden_dim, output_dim)

  def forward(self, source, target, teacher_forcing_ratio=0.5):
    """Decode ``target.shape[1]`` steps for every source sequence.

    Args:
      source: index tensor of shape (batch_size, source_len).
      target: index tensor of shape (batch_size, target_len). Its length sets
        the number of decoding steps; its values are only read when teacher
        forcing is selected for a step.
      teacher_forcing_ratio: per-step probability of feeding ``target[:, t]``
        (instead of the model's own argmax) as the next decoder input.

    Returns:
      ``(outputs, attentions)`` with shapes
      (batch_size, target_len, output_dim) and
      (batch_size, target_len, source_len).
    """
    batch_size, source_len = source.shape
    target_len = target.shape[1]
    # Allocate on the input's device rather than a notebook-level global.
    run_device = source.device
    sos_index = SOS_token if self.sos_token is None else self.sos_token

    # The LSTMs are sequence-first (batch_first=False), so the batch-first
    # source is transposed to (source_len, batch_size) before embedding.
    embedded = self.embedding(source.transpose(0, 1))
    encoder_outputs, (decoder_hidden, decoder_cell) = self.encoder(embedded)

    # Loop-invariant pieces of the attention computation.
    encoder_projection = self.W1(encoder_outputs)              # (source_len, batch, hidden)
    encoder_outputs_batch_first = encoder_outputs.permute(1, 0, 2)

    decoder_input = torch.full((1, batch_size), sos_index, dtype=torch.long, device=run_device)
    outputs = torch.zeros(batch_size, target_len, self.output_dim, device=run_device)
    attentions = torch.zeros(batch_size, target_len, source_len, device=run_device)
    for t in range(target_len):
      embedded = self.embedding(decoder_input)                 # (1, batch, embedding_dim)
      # Query from the top decoder layer; broadcasts over source positions.
      query = self.W2(decoder_hidden[-1]).unsqueeze(0)         # (1, batch, hidden)
      score = self.V(torch.tanh(encoder_projection + query))   # (source_len, batch, 1)
      # Normalise over source positions (dim 0). The previous dim=1 normalised
      # across the batch, so each example's weights did not sum to one.
      attention_weights = F.softmax(score, dim=0)
      context_vector = torch.bmm(attention_weights.permute(1, 2, 0), encoder_outputs_batch_first)
      context_vector = context_vector.permute(1, 0, 2)         # (1, batch, hidden)
      embedded_with_context = torch.cat((embedded, context_vector), dim=2)
      decoder_output, (decoder_hidden, decoder_cell) = self.decoder(embedded_with_context, (decoder_hidden, decoder_cell))
      output = self.fc(decoder_output.squeeze(0))
      outputs[:, t, :] = output
      attentions[:, t, :] = attention_weights.squeeze(-1).transpose(0, 1)
      teacher_force = random.random() < teacher_forcing_ratio
      top1 = output.argmax(1)
      decoder_input = (target[:, t] if teacher_force else top1).unsqueeze(0)
    return outputs, attentions


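Bidirectional (note: this cell reuses the class name `Seq2SeqModel`, so running it replaces the unidirectional definition above — run only the variant you intend to train or load)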

In [None]:
class Seq2SeqModel(nn.Module):
  """Bidirectional-encoder LSTM seq2seq with additive (Bahdanau-style) attention.

  The encoder is bidirectional, so the decoder, attention layers and output
  layer all operate on ``2 * hidden_dim`` features.

  NOTE(review): the same ``embedding_layer`` embeds both the source tokens and
  the decoder input tokens, which is only meaningful when both use the same
  index space — confirm against the caller.
  NOTE(review): an earlier cell defines another class with this same name;
  running this cell replaces it.

  Args:
    embedding_dim: size of the vectors produced by ``embedding_layer``.
    hidden_dim: per-direction hidden size of the encoder LSTM.
    output_dim: number of classes produced per decoding step.
    n_layers: number of stacked LSTM layers.
    dropout: dropout passed to both LSTMs.
    embedding_layer: module mapping index tensors to embeddings.
    sos_token: index fed to the decoder at the first step. When ``None``
      (default) the module-level ``SOS_token`` is used, as before.
  """

  def __init__(self, embedding_dim, hidden_dim, output_dim, n_layers, dropout, embedding_layer, sos_token=None):
    super().__init__()

    self.output_dim = output_dim
    # Kept on the instance so forward() does not depend on notebook globals.
    self.hidden_dim = hidden_dim
    self.n_layers = n_layers
    self.sos_token = sos_token
    self.embedding = embedding_layer
    self.encoder = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, dropout=dropout, bidirectional=True)
    self.decoder = nn.LSTM(embedding_dim + 2 * hidden_dim, 2 * hidden_dim, num_layers=n_layers, dropout=dropout)
    self.W1 = nn.Linear(2 * hidden_dim, 2 * hidden_dim)
    self.W2 = nn.Linear(2 * hidden_dim, 2 * hidden_dim)
    self.V = nn.Linear(2 * hidden_dim, 1)
    self.fc = nn.Linear(2 * hidden_dim, output_dim)

  def forward(self, source, target, teacher_forcing_ratio=0.5):
    """Decode ``target.shape[1]`` steps for every source sequence.

    Args:
      source: index tensor of shape (batch_size, source_len).
      target: index tensor of shape (batch_size, target_len). Its length sets
        the number of decoding steps; its values are only read when teacher
        forcing is selected for a step.
      teacher_forcing_ratio: per-step probability of feeding ``target[:, t]``
        (instead of the model's own argmax) as the next decoder input.

    Returns:
      ``(outputs, attentions)`` with shapes
      (batch_size, target_len, output_dim) and
      (batch_size, target_len, source_len).
    """
    batch_size, source_len = source.shape
    target_len = target.shape[1]
    # Allocate on the input's device rather than a notebook-level global.
    run_device = source.device
    sos_index = SOS_token if self.sos_token is None else self.sos_token

    # Sequence-first LSTMs: transpose the batch-first source before embedding.
    embedded = self.embedding(source.transpose(0, 1))
    encoder_outputs, (hidden, cell) = self.encoder(embedded)

    # Split the stacked (layers * directions) axis, then concatenate the
    # forward and backward states of each layer along the feature axis so
    # they match the decoder's 2 * hidden_dim state size.
    hidden = hidden.view(self.n_layers, 2, batch_size, self.hidden_dim)
    cell = cell.view(self.n_layers, 2, batch_size, self.hidden_dim)
    decoder_hidden = torch.cat((hidden[:, 0, :, :], hidden[:, 1, :, :]), dim=2)
    decoder_cell = torch.cat((cell[:, 0, :, :], cell[:, 1, :, :]), dim=2)

    # Loop-invariant pieces of the attention computation.
    encoder_projection = self.W1(encoder_outputs)              # (source_len, batch, 2*hidden)
    encoder_outputs_batch_first = encoder_outputs.permute(1, 0, 2)

    decoder_input = torch.full((1, batch_size), sos_index, dtype=torch.long, device=run_device)
    outputs = torch.zeros(batch_size, target_len, self.output_dim, device=run_device)
    attentions = torch.zeros(batch_size, target_len, source_len, device=run_device)
    for t in range(target_len):
      embedded = self.embedding(decoder_input)                 # (1, batch, embedding_dim)
      # Query from the top decoder layer; broadcasts over source positions.
      query = self.W2(decoder_hidden[-1]).unsqueeze(0)         # (1, batch, 2*hidden)
      score = self.V(torch.tanh(encoder_projection + query))   # (source_len, batch, 1)
      # Normalise over source positions (dim 0). The previous dim=1 normalised
      # across the batch, so each example's weights did not sum to one.
      attention_weights = F.softmax(score, dim=0)
      context_vector = torch.bmm(attention_weights.permute(1, 2, 0), encoder_outputs_batch_first)
      context_vector = context_vector.permute(1, 0, 2)         # (1, batch, 2*hidden)
      embedded_with_context = torch.cat((embedded, context_vector), dim=2)
      decoder_output, (decoder_hidden, decoder_cell) = self.decoder(embedded_with_context, (decoder_hidden, decoder_cell))
      output = self.fc(decoder_output.squeeze(0))
      outputs[:, t, :] = output
      attentions[:, t, :] = attention_weights.squeeze(-1).transpose(0, 1)
      teacher_force = random.random() < teacher_forcing_ratio
      top1 = output.argmax(1)
      decoder_input = (target[:, t] if teacher_force else top1).unsqueeze(0)
    return outputs, attentions

In [None]:
model = Seq2SeqModel(embedding_dim, hidden_dim, output_dim, n_layers, dropout, embedding_layer).to(device)

## Loss Function and Optimizer

In [None]:
criterion =  nn.CrossEntropyLoss(ignore_index=PAD_token)
optimizer = optim.Adam(model.parameters(), lr=0.001)

## Training Loop

In [None]:
num_epochs = 30
# Log when the step index is a multiple of this value.
# NOTE(review): presumably chosen so that only the first step of each epoch is
# logged — confirm against len(train_loader).
log_every = 641
# Training stops as soon as a single batch loss falls to this value or below.
loss_stop_threshold = 0.005
stop_training = False

for epoch in range(num_epochs):
    model.train()
    for i, (source, target) in enumerate(train_loader):
        source = source.to(device)
        target = target.to(device)
        optimizer.zero_grad()
        outputs, _attentions = model(source, target, teacher_forcing_ratio=0.5)
        # Flatten (batch, steps, vocab) -> (batch * steps, vocab) for the loss.
        # Local names are used so the global `output_dim` hyperparameter is
        # not rebound inside the loop.
        logits = outputs.reshape(-1, outputs.shape[-1])
        loss = criterion(logits, target.reshape(-1))
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        # Read the scalar once; each .item() call synchronises with the device.
        loss_value = loss.item()
        if i % log_every == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss_value:.4f}")
        if loss_value <= loss_stop_threshold:
            print(f"Early stopping at epoch {epoch+1}, step {i+1} due to loss {loss_value:.4f} <= {loss_stop_threshold}")
            stop_training = True
            break
    if stop_training:
        break

In [None]:
torch.save(model.state_dict(), 'e_seq2seq_30.pth')

## Model Loading

In [None]:
# map_location remaps the saved tensors onto the current device, so a
# checkpoint written on a GPU runtime also loads on a CPU-only runtime.
# NOTE(review): this file name differs from the one written by the save cell
# above ('e_seq2seq_30.pth') — confirm this is the intended checkpoint and that
# it matches the Seq2SeqModel variant currently defined.
checkpoint_path = '/content/drive/MyDrive/CSE400 Dataset/e_seq2seq_v2_100.pth'
model.load_state_dict(torch.load(checkpoint_path, map_location=device))

## Model Evaluation

In [None]:
model.eval()

Seq2SeqModel(
  (embedding): Embedding(181521, 300)
  (encoder): LSTM(300, 512, num_layers=2, dropout=0.3)
  (decoder): LSTM(812, 512, num_layers=2, dropout=0.3)
  (W1): Linear(in_features=512, out_features=512, bias=True)
  (W2): Linear(in_features=512, out_features=512, bias=True)
  (V): Linear(in_features=512, out_features=1, bias=True)
  (fc): Linear(in_features=512, out_features=39133, bias=True)
)

In [None]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
from rouge import Rouge
def calculate_scores(references, predictions):
    """Score each prediction against its reference with BLEU and ROUGE.

    Args:
        references: list of token lists (one reference per example).
        predictions: list of token lists, aligned with ``references``.

    Returns:
        ``(bleu_scores, rouge_scores)``: a list with one smoothed sentence-BLEU
        value per pair (weights 0.8 / 0.2 on 1- and 2-grams), and a dict keyed
        by 'rouge-1', 'rouge-2', 'rouge-l', each holding per-pair lists under
        'F-1 Score', 'Precision' and 'Recall'.
    """
    scorer = Rouge()
    smoothing = SmoothingFunction().method4
    # (key used by the rouge package, label used in the returned dict)
    metric_labels = (('f', 'F-1 Score'), ('p', 'Precision'), ('r', 'Recall'))
    rouge_scores = {
        variant: {label: [] for _, label in metric_labels}
        for variant in ('rouge-1', 'rouge-2', 'rouge-l')
    }
    bleu_scores = []
    for ref, pred in zip(references, predictions):
        bleu = sentence_bleu([ref], pred, weights=(0.8, 0.2, 0, 0), smoothing_function=smoothing)
        bleu_scores.append(bleu)
        # The rouge package works on whitespace-joined strings: hypothesis first.
        pair_scores = scorer.get_scores(' '.join(pred), ' '.join(ref), avg=True)
        for variant, per_metric in rouge_scores.items():
            for metric, label in metric_labels:
                per_metric[label].append(pair_scores[variant][metric])
    return bleu_scores, rouge_scores


references = []
predictions = []
eos_index = summary_stoi["<eos>"]
pad_index = summary_stoi["<pad>"]

# Make sure dropout is disabled while scoring.
model.eval()
with torch.no_grad():
    for articles, summaries in test_loader:
        articles = articles.to(device)
        summaries = summaries.to(device)
        # teacher_forcing_ratio=0.0: the decoder must consume its own
        # predictions. With the default 0.5, gold summary tokens were fed to
        # the decoder during evaluation and leaked into the predictions.
        outputs, _ = model(articles, summaries, teacher_forcing_ratio=0.0)
        predicted_indices = outputs.argmax(dim=-1).tolist()
        for gold_ids, predicted_ids in zip(summaries.tolist(), predicted_indices):
            # Padding is not part of the reference summary.
            reference = [summary_itos[idx] for idx in gold_ids if idx != pad_index]
            if not reference:
                # Nothing to score against.
                continue
            prediction = []
            for idx in predicted_ids:
                if idx == eos_index:
                    break
                prediction.append(summary_itos[idx])
            # As in the original, every prediction ends with an "<eos>" marker.
            # Appending unconditionally avoids the IndexError that
            # `prediction[-1]` raised when the first predicted token was
            # <eos>, and keeps the hypothesis non-empty for the scorers.
            prediction.append("<eos>")
            references.append(reference)
            predictions.append(prediction)

bleu_scores, rouge_scores = calculate_scores(references, predictions)

print("Average BLEU score:", sum(bleu_scores) / len(bleu_scores))
print("Average ROUGE scores:")
for key in rouge_scores.keys():
    print(f"{key}:")
    for label in ['F-1 Score', 'Precision', 'Recall']:
        print(f"  {label}: {sum(rouge_scores[key][label]) / len(rouge_scores[key][label])}")

In [None]:
def generate_summary(input_text, max_input_len=288, max_summary_len=15):
    """Greedy-decode a summary for one raw article string.

    The text goes through the same preprocessing as the training data
    (clean_text, word_tokenize, stop-word removal, lemmatization, padding)
    and is decoded in a single forward pass with teacher forcing disabled.

    Args:
        input_text: raw article text.
        max_input_len: length the article is padded/truncated to
            (default 288, the value used when building the training tensors).
        max_summary_len: number of decoding steps (default 15).

    Returns:
        The predicted summary tokens joined by single spaces; decoding output
        is cut at the first "<eos>" and "<sos>"/"<pad>" tokens are dropped.
    """
    tokens = word_tokenize(clean_text(input_text))
    tokens = [token for token in tokens if token not in stop_words]
    # Training articles were lemmatized, so inference input must be as well.
    tokens = lemmatize_tokens(tokens)
    sequence = pad_tokens(tokens, article_stoi, max_input_len)
    input_tensor = torch.tensor(sequence, dtype=torch.long, device=device).unsqueeze(0)

    # The model's forward() uses `target` only for its length and for teacher
    # forcing. With teacher_forcing_ratio=0.0 its values are never fed back,
    # so a placeholder of the desired length yields a deterministic greedy
    # decode in one pass (the original re-ran the whole model every step with
    # random teacher forcing on a growing, misaligned target).
    placeholder_target = torch.full((1, max_summary_len), PAD_token, dtype=torch.long, device=device)
    with torch.no_grad():
        outputs, _ = model(input_tensor, placeholder_target, teacher_forcing_ratio=0.0)
    predicted_ids = outputs.argmax(dim=-1)[0].tolist()

    predicted_words = []
    for index in predicted_ids:
        if index == EOS_token:
            break
        if index == SOS_token or index == PAD_token:
            continue
        # "<unk>" needs no special case: summary_itos already maps it to "<unk>".
        predicted_words.append(summary_itos[index])
    return ' '.join(predicted_words)

# Take the first example of the third test batch as a demo input.
# Distinct names are used so the X_test / y_test split tensors are not
# overwritten (re-running the dataset cell afterwards would otherwise wrap a
# single example instead of the test split).
iter_test_loader = iter(test_loader)
for batch_number in range(3):
    sample_batch = next(iter_test_loader)
sample_article, sample_summary = sample_batch[0][0], sample_batch[1][0]

# Map indices back to tokens, dropping padding.
sample_article_tokens = [article_itos[idx] for idx in sample_article.tolist() if article_itos[idx] != "<pad>"]
sample_summary_tokens = [summary_itos[idx] for idx in sample_summary.tolist() if summary_itos[idx] != "<pad>"]
input_text = ' '.join(sample_article_tokens)
contents = ' '.join(sample_summary_tokens)
print("Input Text: " + input_text)
print("Sample Summary: " + contents)
print("Generated Summary: " + generate_summary(input_text))