In [None]:
# References:
# https://towardsdatascience.com/text-summarization-from-scratch-using-encoder-decoder-network-with-attention-in-keras-5fa80d12710e - Why both encoder
# and decoder for text summarization
# https://towardsdatascience.com/build-your-own-transformer-from-scratch-using-pytorch-84c850470dcb - Majority of the source code
# https://towardsdatascience.com/transformers-explained-visually-part-1-overview-of-functionality-95a6dd460452 - Inference algo
# https://towardsdatascience.com/transformers-explained-visually-part-2-how-it-works-step-by-step-b49fa4a64f34 - Visual explanation


In [None]:
# importing libraries and setting device type

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import math
import copy
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# mounting drive

from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

%cd '/content/drive/MyDrive/AML_final_project'

Mounted at /content/drive/
/content/drive/.shortcut-targets-by-id/1kYtz2nPcpky_CSkXMRMwo7J9aQIQpSEz/AML_final_project


# Architecture

In [None]:
# defining class for multi-head attention

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        
    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        attn_probs = torch.softmax(attn_scores, dim=-1)
        output = torch.matmul(attn_probs, V)
        return output
        
    def split_heads(self, x):
        batch_size, seq_length, d_model = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
        
    def combine_heads(self, x):
        batch_size, _, seq_length, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)
        
    def forward(self, Q, K, V, mask=None):
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))
        
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        output = self.W_o(self.combine_heads(attn_output))
        return output

In [None]:
# defining feedforward layer

class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

In [None]:
# defining positional encoding

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()
        
        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        
        self.register_buffer('pe', pe.unsqueeze(0))
        
    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

In [None]:
# defining the encoder class

class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, mask):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

In [None]:
# defining the decoder class

class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, enc_output, src_mask, tgt_mask):
        attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))
        attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x

In [None]:
# merging it all together

class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool().to(device)
        tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
      src_mask, tgt_mask = self.generate_mask(src, tgt)
      src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
      tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

      enc_output = src_embedded
      for enc_layer in self.encoder_layers:
          enc_output = enc_layer(enc_output, src_mask)

      dec_output = tgt_embedded
      for dec_layer in self.decoder_layers:
          dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

      output = self.fc(dec_output)
      return output

# Data Prep

In [None]:
# reading in full data

full_data = pd.read_excel('Inshorts_Cleaned_Data.xlsx')
full_data.head()

Unnamed: 0,Headline,Short,Source,Time,Publish Date
0,4 ex-bank officials booked for cheating bank o...,The CBI on Saturday booked four former officia...,The New Indian Express,09:25:00,2017-03-26
1,Supreme Court to go paperless in 6 months: CJI,Chief Justice JS Khehar has said the Supreme C...,Outlook,22:18:00,2017-03-25
2,"At least 3 killed, 30 injured in blast in Sylh...","At least three people were killed, including a...",Hindustan Times,23:39:00,2017-03-25
3,Why has Reliance been barred from trading in f...,Mukesh Ambani-led Reliance Industries (RIL) wa...,Livemint,23:08:00,2017-03-25
4,Was stopped from entering my own studio at Tim...,TV news anchor Arnab Goswami has said he was t...,YouTube,23:24:00,2017-03-25


In [None]:
## preprocessing

contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",

                           "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",

                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",

                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",

                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",

                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",

                           "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",

                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",

                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",

                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",

                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",

                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",

                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",

                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",

                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",

                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",

                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",

                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",

                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",

                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",

                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",

                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",

                           "you're": "you are", "you've": "you have"}


nltk.download('stopwords')    # downloading stopwords

stop_words = stopwords.words('english')

# function to carry out preprocessing
def preprocess(text):
    text = text.lower() # lowercase
    text = text.split() # convert have'nt -> have not
    for i in range(len(text)):
        word = text[i]
        if word in contraction_mapping:
            text[i] = contraction_mapping[word]
    text = " ".join(text)
    text = text.split()
    newtext = []
    for word in text:
        if word not in stop_words:
            newtext.append(word)
    text = " ".join(newtext)
    text = text.replace("'s",'') # convert your's -> your
    text = re.sub(r'\(.*\)','',text) # remove (words)
    text = re.sub(r'[^a-zA-Z0-9. ]','',text) # remove punctuations
    text = re.sub(r'\.',' . ',text)
    return text

# executing preprocessing
full_data['Headline'] = full_data['Headline'].apply(lambda x:preprocess(x))
full_data['Short'] = full_data['Short'].apply(lambda x:preprocess(x))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# splitting into train, validation and test

train_df, validate_df, test_df = \
              np.split(full_data.sample(frac=1, random_state=42), 
                       [int(.6*len(full_data)), int(.8*len(full_data))])
              
# resetting index for newly created df's
train_df = train_df.reset_index()
validate_df = validate_df.reset_index()
test_df = test_df.reset_index()


In [None]:
# defining classes and functions to prep the data

SOS_token = 0
EOS_token = 1


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1


def readLangs(text, summary, reverse=False):
    print("Reading lines...")
    
    # Split every line into pairs and normalize
    pairs = [[text[i],summary[i]] for i in range(len(text))]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(summary)
        output_lang = Lang(text)
    else:
        input_lang = Lang(text)
        output_lang = Lang(summary)

    return input_lang, output_lang, pairs


def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    # print("Counted words:")
    # print(input_lang.name, input_lang.n_words)
    # print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs


def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair, input_lang, output_lang):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)


# setting max sequence lengths for padding purposes (IMP: These are arbitrary and need to be estimated using percentile analysis)
max_src_length = 90 
max_tgt_length = 15

### Training data prep

In [None]:
x = train_df['Short']
y = train_df['Headline']
train_input_lang, train_output_lang, train_pairs = prepareData(x, y ,False)
print(random.choice(train_pairs))

Reading lines...
Read 33062 sentence pairs
Counting words...
['coming home leg champions league round 16 20 lead barcelona defeated arsenal 31 wednesday advance quarterfinals tournament 51 aggregate score .  barcelona forward trio neymar luis surez lionel messi scored one each mohamed elnenny sole scorer gunners . ', 'barcelona advance champions league quarters']


In [None]:
# creating training pairs in tensor form
tensor_pairs = [tensorsFromPair(pair, train_input_lang, train_output_lang) for pair in train_pairs]

# creating source data after padding
src_data = torch.empty((1, max_src_length), dtype=int, device=device)
for pair in tensor_pairs:
  src_tensor = pair[0]
  pad_len = max_src_length - src_tensor.shape[0]
  padded_src = F.pad(input=src_tensor, pad=(0, 0, 0, pad_len),
                     mode='constant', value=0).reshape(1, max_src_length)
  src_data = torch.cat((src_data, padded_src))                

src_data = src_data[1:,:]   # final version of source data tensors

# creating target data after padding
tgt_data = torch.empty((1, max_tgt_length), dtype=int, device=device)
for pair in tensor_pairs:
  tgt_tensor = pair[1]
  pad_len = max_tgt_length - tgt_tensor.shape[0]
  padded_tgt = F.pad(input=tgt_tensor, pad=(0, 0, 0, pad_len),
                     mode='constant', value=0).reshape(1, max_tgt_length)
  tgt_data = torch.cat((tgt_data, padded_tgt))                

tgt_data = tgt_data[1:,:]   # final version of target data tensors

# cross-checking shapes
print("Shape of source data tensor matrix is", src_data.shape)
print("Shape of target data tensor matrix is", tgt_data.shape)

Shape of source data tensor matrix is torch.Size([33062, 90])
Shape of target data tensor matrix is torch.Size([33062, 15])


In [None]:
# creating custom dataset for training data

class NewsDataset(Dataset):
    def __init__(self, src_data, tgt_data):
        self.src_data = src_data
        self.tgt_data = tgt_data
    def __getitem__(self, idx):
        src = self.src_data[idx]
        tgt = self.tgt_data[idx]      
        return src, tgt
    def __len__(self):
        return len(self.src_data)

# instantiating
train_iter = NewsDataset(src_data, tgt_data)


In [None]:
# creating dataloader

batch_size = 16

# instantiate dataloader objects for training data
train_dataloader = DataLoader(train_iter, batch_size=batch_size)

# testing dataloader
for idx, (src, tgt) in enumerate(train_dataloader):
    print(idx, src.shape, tgt.shape)
    if idx == 4: 
        break

0 torch.Size([16, 90]) torch.Size([16, 15])
1 torch.Size([16, 90]) torch.Size([16, 15])
2 torch.Size([16, 90]) torch.Size([16, 15])
3 torch.Size([16, 90]) torch.Size([16, 15])
4 torch.Size([16, 90]) torch.Size([16, 15])


### Validation data prep

In [None]:
x = validate_df['Short']
y = validate_df['Headline']
val_input_lang, val_output_lang, val_pairs = prepareData(x, y ,False)
print(random.choice(val_pairs))

Reading lines...
Read 11021 sentence pairs
Counting words...
['swedish automaker volvo39s custombuilt truck 39the iron knight39 become world39s fastest truck covering kilometre 21 . 29 seconds .  truck achieved average speed 169 kmph powered ishift dual clutch transmission .  also became world39s fastest truck cover 500metre distance 13 . 71 seconds . ', 'world39s fastest truck covers 1 kilometre 21 . 2 secs']


In [None]:
# creating validation pairs in tensor form
tensor_pairs = [tensorsFromPair(pair, val_input_lang, val_output_lang) for pair in val_pairs]

# creating source data after padding
src_data = torch.empty((1, max_src_length), dtype=int, device=device)
for pair in tensor_pairs:
  src_tensor = pair[0]
  pad_len = max_src_length - src_tensor.shape[0]
  padded_src = F.pad(input=src_tensor, pad=(0, 0, 0, pad_len),
                     mode='constant', value=0).reshape(1, max_src_length)
  src_data = torch.cat((src_data, padded_src))                

src_data = src_data[1:,:]   # final version of source data tensors

# creating target data after padding
tgt_data = torch.empty((1, max_tgt_length), dtype=int, device=device)
for pair in tensor_pairs:
  tgt_tensor = pair[1]
  pad_len = max_tgt_length - tgt_tensor.shape[0]
  padded_tgt = F.pad(input=tgt_tensor, pad=(0, 0, 0, pad_len),
                     mode='constant', value=0).reshape(1, max_tgt_length)
  tgt_data = torch.cat((tgt_data, padded_tgt))                

tgt_data = tgt_data[1:,:]   # final version of target data tensors

# cross-checking shapes
print("Shape of source data tensor matrix is", src_data.shape)
print("Shape of target data tensor matrix is", tgt_data.shape)

Shape of source data tensor matrix is torch.Size([11021, 90])
Shape of target data tensor matrix is torch.Size([11021, 15])


In [None]:
# creating custom dataset for validation data

class NewsDataset(Dataset):
    def __init__(self, src_data, tgt_data):
        self.src_data = src_data
        self.tgt_data = tgt_data
    def __getitem__(self, idx):
        src = self.src_data[idx]
        tgt = self.tgt_data[idx]      
        return src, tgt
    def __len__(self):
        return len(self.src_data)

# instantiating
val_iter = NewsDataset(src_data, tgt_data)


In [None]:
# creating dataloader

batch_size = 16

# instantiate dataloader objects for validation data
val_dataloader = DataLoader(val_iter, batch_size=batch_size)

# testing dataloader
for idx, (src, tgt) in enumerate(val_dataloader):
    print(idx, src.shape, tgt.shape)
    if idx == 4: 
        break

0 torch.Size([16, 90]) torch.Size([16, 15])
1 torch.Size([16, 90]) torch.Size([16, 15])
2 torch.Size([16, 90]) torch.Size([16, 15])
3 torch.Size([16, 90]) torch.Size([16, 15])
4 torch.Size([16, 90]) torch.Size([16, 15])


### Test data prep

In [None]:
x = test_df['Short']
y = test_df['Headline']
test_input_lang, test_output_lang, test_pairs = prepareData(x, y ,False)
print(random.choice(test_pairs))

Reading lines...
Read 11021 sentence pairs
Counting words...
['reality television star kim kardashian denied reports new sex tape featuring leaked online saying 34it39s new it39s old tape . 34 report claimed video showed woman 34appearing kim34 bra showing cleavage man .  notably kim39s sex tape boyfriend leaked 2007 . ', 'kim kardashian denies rumours new sex tape leaked']


In [None]:
# creating test pairs in tensor form
tensor_pairs = [tensorsFromPair(pair, test_input_lang, test_output_lang) for pair in test_pairs]

# creating source data after padding
src_data = torch.empty((1, max_src_length), dtype=int, device=device)
for pair in tensor_pairs:
  src_tensor = pair[0]
  pad_len = max_src_length - src_tensor.shape[0]
  padded_src = F.pad(input=src_tensor, pad=(0, 0, 0, pad_len),
                     mode='constant', value=0).reshape(1, max_src_length)
  src_data = torch.cat((src_data, padded_src))                

src_data = src_data[1:,:]   # final version of source data tensors

# creating target data after padding
tgt_data = torch.empty((1, max_tgt_length), dtype=int, device=device)
for pair in tensor_pairs:
  tgt_tensor = pair[1]
  pad_len = max_tgt_length - tgt_tensor.shape[0]
  padded_tgt = F.pad(input=tgt_tensor, pad=(0, 0, 0, pad_len),
                     mode='constant', value=0).reshape(1, max_tgt_length)
  tgt_data = torch.cat((tgt_data, padded_tgt))                

tgt_data = tgt_data[1:,:]   # final version of target data tensors

# cross-checking shapes
print("Shape of source data tensor matrix is", src_data.shape)
print("Shape of target data tensor matrix is", tgt_data.shape)

Shape of source data tensor matrix is torch.Size([11021, 90])
Shape of target data tensor matrix is torch.Size([11021, 15])


In [None]:
# creating custom dataset for test data

class NewsDataset(Dataset):
    def __init__(self, src_data, tgt_data):
        self.src_data = src_data
        self.tgt_data = tgt_data
    def __getitem__(self, idx):
        src = self.src_data[idx]
        tgt = self.tgt_data[idx]      
        return src, tgt
    def __len__(self):
        return len(self.src_data)

# instantiating
test_iter = NewsDataset(src_data, tgt_data)


In [None]:
# creating dataloader

batch_size = 16

# instantiate dataloader objects for test data
test_dataloader = DataLoader(test_iter, batch_size=batch_size)

# testing dataloader
for idx, (src, tgt) in enumerate(test_dataloader):
    print(idx, src.shape, tgt.shape)
    if idx == 4: 
        break

0 torch.Size([16, 90]) torch.Size([16, 15])
1 torch.Size([16, 90]) torch.Size([16, 15])
2 torch.Size([16, 90]) torch.Size([16, 15])
3 torch.Size([16, 90]) torch.Size([16, 15])
4 torch.Size([16, 90]) torch.Size([16, 15])


## Model training

In [None]:
# evaluation functions for validation and test data

def evaluate(model, data_loader):
    '''
    Evaluate the model on the given data.
    '''

    model.eval()
    it = iter(data_loader)
    total_count = 0 # Number of target words seen
    total_loss = 0 # Loss over all target words
    with torch.no_grad():
        # No gradients need to be maintained during evaluation
        # There are no hidden tensors for the first batch, and so will default to zeros.
        for i, batch in enumerate(it):
            ''' Do the following:
                - Extract the text and target from the batch, and if using CUDA (essentially, using GPUs), place 
                  the tensors on cuda, using a commands such as "text = text.cuda()".  More details are at
                  https://pytorch.org/docs/stable/notes/cuda.html.
                - Pass the hidden state vector from output of previous batch as the initial hidden vector for
                  the current batch. 
                - Call forward propagation to get output and final hidden state vector.
                - Compute the cross entropy loss
                - The loss_fn computes the average loss per target word in the batch.  Count the number of target
                  words in the batch (it is usually the same, except for the last batch), and use it to track the 
                  total count (of target words) and total loss see so far over all batches.
            '''
            src, tgt = batch
            output = transformer(src, tgt[:, :-1])
            loss = criterion(output.contiguous().view(-1, tgt_vocab_size), tgt[:, 1:].contiguous().view(-1))
            total_count += np.multiply(*src.size())
            total_loss += loss.item()*np.multiply(*src.size())

    loss = total_loss / total_count
    model.train()
    return loss

In [None]:
# instantiating parameters and transformer model instance

src_vocab_size = train_input_lang.n_words
tgt_vocab_size = train_output_lang.n_words
d_model = 512   # dimension of embedding
num_heads = 8   # number of attention heads
num_layers = 6  # number of encoder and decoder layers
d_ff = 2048     # dimension of feedforward layer input
dropout = 0.1

transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, 
                          num_layers, d_ff, max_src_length, dropout)
transformer = transformer.to(device)    # getting model to run on CUDA

In [None]:
# training the model

criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
train_log_interval = 200
val_log_interval = 500

val_losses = []
best_model = None

for epoch in range(10):
  transformer.train()
  print("######### EPOCH {} ##########".format(epoch + 1))
  for idx, (src, tgt) in enumerate(train_dataloader):
    transformer.zero_grad()
    output = transformer(src, tgt[:, :-1])    # why removing the last column in target?
    loss = criterion(output.contiguous().view(-1, tgt_vocab_size), tgt[:, 1:].contiguous().view(-1))
    loss.backward()
    optimizer.step()

    if (idx + 1) % train_log_interval == 0:
      print(f'At iteration {idx + 1} the TRAINING loss is {loss:.3f}.')

    if (idx + 1) % val_log_interval == 0:
      val_loss = evaluate(transformer, val_dataloader)
      val_losses.append(val_loss)
      print(f'At iteration {idx + 1} the VALIDATION loss is {val_loss:.3f}.')
      if val_loss <= min(val_losses):
          print("Best model changed at iteration", idx + 1)
          best_model = type(transformer)(src_vocab_size, tgt_vocab_size, d_model, num_heads, 
                        num_layers, d_ff, max_src_length, dropout) # get a new instance
          best_model.to(device=device)
          best_model.load_state_dict(transformer.state_dict()) # copy weights
  
  print()
  print(f"At end of Epoch: {epoch+1}, Training Loss: {loss.item()}")

  print()
  print()


######### EPOCH 1 ##########
At iteration 200 the TRAINING loss is 7.882.
At iteration 400 the TRAINING loss is 8.093.
At iteration 500 the VALIDATION loss is 8.801.
Best model changed at iteration 500
At iteration 600 the TRAINING loss is 8.018.
At iteration 800 the TRAINING loss is 7.313.
At iteration 1000 the TRAINING loss is 8.134.
At iteration 1000 the VALIDATION loss is 8.845.
At iteration 1200 the TRAINING loss is 7.764.
At iteration 1400 the TRAINING loss is 7.635.
At iteration 1500 the VALIDATION loss is 8.945.
At iteration 1600 the TRAINING loss is 8.160.
At iteration 1800 the TRAINING loss is 7.602.
At iteration 2000 the TRAINING loss is 7.734.
At iteration 2000 the VALIDATION loss is 8.996.

At end of Epoch: 1, Training Loss: 7.173739433288574


######### EPOCH 2 ##########
At iteration 200 the TRAINING loss is 6.882.
At iteration 400 the TRAINING loss is 7.375.
At iteration 500 the VALIDATION loss is 9.251.
At iteration 600 the TRAINING loss is 7.556.
At iteration 800 the 

In [None]:
# saving the best models to disk

torch.save(best_model.state_dict(), "/content/drive/MyDrive/AML_final_project/best_model_weights/best_transformer_val_model.pt")
torch.save(transformer.state_dict(), "/content/drive/MyDrive/AML_final_project/best_model_weights/best_transformer_train_model.pt")

# Model evaluation

In [None]:
# loading the trained model from disk

trained_model = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, 
                          num_layers, d_ff, max_src_length, dropout)
trained_model = trained_model.to(device)
trained_model.load_state_dict(torch.load("/content/drive/MyDrive/AML_final_project/best_model_weights/best_transformer_train_model.pt"))



NameError: ignored

In [None]:
# defining functions for visual model evaluation

def evaluate(model, input_lang, output_lang, sentence, max_len = 15):
    '''
    Same as training procedure above, except there are no targets.
    '''
    with torch.no_grad():

      # prepping input sentence
      input_tensor = tensorFromSentence(input_lang, sentence)
      input_tensor = input_tensor.reshape(1, input_tensor.size()[0])

      decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS
      decoded_words = []

      for di in range(max_len):
        output = model(input_tensor, decoder_input)

        # extracting vector corresponding to last word
        last = output.squeeze(0)[-1, :].unsqueeze(0)

        # doing softmax
        m = nn.Softmax(dim=1)
        output_probs = m(last)

        # extracting index with highest probability
        idx = torch.argmax(last, dim=1)
        print("index is", idx)
        if idx.item() == EOS_token:
            decoded_words.append('<EOS>')
            break
        else:
            decoded_words.append(output_lang.index2word[idx.item()])

        decoder_input = torch.cat((decoder_input, idx.unsqueeze(0)), 1).to(device)

      return decoder_input, decoded_words


def evaluateRandomly(model, input_lang, output_lang, pairs, n=5):
    '''
    Evaluate random sentences from the dataset and print out the text, actual
    summary and predicted summary to make some subjective quality judgements.
    '''
    for i in range(n):
        pair = random.choice(pairs)
        print('Text:', pair[0])
        print('Actual summary:', pair[1])
        decoder_input, decoded_words = evaluate(model, 
                                                input_lang,
                                                output_lang,
                                                pair[0])
        output_sentence = ' '.join(decoded_words)
        print('Predicted summary:', output_sentence)
        print('')

## Visual inspection on training data predictions



In [None]:
evaluateRandomly(trained_model, train_input_lang, train_output_lang, train_pairs)

Text: islamic state  acknowledged death 39jihadi john39 british maskedmilitant appearing videos depicting beheadings western hostages site intelligence group reported tuesday .  published eulogy englishlanguage magazine 39dabiq39 report added .  us military november claimed killed 39jihadi john39 dronestrike . 
Actual summary: acknowledges death jihadi john report
Predicted summary: app helps man killed syria <EOS>

Text: couple us state oklahoma clemma sterling elmore photoshoot inspired 2004 movie 39the notebook39 celebrate 57 years togetherness .  setting shoot complete vintage babyblue truck 1940sstyle clothing handwritten love notes .  photoshoot organised granddaughter done stacy welch christ . 
Actual summary: couple 39the notebook39 inspired photoshoot
Predicted summary: reelect house new year jail <EOS>

Text: actress aishwarya rai bachchan best actress award role 39sarbjit39 international film festival awards australia .  based life sarabjit singh indian prisoner pakistan fil

## Visual inspection on test data predictions

In [None]:
evaluateRandomly(trained_model, test_input_lang, test_output_lang, test_pairs)

Text: nita ambani indias first female member international olympic committee became countrys first woman award medals olympic prize distribution ceremony .  ambani gave away medals winners womens 400metre freestyle swimming event monday .  ambani elected ioc august 4 remain member till turns 70 . 
Actual summary: nita 1st indian woman award olympic medals
Predicted summary: university home wounded dimple grass <EOS>

Text: ban highdenomination currency led mismatch cash supply figures recent report suggesting people withdrawn 60000 crore actual currency circulation january 13 .  total currency circulation reported 9 . 1 lakh crore public already withdrawn close 9 . 7 lakh crore . 
Actual summary: cash withdrawn 60000 cr circulation report
Predicted summary: startups ipo spa irans olav exhusband <EOS>

Text: congress vice president rahul gandhi tuesday said pressure government decided rollback tax employees39 provident fund  withdrawals .  said 34whenever somebody oppressed wrongly vict

## Calculating BLEU score on test data predictions

In [None]:
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# creating the necessary objects

predicted_summaries = []
actual_summaries = []

# actual summaries
for pair in test_pairs:
  actual_summaries.append(pair[1])

# predicted summaries
for pair in test_pairs:
  decoder_input, decoded_words = evaluate(trained_model, 
                                          test_input_lang,
                                          test_output_lang,
                                          pair[0])
  output_sentence = ' '.join(decoded_words)
  predicted_summaries.append(output_sentence)


index is tensor([773], device='cuda:0')
index is tensor([642], device='cuda:0')
index is tensor([357], device='cuda:0')
index is tensor([943], device='cuda:0')
index is tensor([1432], device='cuda:0')
index is tensor([1], device='cuda:0')
index is tensor([789], device='cuda:0')
index is tensor([642], device='cuda:0')
index is tensor([694], device='cuda:0')
index is tensor([2142], device='cuda:0')
index is tensor([3789], device='cuda:0')
index is tensor([1], device='cuda:0')
index is tensor([1649], device='cuda:0')
index is tensor([697], device='cuda:0')
index is tensor([3546], device='cuda:0')
index is tensor([1834], device='cuda:0')
index is tensor([61], device='cuda:0')
index is tensor([862], device='cuda:0')
index is tensor([240], device='cuda:0')
index is tensor([1], device='cuda:0')
index is tensor([4205], device='cuda:0')
index is tensor([530], device='cuda:0')
index is tensor([107], device='cuda:0')
index is tensor([397], device='cuda:0')
index is tensor([1294], device='cuda:0')

KeyError: ignored

In [None]:
# calculating the score

import evaluate

bleu = evaluate.load("google_bleu")
total_bleu_score = 0

for prediction, actual in zip(predicted_summaries, actual_summaries):
    total_bleu_score = total_bleu_score + bleu.compute(predictions=[prediction],
                                                       references=[actual])['google_bleu']

avg_bleu_score = total_bleu_score/len(predicted_summaries)
print("The average BLEU score is", avg_bleu_score)

The average BLEU score is 0.006912580596791122


In [None]:
predicted_summaries

['praneeth larger forgotten planet 20 <EOS>',
 'talks give free crew39s kohinoor <EOS>',
 'mushkil pact haters obamamichelle39s bombing scanning <EOS>',
 'university dimple gandhi footballer dutchman <EOS>',
 'venues 1 govt profit ae <EOS>',
 'quarterfinal dimple members gandhi footballer larger <EOS>',
 'undeclared jasprit stock bunting nationals tyson <EOS>',
 '32 depot netanyahu39s given playing film law offers <EOS>',
 'startups cancer fb trailer irans <EOS>',
 'larger listed tendulkar39s gandhi jackman39s <EOS>',
 'university dimple smartphones h1b bse power <EOS>',
 'kohinoor kerala gandhi spider centre bipasha <EOS>',
 'delayed gandhi spider fade clean energy <EOS>',
 'removed gandhi foreign jawan zurich <EOS>',
 'university university netanyahu39s paid reveals pok apologise 2919 <EOS>',
 'suspended scanning dimple russia basketball <EOS>',
 'rajya criticism39 rising feature cr sasikala rising <EOS>',
 'praneeth larger nonstop stage whatever hurdles <EOS>']