Load Dataset from Colab

The XSum dataset in the Google Drive folder was generated by following this documentation: https://github.com/EdinburghNLP/XSum/blob/master/XSum-Dataset/README.md

In [None]:
# Install/upgrade gdown, which the next line uses to fetch the archive from Google Drive.
!pip install --upgrade --no-cache-dir gdown
# Download the zipped dataset by its Drive file id.
!gdown https://drive.google.com/uc?id=1qgnZ-_N60Wd8LBTID-JYTxo7AfTW6a_C

# Unpack quietly (-q), overwriting files from any earlier run (-o), then remove
# the __MACOSX/ folder if the archive produced one.
!unzip -q -o xsum-extracts-from-downloads.zip
!rm -rf __MACOSX/

Downloading...
From: https://drive.google.com/uc?id=1qgnZ-_N60Wd8LBTID-JYTxo7AfTW6a_C
To: /content/xsum-extracts-from-downloads.zip
100% 276M/276M [00:01<00:00, 158MB/s]


Loading/Cleaning X-sum data

In [None]:
import spacy

# Single-character tokens that are treated as punctuation and removed from
# article text. A raw string is used so the backslash is a literal character
# instead of starting the invalid escape sequence "\,", which newer Python
# versions warn about. The resulting characters are unchanged.
punctuation_list = list(r'''!()-[]{};:'"\,<>./?@#$%^&*_~''')
# Shared spaCy pipeline used by both preprocess_* functions below. The parser,
# tagger and NER components are disabled; those functions only read
# token.text and token.is_stop.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'ner'])    
def preprocess_article_texts(texts):
    """Tokenize article texts, dropping stop words and punctuation tokens.

    Every text is run through the module-level spaCy pipeline ``nlp``. A token
    is kept only if spaCy does not flag it as a stop word and its text is not
    one of the entries of ``punctuation_list``. The running count is printed
    after every 10000 texts.

    Args:
        texts: iterable of article strings.

    Returns:
        A list holding one list of token strings per input text.
    """
    cleaned_docs = []
    for count, doc in enumerate(nlp.pipe(texts), start=1):
        kept_tokens = [
            tok.text
            for tok in doc
            if not tok.is_stop and tok.text not in punctuation_list
        ]
        cleaned_docs.append(kept_tokens)

        # progress indicator for long runs
        if count % 10000 == 0:
            print(count)
    return cleaned_docs

def preprocess_summary_texts(texts):
    """Tokenize summary texts without filtering any tokens.

    Every text is run through the module-level spaCy pipeline ``nlp`` and all
    of its tokens (stop words and punctuation included) are kept. The running
    count is printed after every 10000 texts.

    Args:
        texts: iterable of summary strings.

    Returns:
        A list holding one list of token strings per input text.
    """
    summary_tokens = []
    for count, doc in enumerate(nlp.pipe(texts), start=1):
        summary_tokens.append([tok.text for tok in doc])

        # progress indicator for long runs
        if count % 10000 == 0:
            print(count)
    return summary_tokens

In [None]:
# loading data
import os
data_dir = "xsum-extracts-from-downloads/"
max_files = 1000  # how many extract files to load

# Section markers inside each extract file (compared against raw lines, so the
# trailing newline is part of the marker).
_INTRO_MARKER = '[XSUM]INTRODUCTION[XSUM]\n'
_BODY_MARKER = '[XSUM]RESTBODY[XSUM]\n'

# Some article bodies start with a social-sharing block. It opens with
# 'share this with' and is closed by the first of these lines that is present.
_SHARE_HEADER = 'share this with'
_SHARE_END_MARKERS = (
    'these are external links and will open in a new window',
    'copy this link',
)


def _strip_share_block(body_lines):
    """Remove a leading 'share this with' block from lower-cased body lines.

    Args:
        body_lines: list of lower-cased body lines without trailing newlines.

    Returns:
        ``(lines, found_end)``. ``lines`` is the body after the sharing block
        (or the unchanged body when there is no such block or its end cannot
        be located). ``found_end`` is False only when the block is present but
        none of the known end markers is.
    """
    # An empty body has no first line to inspect; treat it as "nothing to strip".
    if not body_lines or body_lines[0] != _SHARE_HEADER:
        return body_lines, True
    for marker in _SHARE_END_MARKERS:
        if marker in body_lines:
            return body_lines[body_lines.index(marker) + 1:], True
    return body_lines, False


summary_list = []
article_text_list = []
# os.listdir returns names in arbitrary order; sorting makes the selected
# subset the same on every run.
data_files = sorted(os.listdir(data_dir))
for i, data_file in enumerate(data_files[:max_files]):
    # Explicit encoding so decoding does not depend on the platform default.
    # NOTE(review): assumes the extracts are UTF-8 - confirm against the data.
    with open(os.path.join(data_dir, data_file), encoding='utf-8') as file:
        lines = file.readlines()

    # The summary is the single line following the introduction marker.
    # list.index raises ValueError if a file lacks one of the markers.
    summary = lines[lines.index(_INTRO_MARKER) + 1].replace('\n', '').lower()

    article_start_idx = lines.index(_BODY_MARKER) + 1
    split_article_text = [line.replace('\n', '').lower() for line in lines[article_start_idx:]]

    split_article_text, found_end = _strip_share_block(split_article_text)
    if not found_end:
        # Keep the whole body but report it so the file can be inspected.
        print("Couldn't find ending of article " + str(i))
        print(split_article_text)

    article_text = ' '.join(split_article_text)
    summary_list.append(summary)
    article_text_list.append(article_text)

# From here on both lists hold token lists (one per document), not raw strings.
article_text_list = preprocess_article_texts(article_text_list)
summary_list = preprocess_summary_texts(summary_list)
print(len(summary_list), len(article_text_list))

1000 1000


In [None]:
from tqdm import tqdm

import numpy as np

# create a vocabulary: count how often each token occurs across all articles
vocab_freq = {}
for article_text in tqdm(article_text_list, position=0, leave=True):
    for word in article_text:
        vocab_freq[word] = vocab_freq.get(word, 0) + 1

print("Number of words total: " + str(len(vocab_freq)))

# Build both arrays in one step. Calling np.append inside a loop copies the
# whole array on every iteration, which is quadratic in the vocabulary size.
words = np.array(list(vocab_freq.keys()), dtype=str)
freqs = np.array(list(vocab_freq.values()), dtype=float)

# sort words by descending frequency
index = np.argsort(freqs)[::-1]
words = words[index]
freqs = freqs[index]
print(words)

# take the top vocab_size freq words
vocab_size = 1000
words = words[:vocab_size]

# assign a new index to each word; indices 0-3 are reserved for special tokens
word_to_idx = {'[PAD]': 0, '[SOS]': 1, '[EOS]': 2, '[OOV]': 3}
for i, word in enumerate(words):
    # str() turns the NumPy string scalar into a plain Python str key
    word_to_idx[str(word)] = i + 4
idx_to_word = {value: key for key, value in word_to_idx.items()}

100%|██████████| 1000/1000 [00:00<00:00, 5333.19it/s]


Number of words total: 25303
['said' 'mr' 'people' ... 'wean' 'â£19' 'succumbed']


In [None]:
# Map every token to its vocabulary index; tokens that are not in word_to_idx
# are replaced by the index of the '[OOV]' entry.
oov_idx = word_to_idx['[OOV]']

article_tokenized_list = [
    [word_to_idx.get(token, oov_idx) for token in tokens]
    for tokens in article_text_list
]
print(article_tokenized_list[:10])

summary_tokenized_list = [
    [word_to_idx.get(token, oov_idx) for token in tokens]
    for tokens in summary_list
]
print(summary_tokenized_list[:10])

[[79, 319, 206, 229, 3, 725, 3, 598, 707, 3, 3, 3, 272, 261, 75, 3, 3, 3, 3, 3, 3, 119, 3, 3, 3, 793, 3, 609, 3, 68, 32, 7, 4, 3, 3, 17, 180, 3, 3, 524, 725, 3, 3, 738, 160, 3, 3, 3, 3, 3, 660, 3, 3, 3, 433], [3, 3, 3, 3, 111, 3, 137, 3, 3, 3, 3, 139, 67, 351, 68, 3, 78, 3, 3, 126, 987, 66, 903, 3, 3, 461, 554, 4, 3, 17, 344, 3, 3, 67, 344, 139, 3, 3, 3, 137, 3, 67, 139, 139, 45, 174, 71, 782, 137, 68, 3, 3, 3, 3, 3, 78, 47, 337, 3, 3, 149, 493, 3, 137, 496, 171, 461, 554, 4, 3, 111, 3, 428, 3, 3, 3, 653, 428, 51, 3, 3, 3, 194, 3, 370, 3, 3, 3, 3, 3, 3, 3, 20, 3, 78, 3, 3, 410, 289, 3, 3, 3, 428, 3, 3, 3, 3, 3, 3, 777, 3, 3, 3, 171, 544, 63, 3, 3, 3, 3, 420, 67, 3, 3, 3, 199, 3, 3, 420, 67, 4, 3, 3, 3, 67, 31, 3, 84, 3, 579, 193, 384, 3, 114, 541, 370, 156, 6, 3, 3], [3, 637, 3, 437, 116, 3, 151, 3, 3, 322, 3, 903, 765, 154, 3, 3, 3, 592, 685, 8, 210, 3, 3, 62, 3, 3, 3, 14, 3, 702, 3, 3, 3, 130, 927, 3, 151, 3, 110, 3, 29, 565, 279, 210, 3, 3, 3, 3, 3, 62, 469, 814, 3, 3, 3, 3, 3, 3, 1

In [None]:
# Report the token count of the longest summary and of the longest article.

print(len(summary_list), len(article_text_list))

# Debug aid: echo every summary that is exactly 55 tokens long.
for summary in summary_list:
  if len(summary) == 55:
    print(summary)
longestSummary = max(map(len, summary_list), default=0)

# Debug aid: echo every article that is exactly 16000 tokens long.
for article_text in article_text_list:
  if len(article_text) == 16000:
    print(article_text)
longestText = max(map(len, article_text_list), default=0)

# Prints: <longest summary length> <longest article length>
print(longestSummary, longestText)

1000 1000
58 2010


In [None]:
# Placeholder for fixing sequence lengths. No padding or truncation happens
# here yet: short_article_list is just a shallow copy of article_text_list.
# TODO: an earlier sketch padded every summary with 0 up to a length of 55;
# decide on target lengths and implement padding/truncation.
short_article_list = list(article_text_list)
print(len(article_text_list))

# Show the second article's tokens as a sanity check.
print(article_text_list[1])

1000
['hunter', 'foundation', 'donating', 'table', '10', 'secondary', 'school', 'enter', 'pupils', 'write', 'essay', 'change', 'scotland', 'return', 'win', 'table', 'president', 'lecture', 'edinburgh', 'international', 'conference', 'centre', '26', 'invitation', 'businessman', 'sir', 'tom', 'said', 'ruled', 'world', '...', 'oh', 'okay', 'scotland', '...', 'change', 'write', 'essay', 'summarises', 'school', 'ran', 'scotland', 'change', 'change', 'think', 'run', 'country', 'tell', 'school', 'win', 'table', 'dinner', 'barack', 'obama', '44th', 'president', 'united', 'states', 'address', 'philanthropy', 'business', 'leaders', 'dinner', 'school', 'chance', 'event', 'sir', 'tom', 'said', 'table', '10', 'offered', 'winning', 'pupil', 'select', 'table', 'member', 'winning', 'group', 'teacher', 'adult', 'permission', 'given', 'pupils', 'schools', 'enter', 'essays', 'maximum', 'pages', 'pupils', 'encouraged', 'creative', 'like', 'prize', 'president', 'obama', 'lecture', 'include', 'post', 'lectu

Create Dataset Class

In [None]:
import torch

import torch.nn as nn
import torch.nn.functional as F

# Based on https://towardsdatascience.com/how-to-use-datasets-and-dataloader-in-pytorch-for-custom-text-data-270eed7f7c00
from torch.utils.data import Dataset, DataLoader
import pandas as pd

class NewsDataset(Dataset):
  """Map-style dataset pairing each article with its summary.

  Args:
    text: indexable collection; ``text[idx]`` is the sequence of integer
      token ids of article ``idx``.
    summaries: indexable collection of the same length; ``summaries[idx]`` is
      the sequence of integer token ids of the matching summary.
  """

  def __init__(self, text, summaries):
    self.text = text
    self.summaries = summaries
    self.tensors = None

  # Length of dataset
  def __len__(self):
    return len(self.summaries)

  # Retrieves a specific item from the dataset
  def __getitem__(self, idx):
    """Return ``(summaryTensor, textTensor)`` for item ``idx``.

    Both are 1-D ``torch.long`` tensors of token ids. Their lengths vary
    between items unless the inputs were padded, so batching through a
    ``DataLoader`` with ``batch_size > 1`` needs padded inputs or a custom
    ``collate_fn``.
    """
    summaryTensor = torch.tensor(self.summaries[idx], dtype=torch.long)
    textTensor = torch.tensor(self.text[idx], dtype=torch.long)
    return (summaryTensor, textTensor)

# create Pandas DataFrame
#text_labels_df = pd.DataFrame({'Text': text, 'Labels': summaries})
# define data set object
#ND = NewsDataset(text_labels_df['Text'], text_labels_df['Labels'])

Define model

In [None]:
class EncoderRNN(nn.Module):
    """Encoder: an embedding layer feeding a batch-first ``nn.RNN``."""

    def __init__(self, vocab_size, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        # token ids -> vectors of width input_size
        self.embedding = nn.Embedding(vocab_size, input_size)
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)

    def forward(self, input, hidden):
        """Encode a batch of token-id sequences.

        Args:
            input: (batch_size, seq_length) tensor of token ids.
            hidden: initial hidden state, (1, batch_size, hidden_size).

        Returns:
            ``(output, hidden)`` where output is
            (batch_size, seq_length, hidden_size) and hidden is the final
            state, (1, batch_size, hidden_size).
        """
        # embedding result: (batch_size, seq_length, input_size)
        return self.rnn(self.embedding(input), hidden)

    def initHidden(self, batch_size):
        """Return an all-zero initial hidden state for ``batch_size`` sequences."""
        state_shape = (1, batch_size, self.hidden_size)
        return torch.zeros(state_shape)

In [None]:
class DecoderRNN(nn.Module):
    """Decoder: embedding -> ReLU -> batch-first ``nn.RNN`` -> linear -> log-softmax.

    The output holds log-probabilities over the vocabulary for every position
    of the input sequence.
    """

    def __init__(self, vocab_size, input_size, hidden_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(vocab_size, input_size)

        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, vocab_size)
        # Normalize over the vocabulary axis, which is the last one of the
        # (batch_size, seq_length, vocab_size) scores. dim=1 would normalize
        # across sequence positions instead.
        self.softmax = nn.LogSoftmax(dim=-1)

    # input: (batch_size, seq_length)
    def forward(self, input, hidden):
        """Decode a batch of token-id sequences.

        Args:
            input: (batch_size, seq_length) tensor of token ids.
            hidden: initial hidden state, (1, batch_size, hidden_size);
                typically the encoder's final hidden state.

        Returns:
            ``(output, hidden)`` where output is
            (batch_size, seq_length, vocab_size) log-probabilities and hidden
            is the final state, (1, batch_size, hidden_size).
        """
        embedded = self.embedding(input) # embedded: (batch_size, seq_length, input_size)

        output = F.relu(embedded)
        output, hidden = self.rnn(output, hidden) # output: (batch_size, seq_length, hidden_size)

        output = self.softmax(self.out(output))
        return output, hidden # output: (batch_size, seq_length, vocab_size)

    def initHidden(self, batch_size):
        """Return an all-zero initial hidden state for ``batch_size`` sequences."""
        return torch.zeros(1, batch_size, self.hidden_size)

In [None]:
# Smoke test: push dummy batches through a small encoder/decoder pair and
# print the resulting tensor shapes.
vocab_size = 100
global_input_size = 30
global_hidden_size = 50
batch_size = 16

encoder = EncoderRNN(vocab_size, global_input_size, global_hidden_size)
decoder = DecoderRNN(vocab_size, global_input_size, global_hidden_size)

# (batch_size, seq_length) batch of token id 1
encoder_input_tensor = torch.ones((batch_size, 80), dtype=torch.long)
encoder_out = encoder(encoder_input_tensor, encoder.initHidden(batch_size))
encoder_output, encoder_hidden = encoder_out
print(f"encoder output shape: {encoder_output.shape}")
print(f"encoder hidden shape: {encoder_hidden.shape}")
print()

# The decoder starts from the encoder's final hidden state.
decoder_input_tensor = torch.ones((batch_size, 80), dtype=torch.long)
decoder_out = decoder(decoder_input_tensor, encoder_hidden)
print(f"decoder output shape: {decoder_out[0].shape}")
print(f"decoder hidden shape: {decoder_out[1].shape}")

encoder output shape: torch.Size([16, 80, 50])
encoder hidden shape: torch.Size([1, 16, 50])

decoder output shape: torch.Size([16, 80, 100])
decoder hidden shape: torch.Size([1, 16, 50])


Create Training Loop

In [None]:
def tensorFromSentence(lang, sentence, vocab=None):
    """Convert a sentence into a column tensor of vocabulary indices.

    Args:
        lang: unused; kept so existing call sites keep working.
        sentence: either a string, which is split on single spaces, or an
            already tokenized sequence of words.
        vocab: word -> index mapping containing the '[OOV]' and '[EOS]'
            entries. Defaults to the notebook-level ``word_to_idx``.

    Returns:
        A ``torch.long`` tensor of shape (num_tokens + 1, 1): one index per
        token (the '[OOV]' index for words missing from the vocabulary)
        followed by the '[EOS]' index.
    """
    if vocab is None:
        vocab = word_to_idx

    tokens = sentence.split(' ') if isinstance(sentence, str) else list(sentence)

    oov_idx = vocab['[OOV]']
    indexes = [vocab.get(word, oov_idx) for word in tokens]
    indexes.append(vocab['[EOS]'])
    return torch.tensor(indexes, dtype=torch.long).view(-1, 1)

def tensorsFromPair(lang, pair):
    """Convert an (input sentence, target sentence) pair into two tensors.

    Both elements are converted with ``tensorFromSentence``, input first.

    Returns:
        ``(input_tensor, target_tensor)``.
    """
    source_sentence, target_sentence = pair[0], pair[1]
    return (
        tensorFromSentence(lang, source_sentence),
        tensorFromSentence(lang, target_sentence),
    )

def generate_data_indexes(data_length, train_frac=0.8, val_frac=0.2, seed=None):
    """Shuffle ``range(data_length)`` and split it into train/validation/test.

    Args:
        data_length: number of examples to index.
        train_frac: fraction of the indexes assigned to training.
        val_frac: fraction of the indexes assigned to validation.
        seed: optional seed for a reproducible shuffle. When None, the global
            ``random`` module state is used.

    Returns:
        ``(train_indexes, val_indexes, test_indexes)`` as three lists. Split
        sizes are truncated to integers, so with the default 80:20:0 split any
        rounding leftovers end up in the test list.
    """
    # Imported locally because the notebook has no top-level `import random`.
    import random

    indexes = list(range(data_length))
    if seed is None:
        random.shuffle(indexes)
    else:
        random.Random(seed).shuffle(indexes)

    train_idx = int(data_length * train_frac)
    val_idx = train_idx + int(data_length * val_frac)
    return indexes[:train_idx], indexes[train_idx:val_idx], indexes[val_idx:]

def trainIter(input_tensor, target_tensor, encoder, decoder,
      encoder_optimizer, decoder_optimizer, criterion,
      teacher_forcing_ratio=0.5, sos_idx=None, eos_idx=None):
    """Run one optimisation step on a single (input, target) sequence pair.

    Args:
        input_tensor: token ids of the source sequence; any shape, it is
            flattened (e.g. the (seq_length, 1) tensors built by
            ``tensorFromSentence``).
        target_tensor: token ids of the target sequence, flattened likewise.
        encoder, decoder: modules whose ``forward(input, hidden)`` takes a
            (batch, seq) id tensor and returns ``(output, hidden)``; the
            encoder must also offer ``initHidden(batch_size)``.
        encoder_optimizer, decoder_optimizer: optimizers for the two modules.
        criterion: loss taking (1, vocab_size) scores and a (1,) target.
        teacher_forcing_ratio: probability of feeding the ground-truth token
            (rather than the decoder's own prediction) as the next input.
        sos_idx, eos_idx: ids of the start/end tokens. Default to the
            '[SOS]' / '[EOS]' entries of the notebook-level ``word_to_idx``.

    Returns:
        The summed loss divided by the target length, as a float. Returns 0.0
        without touching the models when the target is empty.
    """
    # Imported locally because the notebook has no top-level `import random`.
    import random

    if sos_idx is None:
        sos_idx = word_to_idx['[SOS]']
    if eos_idx is None:
        eos_idx = word_to_idx['[EOS]']

    input_ids = input_tensor.reshape(-1)
    target_ids = target_tensor.reshape(-1)
    target_length = target_ids.size(0)
    if target_length == 0:
        # nothing to predict, so there is no loss to back-propagate
        return 0.0

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # ignoring encoder_output for now, will add attention later
    # encoder_hidden stores final result from encoder
    encoder_hidden = encoder.initHidden(1)
    if input_ids.numel() > 0:
        # Feed the whole sequence as a batch of one: (1, seq_length).
        _, encoder_hidden = encoder(input_ids.view(1, -1), encoder_hidden)

    decoder_hidden = encoder_hidden
    decoder_input = torch.tensor([[sos_idx]], dtype=torch.long,
                                 device=target_ids.device)
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    loss = 0
    for di in range(target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        # decoder_output is (1, 1, vocab_size); the criterion wants (N, C)
        step_scores = decoder_output.reshape(1, -1)
        loss = loss + criterion(step_scores, target_ids[di].view(1))

        if use_teacher_forcing:
            # next input is the ground-truth token
            decoder_input = target_ids[di].view(1, 1)
        else:
            # next input is the decoder's own best guess, cut from the graph
            next_token = step_scores.argmax(dim=-1).detach()
            decoder_input = next_token.view(1, 1)
            if next_token.item() == eos_idx:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

def train(encoder, decoder, pair_percent=0.1, print_every=500, learning_rate=0.01):
    """Train the encoder/decoder on a leading slice of the training pairs.

    Reads the notebook-level ``training_indexes`` (indexes into ``all_pairs``)
    and ``all_pairs`` (sequence of (input sentence, target sentence) pairs).

    Args:
        encoder, decoder: the models to optimise.
        pair_percent: fraction of ``training_indexes`` to train on.
        print_every: report the average loss after this many iterations.
        learning_rate: learning rate of both Adam optimizers.
    """
    print('Training in progress...')

    num_iters = int(pair_percent * len(training_indexes))
    # torch.optim is referenced through `torch`, which the notebook imports;
    # a bare `optim` name is never imported.
    encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)
    # tensorsFromPair takes (lang, pair); lang is not used by the conversion,
    # so None is passed.
    training_pairs = [tensorsFromPair(None, all_pairs[i])
                      for i in training_indexes[:num_iters]]
    # NOTE(review): DecoderRNN already ends in LogSoftmax, for which nn.NLLLoss
    # is the conventional pairing - confirm which loss is intended.
    criterion = nn.CrossEntropyLoss()

    total_loss_to_print = 0

    for cur_iter, (input_tensor, target_tensor) in enumerate(training_pairs):
        cur_loss = trainIter(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        total_loss_to_print += cur_loss

        if (cur_iter + 1) % print_every == 0:
            avg_loss_to_print = total_loss_to_print / print_every
            total_loss_to_print = 0
            # '%%' prints a literal percent sign; a single '%' here would be
            # parsed as the start of another conversion.
            print('%d%% done: avg_loss = %.4f' % ((cur_iter + 1) / num_iters * 100,
                                                  avg_loss_to_print))



Create Eval Loop

Create Inference Function