In [54]:
import re
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
    
import torch
from torch.utils import data

import math
from tqdm import tqdm
import time

import pandas as pd
from rouge import Rouge

In [2]:
# device = torch.device("cpu")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
SOS_token = 0
EOS_token = 1
PAD_token = 2
UNK_TOKEN = 3
CONTENT_MAX_LENGTH = 100
TITLE_MAX_LENGTH = 8

class Vocab:
    def __init__(self, name):
        self.name = name
        self.word2index = {"SOS": 0, "EOS": 1, "PAD": 2, 'UNK':3}
        self.index2word = {0: "SOS", 1: "EOS", 2: "PAD", 3: 'UNK'}
        self.n_words = 3
        self.word2count = {}
    
    def add_sentence(self, sentence):
        for word in sentence.lower().split():
            self.add_word(word)
    
    def add_word(self, word):
        if word not in self.word2index.keys():
            self.word2index[word] = self.n_words
            self.index2word[self.n_words] = word
            self.word2count[word] = 1
            self.n_words += 1
        else:
            self.word2count[word] += 1
    
    def to_json(self, file_path):
        pass
    
    def read_from_json(self, file_path):
        pass

In [4]:
train_df = pd.read_csv('../../courses/cse_842/bbc_data/train_split.csv')
test_df = pd.read_csv('../../courses/cse_842/bbc_data/test_split.csv')
train_df.shape, test_df.shape

((1977, 4), (349, 4))

In [5]:
train_df.head()

Unnamed: 0,file_path,class,title,content
0,/media/kuldeep/Work/college_stuff/courses/cse_...,entertainment,Elton plays Paris charity concert,Sir Elton John has performed at a special conc...
1,/media/kuldeep/Work/college_stuff/courses/cse_...,politics,Defiant hunts put ban to the test,Thousands of hunt supporters have been out on ...
2,/media/kuldeep/Work/college_stuff/courses/cse_...,sport,Injury doubts beset Wales squad,Wales have a clutch of injury worries before W...
3,/media/kuldeep/Work/college_stuff/courses/cse_...,business,Bombardier chief to leave company,Shares in train and plane-making giant Bombard...
4,/media/kuldeep/Work/college_stuff/courses/cse_...,entertainment,EastEnders 'is set for US remake',Plans to create a US soap based on the BBC's E...


In [6]:
def normalize_string(s):
    # add regex to remove urls and bractetted stuff.
    s = s.lower().strip()
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s


def prepare_data(
    file_path, context_max_length=CONTENT_MAX_LENGTH, 
    title_max_length=TITLE_MAX_LENGTH
):
    df = pd.read_csv(file_path)
    pairs = []
    for _, row in df.iterrows():
        pairs.append(
            [
                row['title'], row['content']
            ]
        )
    print("{} titles and content read.".format(len(pairs)))
    pairs = [[normalize_string(p[0]), normalize_string(p[1])] for p in pairs]

    return pairs

def populate_vocab(vocab, pairs):
    for ti, co in pairs:
        vocab.add_sentence(co)
    return

In [7]:
train_pairs = prepare_data('../../courses/cse_842/bbc_data/train_split.csv')
print(random.choice(train_pairs))

1977 titles and content read.
['rapper kanye west s shrewd soul', 'us hip hop star kanye west who leads the race for this year s grammys with nominations rose to prominence by producing songs for artists such as jay z and alicia keys . he then emerged from his behind the scenes role to become an artist as well as a producer . but his solo career almost ended before it began after a near fatal car crash left west with his jaw wired shut in . the resulting song through the wire became west s first uk hit in april and subsequent album the college dropout became a transatlantic success both critically and commercially . west began rapping as a teenager at his chicago school inspired by the beats and rhymes of s pioneers run dmc . hip hop producer no id encouraged west to sample old soul and r b hits then revive them with an updated sound an approach that would become his trademark . i feel like a lot of the soul that s in those old records that i sample is in me he said . so when i hear th

In [7]:
class Dataset(data.Dataset):
    'Characterizes a dataset for PyTorch'
    def __init__(self, pairs, vocab, max_len_title, max_len_content):
        'Initialization'
        self.pairs = pairs
        self.max_len_title = max_len_title
        self.max_len_content = max_len_content
        self.vocab = vocab
        self.input_content = [tensorFromSentence(self.vocab, inp[1], self.max_len_content) for inp in self.pairs]
        self.output_title = [tensorFromSentence(self.vocab, inp[0], self.max_len_title) for inp in self.pairs]

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.pairs)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        # Load data and get label
        X = self.input_content[index]
        y = self.output_title[index]

        return X, y

def indexesFromSentence(vocab, sentence, max_len):
    l = [vocab.word2index.get(word, vocab.word2index['UNK']) for word in sentence.split()]
    if len(l) > max_len - 2:
        l = l[:max_len-2]
    
    l = [0] + l + [1] 
    if len(l) < max_len:
        for i in range(len(l), max_len):
            l.append(vocab.word2index["PAD"])
    
    return l


def tensorFromSentence(vocab, sentence, max_len):
    indexes = indexesFromSentence(vocab, sentence, max_len)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

In [8]:
# class EncoderRnn(nn.Module):
#     def __init__(self, vocab_size, hidden_size, embedding_dim, num_layers,
#                 dropout=0.2):
#         super(EncoderRnn, self).__init__()
#         self.hidden_size = hidden_size
#         self.vocab_size = vocab_size
#         self.num_layers = num_layers
#         self.dropout = dropout
        
#         self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        
#         self.gru = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, bidirectional=False, num_layers=self.num_layers)
        
#         self.dropout = nn.Dropout(self.dropout)
        
#     def forward(self, inp):
# #         print(inp.shape)
#         # [seq_len, batch_size]
#         inp = self.dropout(self.embeddings(inp))
# #         print(inp.shape)
#         # [seq_len, batch_size, embed_size]
#         output, (hidden, cell) = self.gru(inp)
# #         print(output.shape, hidden.shape)
#         return output, (hidden, cell)

# h_n.view(num_layers, num_directions, batch, hidden_size)

# class DecodeRnn(nn.Module):
#     def __init__(self, vocab_size, hidden_size, embedding_dim, num_layers,
#                 dropout=0.2):
#         super(DecodeRnn, self).__init__()
        
#         self.vocab_size = vocab_size
#         self.hidden_size = hidden_size
#         self.embedding_dim = embedding_dim
#         self.num_layers = num_layers
#         self.dropout = dropout
        
#         self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
#         self.gru = nn.LSTM(embedding_dim, hidden_size,bidirectional=False, num_layers=self.num_layers)
        
#         self.linear = nn.Linear(hidden_size, vocab_size)
        
#         self.dropout = nn.Dropout(self.dropout)
    
#     def forward(self, inp, hidden, cell):
# #         print("inp shape {} and hidden shape {} is.".format(inp.shape, hidden.shape))
#         inp = inp.unsqueeze(0)
# #         print("after unsqueezing inp shape {} and hidden shape {} is.".format(inp.shape, hidden.shape))
        
#         # [1, batch_size]
#         embedded = self.dropout(self.embedding(inp))
# #         print("embbedded {} is.".format(embedded.shape))
        
#         # [1, batch_size, embedding_dim]
#         output, (hidden, cell) = self.gru(embedded, (hidden, cell))
# #         print("decoder output shape {} and hidden shape {} is.".format(output.shape, hidden.shape))
        
#         prediction = self.linear(output.squeeze(0))
# #         print("prediction shape {} is.".format(prediction.shape))
        
#         return prediction, (hidden, cell)

# # dec = DecodeRnn(output_lang.n_words, 32, 50, num_layers=2)

# # y.shape

# # dec_inp = y[0, :]
# # dec_inp.shape

# # dec_out, (dec_hidden, dec_cell) = dec(dec_inp, enc_hidden, enc_cell)

# class Seq2Seq(nn.Module):
#     def __init__(self, encoder, decoder, device):
#         super().__init__()
        
#         self.encoder = encoder
#         self.decoder = decoder
#         self.device = device
        
#         # hidden dim of encoder and decoder must be same 
#         assert encoder.hidden_size == decoder.hidden_size
    
#     def forward(self, src, trg, teacher_forcing_ratio=0.5):
#         # src = [src sent len, batch size]
#         # trg = [trg sent len, batch size]
        
#         batch_size = trg.shape[1]
#         max_len = trg.shape[0]
#         trg_vocab_size = self.decoder.vocab_size
# #         print("batch_size is {}, max_len is {}, and trg_vocab_size is {}".format(batch_size, max_len, trg_vocab_size))
        
#         # saving outputs from decode
#         outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)
# #         print("Outputs initialised shape {}".format(outputs.shape))
        
#         # getting encoder outputs
#         enc_out, (decoder_hidden, decoder_cell) = self.encoder(src)
# #         print("Encoder output {} and hidden {}".format(enc_out.shape, decoder_hidden.shape))
        
#         # first input to decoder is always the <SOS> token
#         decoder_inp = trg[0, :]
#         outputs[0,:, 0] = 0.99
        
#         for t in range(1, max_len):
# #             print()
# #             print(t)
# #             print(decoder_inp)
#             decoder_out, (decoder_hidden, decoder_cell) = self.decoder(decoder_inp, decoder_hidden, decoder_cell)
#             outputs[t] = decoder_out
#             teacher_force = random.random() < teacher_forcing_ratio
#             top1 = decoder_out.max(1)[1]
#             decoder_inp = (trg[t] if teacher_force else top1)
#         return outputs

In [9]:
class EncoderRnn(nn.Module):
    def __init__(self, vocab_size, embedding_dim, enc_hidden_state_size, dec_hidden_state_size, num_layers, dropout=0.5,
                bidirectional=True):
        super(EncoderRnn, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.enc_hidden_state_size = enc_hidden_state_size
        self.dec_hidden_state_size = dec_hidden_state_size
        self.num_layers=num_layers
        self.dropout = dropout
        self.bidirectional = bidirectional
        
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        
        self.rnn = nn.GRU(input_size=self.embedding_dim, hidden_size=self.enc_hidden_state_size,
                          num_layers=self.num_layers, dropout=self.dropout, bidirectional=self.bidirectional)
        
        self.combined_context_layer = nn.Linear(self.enc_hidden_state_size * 2, self.dec_hidden_state_size)
        
        self.dropout_layer = nn.Dropout(self.dropout)
    
    def forward(self, inp):        
        # inp = [sent_length, batch_size]

        embedded = self.dropout_layer(self.embedding(inp))
        # embedded = [sent_length, batch_size, embedding_dim]
        outputs, hidden = self.rnn(embedded)
        # outputs = [seq_len, batch, num_directions * hidden_size]
        # hidden = [num_layers*num_directions, batch, hidden_size]

        combined_context = torch.tanh(self.combined_context_layer(torch.cat((hidden[-2, : ,:], hidden[-1, :, :]), dim=1)))
        # combined_context = [batch_size, dec_hidden_state_size]
        
        return outputs, combined_context

Attention Layer

In [10]:
class AttentionLayer(nn.Module):
    def __init__(self, dec_hidden_state_size, enc_hidden_state_size):
        super(AttentionLayer, self).__init__()
        self.dec_hidden_state_size = dec_hidden_state_size
        self.enc_hidden_state_size = enc_hidden_state_size
        
        self.attn = nn.Linear((2 * self.enc_hidden_state_size) + self.dec_hidden_state_size, self.dec_hidden_state_size)
        
        self.v = nn.Parameter(torch.rand(self.dec_hidden_state_size))
        
    def forward(self, hidden, enc_outputs):
        # hidden = [batch_size, dec_hidden_state_size]
        # enc_outputs = [src_sent_len, batch_size, enc_hidden_state_size*2]
        
        batch_size = hidden.shape[0]
        src_seq_len = enc_outputs.shape[0]
        
        # calculating the energy 
        hidden = hidden.unsqueeze(1).repeat(1, src_seq_len, 1)
        enc_outputs = enc_outputs.permute(1, 0, 2)
        # hidden = [batch_size, src_seq_len, dec_hidden_state_size]
        # enc_outputs = [batch_size, src_seq_len, 2*enc_hidden_state_size]
        
        energy = torch.tanh(self.attn(torch.cat((hidden, enc_outputs), dim=2)))
        # energy = [batch_size, src_seq_len, dec_hidden_state_size]
        # v = [dec_hidden_state_size]
        
        energy = energy.permute(0, 2, 1)
        # energy = [batch_size, dec_hidden_state_size, src_seq_len]
        
        v = self.v.repeat(batch_size, 1).unsqueeze(1)
        
        attn = torch.bmm(v, energy)
        # attn = [batch_size, 1, src_seq_len]
        
        attn = attn.squeeze(1)
        # attn = [batch_size, src_seq_len]

        return F.softmax(attn, dim=1)

Decoder Unit

In [11]:
class DecoderRnn(nn.Module):
    def __init__(self, vocab_size, embedding_dim, enc_hidden_state_size, 
                 dec_hidden_state_size, num_layers=1, dropout=0.5, 
                 bidirectional=False):
        
        super(DecoderRnn, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.enc_hidden_state_size = enc_hidden_state_size
        self.dec_hidden_state_size = dec_hidden_state_size
        self.num_layers = num_layers
        self.dropout = dropout
        self.bidirectional = bidirectional
        
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        
        self.attn_layer = AttentionLayer(self.dec_hidden_state_size, self.enc_hidden_state_size)
        
        self.rnn = nn.GRU(input_size=(2 * self.enc_hidden_state_size) + self.embedding_dim, hidden_size=self.dec_hidden_state_size,
                          num_layers=self.num_layers, dropout=self.dropout, bidirectional=self.bidirectional)
        
        self.linear_layer = nn.Linear((2 * self.enc_hidden_state_size) + self.embedding_dim + self.dec_hidden_state_size, self.vocab_size)
        
        self.dropout_layer = nn.Dropout(self.dropout)
        
    def forward(self, inp, dec_hidden_state, enc_outputs):
        # inp = [batch_size]
        # dec_hidden_state = [batch_size, dec_hidden_state]
        # enc_outputs = [src_seq_len, batch_size, 2*enc_hidden_state]
        
        inp = inp.unsqueeze(0)
        embedded = self.dropout_layer(self.embedding(inp))
        # embedded = [1, batch_size, embedding_dim]
        
        attn_weights = self.attn_layer(dec_hidden_state, enc_outputs).unsqueeze(1)
        # attn_weights = [batch_size, 1, src_seq_len]
        
        enc_outputs = enc_outputs.permute(1, 0, 2)
        # enc_outputs = [batch_size, src_seq_len, 2*embedding_dim]
        
        weighted = torch.bmm(attn_weights, enc_outputs).squeeze(1).unsqueeze(0)
        # weighted = [1, batch_size, 2*embedding_dim]
        
        rnn_input = torch.cat((embedded, weighted), dim=2)
        # rnn_input = [1, batch_size, 2*enc_hidden_state_size + embedding_dim]
        
        dec_outputs, dec_hidden_state = self.rnn(rnn_input, dec_hidden_state.unsqueeze(0))
        # dec_outputs == dec_hidden_state
        
        # dec_outputs = [1, batch_size, dec_hidden_state_size]
        # dec_hidden_state = [1, batch_size, dec_hidden_state_size]
        
        linear_layer_input = torch.cat((embedded.squeeze(0), weighted.squeeze(0), dec_outputs.squeeze(0)), dim=1)
        # linear_layer_input = [batch_size, 2*enc_hidden_state_size + embedding_dim + dec_hidden_state_size]
        
        outputs = self.linear_layer(linear_layer_input)
        # outputs = [batch_size, vocab_size]
        
        dec_hidden_state = dec_hidden_state.squeeze(0)
        
        return outputs, dec_hidden_state

Seq2Seq

In [12]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, trg_sos_idx, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.trg_sos_idx = trg_sos_idx
        self.device = device
        
    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        # src = [src_seq_len, batch_size]
        # trg = [trg_seq_len, batch_size]
        
        batch_size = src.shape[1]
        trg_seq_len = trg.shape[0]
        
        # final outputs from decoder
        final_outputs = torch.zeros((trg_seq_len, batch_size, self.decoder.vocab_size)).to(self.device)
        # setting first output as sos
        final_outputs[0, :, self.trg_sos_idx] = 0.98
        
        # encoder outputs
        enc_outputs, enc_hidden = self.encoder(src)
        dec_hidden_state = enc_hidden
        # print(enc_hidden.shape)
        dec_input = trg[0, :]
        for t in range(1, trg_seq_len):
            dec_outputs, dec_hidden_state = self.decoder(dec_input, dec_hidden_state, enc_outputs)
            final_outputs[t, :, :] = dec_outputs
            
            teacher_force = random.random() < teacher_forcing_ratio
            if teacher_force:
                dec_input = trg[t, :]
            else:
                dec_input = dec_outputs.max(1)[1]
        
        return final_outputs

In [13]:
train_pairs = prepare_data('../../courses/cse_842/bbc_data/train_split.csv')
test_pairs = prepare_data('../../courses/cse_842/bbc_data/test_split.csv')

1977 titles and content read.
349 titles and content read.


In [14]:
vocab = Vocab('title_content')
populate_vocab(vocab, train_pairs)

In [15]:
EMBEDDING_DIM = 50
HIDDEN_DIM = 64
VOCAB_SIZE = vocab.n_words
NUM_LAYERS_ENCODER = 2
NUM_LAYERS_DECODER = 1

# encoder = EncoderRnn(
#     hidden_size=HIDDEN_DIM, embedding_dim=EMBEDDING_DIM, num_layers=NUM_LAYERS,
#     vocab_size=VOCAB_SIZE
# )

# decoder = DecodeRnn(
#     vocab_size=VOCAB_SIZE, hidden_size=HIDDEN_DIM, embedding_dim=EMBEDDING_DIM, num_layers=NUM_LAYERS
# )

# s2s = Seq2Seq(
#     encoder, decoder, device
# )

encoder = EncoderRnn(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    enc_hidden_state_size=HIDDEN_DIM,
    dec_hidden_state_size=HIDDEN_DIM,
    num_layers=NUM_LAYERS_ENCODER,
    dropout=0.2,
    bidirectional=True
)

decoder = DecoderRnn(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    enc_hidden_state_size=HIDDEN_DIM,
    dec_hidden_state_size=HIDDEN_DIM,
    num_layers=NUM_LAYERS_DECODER,
    dropout=0.2,
    bidirectional=False
)

model = Seq2Seq(
    encoder, decoder, 0, device
)



In [16]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 9,245,876 trainable parameters


In [17]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=2)

In [18]:
model = model.to(device)
criterion = criterion.to(device)

In [19]:
params = {'batch_size': 32,
          'shuffle': True,
#           'num_workers': 6,
          }

# Generators
training_set = Dataset(
    train_pairs, vocab=vocab, max_len_title=TITLE_MAX_LENGTH, 
    max_len_content=CONTENT_MAX_LENGTH
)
training_generator = data.DataLoader(training_set, **params)

val_set = Dataset(
    test_pairs, vocab=vocab, max_len_title=TITLE_MAX_LENGTH, 
    max_len_content=CONTENT_MAX_LENGTH
)
val_generator = data.DataLoader(val_set, **params)

In [20]:
len(training_generator), len(val_generator)

(62, 11)

In [21]:
def train(model, iterator, optimizer, criterion, clip, batch_size, device, teacher_forcing_ratio=0.25):
    model.train()
    
    epoch_loss = 0
    
    for i, batch in tqdm(enumerate(iterator)):
        
        src = batch[0].permute(1,0,2).squeeze(-1).to(device).contiguous()
        trg = batch[1].permute(1,0,2).squeeze(-1).to(device).contiguous()
        optimizer.zero_grad()
        output = model(src, trg, teacher_forcing_ratio=teacher_forcing_ratio)
        
        #trg = [trg sent len, batch size]
        #output = [trg sent len, batch size, output dim]
        output = output[1:].view(-1, output.shape[-1])
        trg = trg[1:].view(-1)
        
        #trg = [(trg sent len - 1) * batch size]
        #output = [(trg sent len - 1) * batch size, output dim]
        
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)


def evaluate(model, iterator, optimizer, criterion, clip, batch_size, device, teacher_forcing_ratio=0.2):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
    
        for i, batch in tqdm(enumerate(iterator)):
            src = batch[0].permute(1,0,2).squeeze(-1).to(device).contiguous()
            trg = batch[1].permute(1,0,2).squeeze(-1).to(device).contiguous()

            output = model(src, trg, teacher_forcing_ratio) #turn off teacher forcing
            #trg = [trg sent len, batch size]
            #output = [trg sent len, batch size, output dim]

            output = output[1:].view(-1, output.shape[-1])
            trg = trg[1:].view(-1)
            #trg = [(trg sent len - 1) * batch size]
            #output = [(trg sent len - 1) * batch size, output dim]

            loss = criterion(output, trg)
            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [22]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


In [23]:
N_EPOCHS = 50
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, training_generator, optimizer, criterion,
                       CLIP, params["batch_size"], device, teacher_forcing_ratio=0.1)
    valid_loss = evaluate(model, val_generator, optimizer, criterion,
                       CLIP, params["batch_size"], device, teacher_forcing_ratio=0.1)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

62it [00:02, 30.45it/s]
11it [00:00, 86.81it/s]


Epoch: 01 | Time: 0m 2s
	Train Loss: 7.947 | Train PPL: 2826.587
	 Val. Loss: 6.973 |  Val. PPL: 1067.667


62it [00:01, 36.57it/s]
11it [00:00, 88.95it/s]


Epoch: 02 | Time: 0m 1s
	Train Loss: 6.410 | Train PPL: 608.034
	 Val. Loss: 6.898 |  Val. PPL: 990.104


62it [00:01, 36.50it/s]
11it [00:00, 93.14it/s]


Epoch: 03 | Time: 0m 1s
	Train Loss: 6.176 | Train PPL: 481.131
	 Val. Loss: 6.914 |  Val. PPL: 1006.481


62it [00:01, 35.63it/s]
11it [00:00, 89.21it/s]


Epoch: 04 | Time: 0m 1s
	Train Loss: 6.068 | Train PPL: 431.735
	 Val. Loss: 6.977 |  Val. PPL: 1071.767


62it [00:01, 36.94it/s]
11it [00:00, 87.30it/s]


Epoch: 05 | Time: 0m 1s
	Train Loss: 5.980 | Train PPL: 395.629
	 Val. Loss: 6.997 |  Val. PPL: 1093.266


62it [00:01, 37.07it/s]
11it [00:00, 94.20it/s]


Epoch: 06 | Time: 0m 1s
	Train Loss: 5.902 | Train PPL: 365.900
	 Val. Loss: 7.066 |  Val. PPL: 1171.564


62it [00:01, 36.90it/s]
11it [00:00, 97.92it/s]


Epoch: 07 | Time: 0m 1s
	Train Loss: 5.827 | Train PPL: 339.229
	 Val. Loss: 6.976 |  Val. PPL: 1070.383


62it [00:01, 37.44it/s]
11it [00:00, 98.42it/s]


Epoch: 08 | Time: 0m 1s
	Train Loss: 5.695 | Train PPL: 297.351
	 Val. Loss: 7.010 |  Val. PPL: 1107.866


62it [00:01, 37.43it/s]
11it [00:00, 98.58it/s]


Epoch: 09 | Time: 0m 1s
	Train Loss: 5.528 | Train PPL: 251.537
	 Val. Loss: 6.951 |  Val. PPL: 1043.801


62it [00:01, 37.62it/s]
11it [00:00, 98.86it/s]


Epoch: 10 | Time: 0m 1s
	Train Loss: 5.363 | Train PPL: 213.446
	 Val. Loss: 6.999 |  Val. PPL: 1095.417


62it [00:01, 38.02it/s]
11it [00:00, 97.70it/s]


Epoch: 11 | Time: 0m 1s
	Train Loss: 5.217 | Train PPL: 184.292
	 Val. Loss: 7.017 |  Val. PPL: 1115.924


62it [00:01, 37.09it/s]
11it [00:00, 97.98it/s]


Epoch: 12 | Time: 0m 1s
	Train Loss: 5.058 | Train PPL: 157.289
	 Val. Loss: 7.005 |  Val. PPL: 1102.176


62it [00:01, 36.71it/s]
11it [00:00, 97.38it/s]


Epoch: 13 | Time: 0m 1s
	Train Loss: 4.912 | Train PPL: 135.908
	 Val. Loss: 7.035 |  Val. PPL: 1136.243


62it [00:01, 37.52it/s]
11it [00:00, 98.28it/s]


Epoch: 14 | Time: 0m 1s
	Train Loss: 4.754 | Train PPL: 116.007
	 Val. Loss: 7.047 |  Val. PPL: 1149.531


62it [00:01, 37.22it/s]
11it [00:00, 98.51it/s]


Epoch: 15 | Time: 0m 1s
	Train Loss: 4.598 | Train PPL:  99.248
	 Val. Loss: 7.070 |  Val. PPL: 1175.917


62it [00:01, 36.25it/s]
11it [00:00, 97.02it/s]


Epoch: 16 | Time: 0m 1s
	Train Loss: 4.435 | Train PPL:  84.341
	 Val. Loss: 7.065 |  Val. PPL: 1170.006


62it [00:01, 37.28it/s]
11it [00:00, 98.30it/s]


Epoch: 17 | Time: 0m 1s
	Train Loss: 4.290 | Train PPL:  72.972
	 Val. Loss: 7.092 |  Val. PPL: 1202.118


62it [00:01, 37.09it/s]
11it [00:00, 97.70it/s]


Epoch: 18 | Time: 0m 1s
	Train Loss: 4.138 | Train PPL:  62.676
	 Val. Loss: 7.113 |  Val. PPL: 1228.032


62it [00:01, 36.72it/s]
11it [00:00, 97.34it/s]


Epoch: 19 | Time: 0m 1s
	Train Loss: 3.990 | Train PPL:  54.055
	 Val. Loss: 7.131 |  Val. PPL: 1250.249


62it [00:01, 36.59it/s]
11it [00:00, 96.86it/s]


Epoch: 20 | Time: 0m 1s
	Train Loss: 3.868 | Train PPL:  47.834
	 Val. Loss: 7.162 |  Val. PPL: 1289.174


62it [00:01, 37.13it/s]
11it [00:00, 97.42it/s]


Epoch: 21 | Time: 0m 1s
	Train Loss: 3.734 | Train PPL:  41.830
	 Val. Loss: 7.140 |  Val. PPL: 1261.933


62it [00:01, 37.24it/s]
11it [00:00, 98.93it/s]


Epoch: 22 | Time: 0m 1s
	Train Loss: 3.603 | Train PPL:  36.712
	 Val. Loss: 7.180 |  Val. PPL: 1312.535


62it [00:01, 37.79it/s]
11it [00:00, 99.38it/s]


Epoch: 23 | Time: 0m 1s
	Train Loss: 3.490 | Train PPL:  32.798
	 Val. Loss: 7.172 |  Val. PPL: 1302.903


62it [00:01, 37.14it/s]
11it [00:00, 98.59it/s]


Epoch: 24 | Time: 0m 1s
	Train Loss: 3.364 | Train PPL:  28.919
	 Val. Loss: 7.183 |  Val. PPL: 1316.326


62it [00:01, 36.27it/s]
11it [00:00, 96.59it/s]


Epoch: 25 | Time: 0m 1s
	Train Loss: 3.251 | Train PPL:  25.805
	 Val. Loss: 7.238 |  Val. PPL: 1390.730


62it [00:01, 37.44it/s]
11it [00:00, 99.00it/s]


Epoch: 26 | Time: 0m 1s
	Train Loss: 3.150 | Train PPL:  23.334
	 Val. Loss: 7.254 |  Val. PPL: 1413.470


62it [00:01, 37.51it/s]
11it [00:00, 98.02it/s]


Epoch: 27 | Time: 0m 1s
	Train Loss: 3.035 | Train PPL:  20.803
	 Val. Loss: 7.272 |  Val. PPL: 1440.031


62it [00:01, 36.79it/s]
11it [00:00, 98.95it/s]


Epoch: 28 | Time: 0m 1s
	Train Loss: 2.922 | Train PPL:  18.578
	 Val. Loss: 7.306 |  Val. PPL: 1489.875


62it [00:01, 36.86it/s]
11it [00:00, 98.07it/s]


Epoch: 29 | Time: 0m 1s
	Train Loss: 2.818 | Train PPL:  16.748
	 Val. Loss: 7.308 |  Val. PPL: 1491.664


62it [00:01, 37.04it/s]
11it [00:00, 97.26it/s]


Epoch: 30 | Time: 0m 1s
	Train Loss: 2.724 | Train PPL:  15.239
	 Val. Loss: 7.332 |  Val. PPL: 1528.735


62it [00:01, 36.92it/s]
11it [00:00, 97.60it/s]


Epoch: 31 | Time: 0m 1s
	Train Loss: 2.631 | Train PPL:  13.891
	 Val. Loss: 7.341 |  Val. PPL: 1542.284


62it [00:01, 36.84it/s]
11it [00:00, 96.37it/s]


Epoch: 32 | Time: 0m 1s
	Train Loss: 2.531 | Train PPL:  12.571
	 Val. Loss: 7.351 |  Val. PPL: 1558.433


62it [00:01, 37.17it/s]
11it [00:00, 97.74it/s]


Epoch: 33 | Time: 0m 1s
	Train Loss: 2.443 | Train PPL:  11.511
	 Val. Loss: 7.375 |  Val. PPL: 1595.492


62it [00:01, 36.74it/s]
11it [00:00, 98.04it/s]


Epoch: 34 | Time: 0m 1s
	Train Loss: 2.339 | Train PPL:  10.372
	 Val. Loss: 7.378 |  Val. PPL: 1600.749


62it [00:01, 37.45it/s]
11it [00:00, 98.56it/s]


Epoch: 35 | Time: 0m 1s
	Train Loss: 2.260 | Train PPL:   9.585
	 Val. Loss: 7.426 |  Val. PPL: 1678.965


62it [00:01, 37.02it/s]
11it [00:00, 97.42it/s]


Epoch: 36 | Time: 0m 1s
	Train Loss: 2.188 | Train PPL:   8.914
	 Val. Loss: 7.440 |  Val. PPL: 1703.199


62it [00:01, 37.05it/s]
11it [00:00, 96.34it/s]


Epoch: 37 | Time: 0m 1s
	Train Loss: 2.102 | Train PPL:   8.182
	 Val. Loss: 7.444 |  Val. PPL: 1709.054


62it [00:01, 36.79it/s]
11it [00:00, 99.25it/s]


Epoch: 38 | Time: 0m 1s
	Train Loss: 2.026 | Train PPL:   7.580
	 Val. Loss: 7.493 |  Val. PPL: 1795.477


62it [00:01, 38.01it/s]
11it [00:00, 98.65it/s]


Epoch: 39 | Time: 0m 1s
	Train Loss: 1.954 | Train PPL:   7.055
	 Val. Loss: 7.488 |  Val. PPL: 1786.595


62it [00:01, 38.15it/s]
11it [00:00, 98.32it/s]


Epoch: 40 | Time: 0m 1s
	Train Loss: 1.878 | Train PPL:   6.539
	 Val. Loss: 7.562 |  Val. PPL: 1924.334


62it [00:01, 36.57it/s]
11it [00:00, 98.67it/s]


Epoch: 41 | Time: 0m 1s
	Train Loss: 1.805 | Train PPL:   6.077
	 Val. Loss: 7.575 |  Val. PPL: 1949.330


62it [00:01, 38.15it/s]
11it [00:00, 97.40it/s]


Epoch: 42 | Time: 0m 1s
	Train Loss: 1.745 | Train PPL:   5.727
	 Val. Loss: 7.591 |  Val. PPL: 1979.744


62it [00:01, 37.67it/s]
11it [00:00, 98.60it/s]


Epoch: 43 | Time: 0m 1s
	Train Loss: 1.674 | Train PPL:   5.333
	 Val. Loss: 7.633 |  Val. PPL: 2064.635


62it [00:01, 36.98it/s]
11it [00:00, 96.88it/s]


Epoch: 44 | Time: 0m 1s
	Train Loss: 1.613 | Train PPL:   5.016
	 Val. Loss: 7.615 |  Val. PPL: 2029.312


62it [00:01, 35.81it/s]
11it [00:00, 97.43it/s]


Epoch: 45 | Time: 0m 1s
	Train Loss: 1.545 | Train PPL:   4.686
	 Val. Loss: 7.639 |  Val. PPL: 2077.245


62it [00:01, 37.06it/s]
11it [00:00, 94.77it/s]


Epoch: 46 | Time: 0m 1s
	Train Loss: 1.491 | Train PPL:   4.440
	 Val. Loss: 7.681 |  Val. PPL: 2167.573


62it [00:01, 36.58it/s]
11it [00:00, 97.55it/s]


Epoch: 47 | Time: 0m 1s
	Train Loss: 1.437 | Train PPL:   4.210
	 Val. Loss: 7.695 |  Val. PPL: 2197.057


62it [00:01, 36.06it/s]
11it [00:00, 97.00it/s]


Epoch: 48 | Time: 0m 1s
	Train Loss: 1.378 | Train PPL:   3.966
	 Val. Loss: 7.673 |  Val. PPL: 2150.292


62it [00:01, 36.74it/s]
11it [00:00, 98.50it/s]


Epoch: 49 | Time: 0m 1s
	Train Loss: 1.320 | Train PPL:   3.742
	 Val. Loss: 7.694 |  Val. PPL: 2194.221


62it [00:01, 36.95it/s]
11it [00:00, 98.31it/s]

Epoch: 50 | Time: 0m 1s
	Train Loss: 1.277 | Train PPL:   3.585
	 Val. Loss: 7.781 |  Val. PPL: 2395.174





In [24]:
train_loss, math.exp(valid_loss)

(1.2768727252560277, 2395.1743135989386)

In [55]:
def strip_eos_sos(ids, vocab):
    tokens = [vocab.index2word[w] for w in ids]
    real_tokens = []
    for tok in tokens[1:]:
        if tok == 'EOS':
            break
        real_tokens.append(tok)
    
    return ' '.join(real_tokens)

In [56]:
def generate_title(content, title, vocab, model, title_max_len, content_max_len):
    model.eval()
    src = tensorFromSentence(vocab, content, content_max_len)
    trg = tensorFromSentence(vocab, title, title_max_len)
    
    out = model(src, trg, teacher_forcing_ratio=0.5)
    out = F.softmax(out, dim=2)
    predictions = out.max(2)[1].view(-1)
    pred_sentence = strip_eos_sos(predictions.tolist(), vocab)
    gt_sentence = strip_eos_sos(trg.view(-1).tolist(), vocab)
    print("       Input == {}".format(" ".join([vocab.index2word[w] for w in src.view(-1).tolist()])))
    print("Model Output == {}".format(pred_sentence))
    print("Ground Truth == {}".format(gt_sentence))
    return pred_sentence, gt_sentence

In [57]:
out = generate_title(train_pairs[10][1], train_pairs[10][0], vocab, model, title_max_len=TITLE_MAX_LENGTH, content_max_len=CONTENT_MAX_LENGTH)

       Input == SOS tony blair has told labour supporters he s back and still hungry for the job of prime minister but does that sum up the mood at the party s spring conference in gateshead ? the electorate are keener on the government than some labour party members is the dry assessment of graham lane leader of the labour group on newham council . the problem according to mr lane is not continuing divisions over iraq foundation hospitals or tuition fees or even voter apathy but mr blair himself . i have a new slogan . vote blair get brown EOS
Model Output == labour s core support ahead stock
Ground Truth == labour s core support takes stock


In [58]:
targets = []
predictions = []
for p in test_pairs:
    pred, tar = generate_title(p[1], p[0], vocab, model, title_max_len=TITLE_MAX_LENGTH, content_max_len=CONTENT_MAX_LENGTH)
    targets.append(tar)
    predictions.append(pred)
    print()

       Input == SOS terrorists might try to target the uk in the run up to the election london s most senior police officer has said . sir ian blair said terror groups would remember the effect of the madrid bomb on spain s general election last year . other potential targets were the royal wedding and the uk s presidency of the european union and g he said . he refused to say if there was specific information about the risk of a pre poll attack . no was similarly cautious but said the threat was real . the comments come EOS
Model Output == no will won t battle
Ground Truth == election could be terror target

       Input == SOS goals from gregory vignal and nacho novo gave rangers a scrappy victory at celtic park that moves them three points clear of the champions . rangers had rarely threatened until celtic goalkeeper sir douglas let defender vignal s yard drive slip through his grasp and into the net . opposite number ronald sir had been rangers hero saving superbly from craig bella

In [64]:
rouge = Rouge()

scores = rouge.get_scores(predictions, targets, avg=True)

scores