#### Import Package

In [1]:
import time
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
import random
from torchtext.data import BucketIterator
from torch.utils.data import Dataset
import spacy
import pandas as pd
from tqdm import tqdm

#### Load Dataset

In [2]:
class CNNDailyDataset(Dataset):

    def __init__(self, path, transforms, vocabs):

        self.transform_src = transforms[0]
        self.transform_trg = transforms[1]

        self.srcs = []
        self.trgs = []

        print(f"Load dataset: {path}")
        df = pd.read_csv(path)
        print(f"Transform dataset")
        for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
            self.srcs.append(self.transform_src(row['article']))
            self.trgs.append(self.transform_trg(row['highlights']))
        
        if vocabs != None:
            self.vocab_stoi = vocabs[0]
            self.vocab_itos = vocabs[1]
        else:
            self.vocab_stoi = {}
            self.vocab_itos = {}
            print(f"Build vocab table")
            self.build_vocab()

    def __len__(self):
        return len(self.srcs)
    
    def __getitem__(self, index):

        src = []
        for token in self.srcs[index]:
            if token in self.vocab_stoi:
                src.append(self.vocab_stoi[token])
            else:
                src.append(self.vocab_stoi["<unk>"])
        
        trg = []
        for token in self.trgs[index]:
            if token in self.vocab_stoi:
                trg.append(self.vocab_stoi[token])
            else:
                trg.append(self.vocab_stoi["<unk>"])

        return {'src': src, 'trg': trg}

    def build_vocab(self):
        dict = {}
        for sent in self.srcs:
            for token in sent:
                if token not in dict:
                    dict[token] = 1
                else:
                    dict[token] += 1
        for sent in self.trgs:
            for token in sent:
                if token not in dict:
                    dict[token] = 1
                else:
                    dict[token] += 1
        count = 0
        for key, value in dict.items():
            if value > 100:
                self.vocab_stoi[key] = count
                self.vocab_itos[count] = key
                count += 1
        self.vocab_stoi["<unk>"] = count
        self.vocab_itos[count] = "<unk>"
        count += 1
        self.vocab_stoi["<pad>"] = count
        self.vocab_itos[count] = "<pad>"
        
    
    def pad_batch(self, batch):
        src_tensors = [torch.tensor(emp['src']) for emp in batch]
        trg_tensors = [torch.tensor(emp['trg']) for emp in batch]
        return pad_sequence(src_tensors, padding_value=self.vocab_stoi["<pad>"]), pad_sequence(trg_tensors, padding_value=self.vocab_stoi["<pad>"])

In [3]:
spacy_en = spacy.load("en_core_web_sm")

def transform_src(text):
    tokens = spacy_en.tokenizer(text.lower())
    tokens = [tok.text for tok in tokens]
    tokens = ['<sos>'] + tokens + ['<eos>']
    return tokens

def transform_trg(text):
    tokens = spacy_en.tokenizer(text.lower())
    tokens = [tok.text for tok in tokens]
    tokens = ['<sos>'] + tokens + ['<eos>']
    return tokens

train_dataset = CNNDailyDataset(
    path="../cnn_daily_ds/train.csv",
    transforms=(transform_src, transform_trg),
    vocabs=None
)

test_dataset = CNNDailyDataset(
    path="../cnn_daily_ds/test.csv",
    transforms=(transform_src, transform_trg),
    vocabs=(train_dataset.vocab_stoi, train_dataset.vocab_itos)
)

Load dataset: ../cnn_daily_ds/train.csv
Transform dataset


100%|██████████| 50000/50000 [00:43<00:00, 1139.32it/s]


Build vocab table
Load dataset: ../cnn_daily_ds/test.csv
Transform dataset


100%|██████████| 5000/5000 [00:03<00:00, 1417.51it/s]


In [6]:
INPUT_DIM = len(train_dataset.vocab_stoi)
OUTPUT_DIM = len(train_dataset.vocab_stoi)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
LSTM_HIDDEN_DIM = 512
LSTM_NUM_LAYER = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
BATCH_SIZE = 1

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Current device: {device}")

test_iterator = BucketIterator(
    test_dataset,
    batch_size = BATCH_SIZE,
    device = device,
    sort_key=lambda x: len(x['src']),
    repeat=True,
    sort=False,
    shuffle=False,
    sort_within_batch=True
)

Current device: cpu


#### Build Model

In [8]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, n_layers, dropout, debug=False):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.debug = debug

        """
        Input a batch of examples with shape, (batch_size, sequence_length),
        embedding layer gives each token a dense embedding vector, and output
        a tensor with shape, (batch_size, sequence_length, embedding_dim).

        input_dim: dimension of one-hot encoding of each token in source language
        emb_dim: dimension of embedding of each token
        """
        self.embedding = nn.Embedding(input_dim, emb_dim)

        """
        RNN consists of two-layers LSTM. In each time step, we feed a vector
        into LSTM with its previous cell state, it outputs two tensor, 
        hidden state and cell state.

        emb_dim: dimension of each token
        hid_dim: dimension of hidden state and cell state in LSTM
        n_layers: number of LSTM which will be stacked on top of each other

        In multi-layer LSTM, in every time step, hidden state output from 
        layer(i) LSTM is the input of layer(i+1) LSTM. Hence, the argument, 
        dropout, specifies how similar the hidden state output from layer(i) LSTM 
        and the input of layer(i+1) LSTM.
        """
        self.rnn = nn.LSTM(emb_dim, hidden_dim, n_layers, dropout=dropout)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input_batch):
        if self.debug:
            print(f'(in encoder\'s forward) input_batch shape = {input_batch.shape}')
        
        input_batch = input_batch.to(device)
        embeded = self.embedding(input_batch)

        if self.debug:
            print(f'(in encoder\'s forward) embeded shape = {embeded.shape}')

        embeded = self.dropout(embeded)

        """
        outputs: hidden state of top layer lstm in all time steps
        hidden_state: final hidden state of each layer lstm
        cell state: final cell state of each layer lstm
        """
        outputs, (hidden_state, cell_state) = self.rnn(embeded)

        if self.debug:
            print(f'(in encoder\'s forward) rnn output shape:')
            print(f'\toutputs = {outputs.shape}')
            print(f'\thidden_state = {hidden_state.shape}')
            print(f'\tcell_state = {cell_state.shape}')
        
        return hidden_state, cell_state

In [9]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim, n_layers, dropout, debug=False):
        super().__init__()

        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.debug = debug
        

        """
        output_dim: dimension of one-hot encoding of each token in target language
        emb_dim: dimension of embedding of each token
        """
        self.embedding = nn.Embedding(output_dim, emb_dim)

        """
        emb_dim: dimension of each token
        hid_dim: dimension of hidden state and cell state in LSTM
        n_layers: number of LSTM which will be stacked on top of another one
        """
        self.rnn = nn.LSTM(emb_dim, hidden_dim, n_layers, dropout = dropout)

        """
        feed hidden states of top layer lstm to linear function to prediction
        next token
        """
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, input_batch, initial_hidden, initial_cell):
        if self.debug:
            print(f'(in decoder\'s forward) input_batch shape = {input_batch.shape}')
        
        input_batch = input_batch.unsqueeze(0)

        if self.debug:
            print(f'(in decoder\'s forward) input_batch (after unsqueeze) shape = {input_batch.shape}')

        input_batch = input_batch.to(device)
        embeded = self.embedding(input_batch)
        embeded = self.dropout(embeded)

        if self.debug:
            print(f'(in decoder\'s forward) embeded shape = {embeded.shape}')

        
        outputs, (hidden_state, cell_state) = self.rnn(embeded, (initial_hidden, initial_cell))

        if self.debug:
            print(f'(in decoder\'s forward) rnn output shape:')
            print(f'\toutputs = {outputs.shape}')
            print(f'\thidden_state = {hidden_state.shape}')
            print(f'\tcell_state = {cell_state.shape}')

        outputs = outputs.squeeze(0)

        if self.debug:
            print(f'(in decoder\'s forward) outputs (after squeeze) shape = {outputs.shape}')
        
        prediction = self.fc_out(outputs)

        if self.debug:
            print(f'(in decoder\'s forward) prediction shape = {prediction.shape}')
        
        return prediction, hidden_state, cell_state

In [10]:
class Seq2Seq(nn.Module):

    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        assert encoder.hidden_dim == decoder.hidden_dim, "Hidden dimension of LSTM in encoder and decoder should be equal"
        assert encoder.n_layers == decoder.n_layers, "Number of LSTM in encoder and decoder should be equal"

    def forward(self, source_batch, target_batch, teacher_forcing_ratio = 0.5):
        
        batch_size = target_batch.shape[1]
        target_length = target_batch.shape[0]
        target_vocab_size = self.decoder.output_dim

        # print(f"[seq2seq forward] source_batch shape: {source_batch.shape}")
        # print(f"[seq2seq forward] target_batch shape: {target_batch.shape}")

        # a big tensor to store decoder's output
        outputs = torch.zeros(target_length, batch_size, target_vocab_size).to(self.device)
        # print(f"[seq2seq forward] outputs shape: {outputs.shape}")

        # context vector of encoder to initialize decoder
        hidden_state, cell_state = self.encoder(source_batch)
        # print(f"[seq2seq forward] hidden_state shape: {hidden_state.shape}")
        # print(f"[seq2seq forward] cell_state shape: {cell_state.shape}")

        # first input token to decoder (<sos>)
        input = source_batch[0, :]

        for t in range(1, target_length):
            # print(f"[seq2seq forward] t: {t}")

            # print(f"[seq2seq forward] input shape: {input.shape}")
            output, hidden_state, cell_state = self.decoder(input, hidden_state, cell_state)

            # print(f"[seq2seq forward] output shape: {output.shape}")
            # print(f"[seq2seq forward] hidden_state shape: {hidden_state.shape}")
            # print(f"[seq2seq forward] cell_state shape: {cell_state.shape}")

            outputs[t] = output
            
            if random.random() < teacher_forcing_ratio:
                input = target_batch[t, :]
            else:
                input = output.argmax(1)
        
        # print(f"[seq2seq forward] outputs shape: {outputs.shape}")
        return outputs

In [11]:
enc = Encoder(
    input_dim = INPUT_DIM, 
    emb_dim = ENC_EMB_DIM,
    hidden_dim = LSTM_HIDDEN_DIM,
    n_layers = LSTM_NUM_LAYER, 
    dropout = ENC_DROPOUT)

dec = Decoder(
    output_dim = OUTPUT_DIM, 
    emb_dim = DEC_EMB_DIM, 
    hidden_dim = LSTM_HIDDEN_DIM, 
    n_layers = LSTM_NUM_LAYER, 
    dropout = DEC_DROPOUT)

model = Seq2Seq(
    encoder=enc,
    decoder=dec,
    device=device
).to(device)

In [12]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 23,865,066 trainable parameters


In [14]:
model.load_state_dict(torch.load('model.pt', map_location=torch.device('cpu')))

<All keys matched successfully>

#### Evaluate

In [15]:
target_pad_token_idx =  train_dataset.vocab_stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=target_pad_token_idx)

In [18]:
def evaluate(model, iterator, criterion):

    # change model to evaluation mode
    model.eval()

    epoch_loss = 0
    
    # because we do not update parameters in model, using torch.no_grad() will speed up execution.
    with torch.no_grad():

        iterator.create_batches()
        i = 1
        for batch in iterator.batches:

            src_batch, trg_batch = test_dataset.pad_batch(batch)

            output = model(src_batch, trg_batch, 0) # turn off teacher forcing
            # output's shape = [num_token, batch_size, output_dim]

            # slice out first token in model's output and target
            output = output[1:]
            trg_batch = trg_batch[1:]

            # reshape model's output to 2-dimension, target to 1-dimension
            output_dim = output.shape[-1]
            output = output.view(-1, output_dim)
            trg_batch = trg_batch.view(-1)
            trg_batch = trg_batch.to(device)

            # calculate loss
            loss = criterion(output, trg_batch)

            epoch_loss += loss.item()

            if (i+1)%10 == 0:
                print(f"Batch: {i+1:03}/{len(iterator)}, loss: {loss.item()}")
            i += 1

    return epoch_loss / len(iterator)

In [None]:
test_loss = evaluate(model, test_iterator, criterion)

In [None]:
print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

#### Inference

In [22]:
def evaluate_(model, iterator):

    # change model to evaluation mode
    model.eval()

    pred_lst = []
    gt_lst = []
    
    # because we do not update parameters in model, using torch.no_grad() will speed up execution.
    with torch.no_grad():

        i = 0

        iterator.create_batches()
   
        for batch in iterator.batches:

            src_batch, trg_batch = test_dataset.pad_batch(batch)

            output = model(src_batch, trg_batch, 0) # turn off teacher forcing
            # output's shape = [num_token, batch_size, output_dim]

            # slice out first token in model's output and target
            output = output[1:]
            trg_batch = trg_batch[1:]

            pred_lst.append(output)
            gt_lst.append(trg_batch)

            # reshape model's output to 2-dimension, target to 1-dimension
            # output_dim = output.shape[-1]
            # output = output.view(-1, output_dim)
            # trg_batch = trg_batch.view(-1)
            i += 1

            if i % 100 == 0:
                print(i)

    return pred_lst, gt_lst

In [23]:
pred, gt = evaluate_(model, test_iterator)

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000


In [29]:
summary_lst = []
reference_lst = []

for idx in range(len(pred)):
    pred_tokens = []
    gt_tokens = []

    pred_ = torch.nn.functional.softmax(pred[idx][:, 0, :], dim=1)
    _, pred_ = torch.max(pred_, dim=1)
    for token in pred_:
        pred_tokens.append(train_dataset.vocab_itos[int(token)])
    
    for token in gt[idx].view(-1):
        gt_tokens.append(train_dataset.vocab_itos[int(token)])
    
    summary = f"{' '.join(pred_tokens).replace('<unk>', '')}"
    summary_lst.append(summary)

    reference = f"{' '.join(gt_tokens).replace('<unk>', '')}"
    reference_lst.append(reference)

    if idx % 100 == 0:
        print(idx)

    # print(f"predict: {' '.join(pred_tokens).replace('<unk>', '')}")
    # print(f"groundtruth: {' '.join(gt_tokens).replace('<unk>', '')}")
    # print()

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900


#### BLEU & ROUGE

In [39]:
from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu

In [40]:
ROUGE = Rouge()

In [41]:
rouge_1_total = 0
rouge_2_total = 0
rouge_l_total = 0
bleu_total = 0

for sy, rf in zip(summary_lst, reference_lst):
    rouge = ROUGE.get_scores(sy, rf)
    rouge_1_total += rouge[0]['rouge-1']['r']
    rouge_2_total += rouge[0]['rouge-2']['r']
    rouge_l_total += rouge[0]['rouge-l']['r']
    bleu_total += sentence_bleu(rf, sy)

print(f"ROUGE-1: {rouge_1_total / 5000}")
print(f"ROUGE-2: {rouge_2_total / 5000}")
print(f"ROUGE-l: {rouge_l_total / 5000}")
print(f"BLEU: {bleu_total / 5000}")

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


ROUGE-1: 0.06677780079230333
ROUGE-2: 0.00700888881596862
ROUGE-l: 0.06515898534334971
BLEU: 1.0174899187054472e-231
