In [1]:
# Put the current directory first on the import path; a later cell imports
# `Encoder` and `Decoder` from a `model` module (presumably a local model.py).
import sys
sys.path.insert(0, ".") 

In [2]:
# Auto-reload imported modules before each cell executes, so edits to local
# .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Setup

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import nltk
import torchtext
from torchtext.data import Example, Field, BucketIterator, TabularDataset, Iterator
from tqdm import tqdm, tnrange, tqdm_notebook, trange
import numpy as np
from __future__ import print_function

from model import Encoder, Decoder

In [4]:
# Display the installed PyTorch version. The cells below use `Variable` and
# `loss.data[0]`, so the version matters when re-running this notebook.
torch.__version__

'0.3.0'

In [5]:
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
process = psutil.Process(os.getpid())
print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
!nvidia-smi

Gen RAM Free: 31.4 GB  | Proc size: 161.5 MB
GPU RAM Free: 12206MB | Used: 0MB | Util   0% | Total 12206MB
Wed Mar 28 08:49:36 2018       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.81                 Driver Version: 384.81                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce GTX TIT...  Off  | 00000000:03:00.0 Off |                  N/A |
|  0%   77C    P0    68W / 250W |      0MiB / 12206MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                 

In [6]:
# Root directory holding the dataset splits and model checkpoints.
# Defaults to the original location; set the CNN_DATA_PATH environment
# variable to run the notebook on another machine without editing this cell.
PATH = os.environ.get("CNN_DATA_PATH", "/diskA/jethro/cnn")

In [7]:
# Show what is available under PATH (the train/test/valid .tsv files and the
# encoder/decoder checkpoints used below should be listed here).
os.listdir(PATH)

['valid.feather',
 'test.feather',
 'train.pkl',
 'raw',
 'decoder.model',
 'train.tsv',
 'encoder.model',
 'stories.feather',
 'test.tsv',
 'valid.tsv',
 'train.feather']

In [8]:
# One shared field for both the 'input' and 'target' columns: NLTK word
# tokenisation, lower-casing, batch-first tensors. With include_lengths=True
# each batch attribute is unpacked below as a (tensor, lengths) pair.
TEXT = Field(
    tokenize=nltk.word_tokenize,
    use_vocab=True,
    lower=True,
    include_lengths=True,
    batch_first=True,
)

In [9]:
# Truthy value => later cells call .cuda() on the models and on every batch.
# Set to 0 to skip those calls.
USE_CUDA = 1

In [10]:
# Training split: two TSV columns, both processed by the shared TEXT field.
train_data = TabularDataset(
    path=f'{PATH}/train.tsv',
    format='tsv',
    fields=[('input', TEXT), ('target', TEXT)],
)

In [11]:
# Test split: two TSV columns, both processed by the shared TEXT field.
test_data = TabularDataset(
    path=f'{PATH}/test.tsv',
    format='tsv',
    fields=[('input', TEXT), ('target', TEXT)],
)

In [12]:
# Validation split: two TSV columns, both processed by the shared TEXT field.
valid_data = TabularDataset(
    path=f'{PATH}/valid.tsv',
    format='tsv',
    fields=[('input', TEXT), ('target', TEXT)],
)

In [13]:
# Build one vocabulary from all three splits, passing min_freq=2 as the
# minimum token frequency.
# NOTE(review): the vocabulary is built from test and validation text as well
# as training text. Restricting it to train_data would change len(TEXT.vocab)
# and presumably make the saved checkpoints incompatible - confirm before changing.
TEXT.build_vocab(train_data, test_data, valid_data, min_freq=2)

tqdm.write("Vocabulary size: {}".format(len(TEXT.vocab)))

Vocabulary size: 83092


In [14]:
BATCH_SIZE = 16


def _input_length(example):
    """Sort key for bucketing: length of the example's `input` field."""
    return len(example.input)


def _make_loader(dataset, batch_size):
    """Build a shuffled, non-repeating BucketIterator sorted within each batch."""
    return BucketIterator(
        dataset,
        batch_size=batch_size,
        device=None,
        sort_key=_input_length,
        sort_within_batch=True,
        repeat=False,
        shuffle=True,
    )


# Training uses mini-batches; test and validation go one example at a time.
train_loader = _make_loader(train_data, BATCH_SIZE)
test_loader = _make_loader(test_data, 1)
valid_loader = _make_loader(valid_data, 1)

# May be slightly less due to skipping empty stories
tqdm.write("Number of training stories: {}".format(len(train_data)))
tqdm.write("Number of testing stories: {}".format(len(test_data)))
tqdm.write("Number of validation stories: {}".format(len(valid_data)))

Number of training stories: 73972
Number of testing stories: 14794
Number of validation stories: 3699


In [15]:
# Model / optimisation hyper-parameters used by the cells below.
HIDDEN = 100                  # passed to Encoder; Decoder receives HIDDEN*2
EMBED = 300                   # passed to both Encoder and Decoder
VOCAB_SIZE = len(TEXT.vocab)  # must be set after TEXT.build_vocab has run
LR = 0.001                    # learning rate for both Adam optimisers

In [16]:
# The decoder gets twice the encoder's hidden size - presumably to match the
# concatenated forward/backward states of the bidirectional encoder
# (TODO confirm against model.py).
encoder = Encoder(VOCAB_SIZE, EMBED, HIDDEN, bidirec=True)
decoder = Decoder(VOCAB_SIZE, EMBED, HIDDEN * 2)

In [17]:
# Move the models to the GPU (wrapping them in DataParallel when several
# devices are visible), then make the decoder share the encoder's embedding.
if USE_CUDA:
    tqdm.write("Using CUDA")
    if torch.cuda.device_count() > 1:
        print("Using %d devices" % (torch.cuda.device_count()))
        encoder = nn.DataParallel(encoder)
        decoder = nn.DataParallel(decoder)
    encoder = encoder.cuda()
    decoder = decoder.cuda()

# DataParallel keeps the real model under `.module` and does not forward
# attribute access, so `encoder.embedding` would raise AttributeError on the
# wrapper and assigning to `decoder.embedding` would not reach the real decoder.
# Tie the embeddings on the underlying modules instead.
_encoder_core = encoder.module if isinstance(encoder, nn.DataParallel) else encoder
_decoder_core = decoder.module if isinstance(decoder, nn.DataParallel) else decoder
_decoder_core.embedding = _encoder_core.embedding

Using CUDA


In [18]:
# Cross-entropy over the vocabulary; positions whose target is the <pad>
# index are excluded from the loss.
loss_function = nn.CrossEntropyLoss(ignore_index=TEXT.vocab.stoi['<pad>'])
# One Adam optimiser per model, both with learning rate LR.
# NOTE(review): an earlier cell sets decoder.embedding = encoder.embedding. If
# `embedding` is a registered submodule, its parameters are in both optimisers
# and get stepped twice per batch - confirm whether that is intended.
enc_optim = optim.Adam(encoder.parameters(),lr=LR)
dec_optim = optim.Adam(decoder.parameters(),lr=LR)

In [19]:
# Checkpoint locations (state dicts) read by load_models() and written by
# save_models().
ENCODER_MODEL_PATH = f'{PATH}/encoder.model'
DECODER_MODEL_PATH = f'{PATH}/decoder.model'

def load_models():
    """Load the saved state dicts into the global encoder and decoder."""
    checkpoints = (
        (encoder, ENCODER_MODEL_PATH),
        (decoder, DECODER_MODEL_PATH),
    )
    for model, checkpoint_path in checkpoints:
        model.load_state_dict(torch.load(checkpoint_path))
    
def save_models():
    """Write the global encoder and decoder state dicts to their checkpoint paths."""
    checkpoints = (
        (encoder, ENCODER_MODEL_PATH),
        (decoder, DECODER_MODEL_PATH),
    )
    for model, checkpoint_path in checkpoints:
        torch.save(model.state_dict(), checkpoint_path)

In [20]:
def train(train_loader):
    """Run one training epoch over `train_loader` and report loss statistics.

    For each batch the global `encoder` and `decoder` are run, the loss from
    `loss_function` is back-propagated, and both `enc_optim` and `dec_optim`
    take a step. The mean and sample variance of the per-batch losses are
    written with `tqdm.write` when the epoch finishes.

    Args:
        train_loader: iterable of batches whose `.input` unpacks to
            (inputs, lengths) and whose `.target` unpacks to (targets, _).

    Returns:
        None.
    """
    # .train() switches the mode in place, so the module-level names do not
    # need to be rebound through `global`.
    encoder.train()
    decoder.train()

    # Loop-invariant: index used to seed the decoder for every sequence.
    start_idx = TEXT.vocab.stoi['<s>']

    total_loss, total_squared_loss, num_batches = 0.0, 0.0, 0
    for batch in tqdm_notebook(train_loader, desc="Training Batches"):
        inputs, lengths = batch.input
        targets, _ = batch.target
        # One start token per sequence in the batch, shaped (batch, 1).
        decoding_start = Variable(torch.LongTensor([start_idx] * targets.size(0))).unsqueeze(1)
        if USE_CUDA:
            inputs = inputs.cuda()
            targets = targets.cuda()
            decoding_start = decoding_start.cuda()

        encoder.zero_grad()
        decoder.zero_grad()
        output, hidden = encoder(inputs, lengths.tolist())
        score = decoder(decoding_start, hidden, targets.size(1), output, lengths)

        loss = loss_function(score, targets.view(-1))
        batch_loss = loss.data[0]
        total_loss += batch_loss
        total_squared_loss += batch_loss ** 2
        num_batches += 1
        loss.backward()
        enc_optim.step()
        dec_optim.step()

    # An empty loader would otherwise raise ZeroDivisionError below.
    if num_batches == 0:
        tqdm.write("Training: no batches were processed")
        return

    loss_mean = total_loss / num_batches
    if num_batches > 1:
        # Sample variance from running sums; clamp tiny negative values that
        # can come from floating-point cancellation.
        loss_variance = max(
            0.0,
            (total_squared_loss - (total_loss ** 2 / num_batches)) / (num_batches - 1),
        )
    else:
        # Sample variance is undefined for a single observation.
        loss_variance = float("nan")
    tqdm.write("Training: loss mean: %7.4f, loss variance: %7.4f" % (loss_mean, loss_variance))

In [21]:
def calculate_validation_loss(valid_loader):
    """Compute and report loss statistics over `valid_loader` without training.

    The global `encoder` and `decoder` are switched to eval mode and run on
    every batch; no backward pass or optimiser step is performed. The mean
    and sample variance of the per-batch losses are written with `tqdm.write`.

    Args:
        valid_loader: iterable of batches whose `.input` unpacks to
            (inputs, lengths) and whose `.target` unpacks to (targets, _).

    Returns:
        None.
    """
    # .eval() switches the mode in place, so the module-level names do not
    # need to be rebound through `global`.
    encoder.eval()
    decoder.eval()

    # Loop-invariant: index used to seed the decoder for every sequence.
    start_idx = TEXT.vocab.stoi['<s>']

    total_loss, total_squared_loss, num_batches = 0.0, 0.0, 0
    for batch in tqdm_notebook(valid_loader, desc="Validation Batches"):
        inputs, lengths = batch.input
        targets, _ = batch.target
        # One start token per sequence in the batch, shaped (batch, 1).
        decoding_start = Variable(torch.LongTensor([start_idx] * targets.size(0))).unsqueeze(1)
        if USE_CUDA:
            inputs = inputs.cuda()
            targets = targets.cuda()
            decoding_start = decoding_start.cuda()

        # No zero_grad() here: nothing is back-propagated during validation.
        output, hidden = encoder(inputs, lengths.tolist())
        score = decoder(decoding_start, hidden, targets.size(1), output, lengths)

        loss = loss_function(score, targets.view(-1))
        batch_loss = loss.data[0]
        total_loss += batch_loss
        total_squared_loss += batch_loss ** 2
        num_batches += 1

    # An empty loader would otherwise raise ZeroDivisionError below.
    if num_batches == 0:
        tqdm.write("Validation: no batches were processed")
        return

    loss_mean = total_loss / num_batches
    if num_batches > 1:
        # Sample variance from running sums; clamp tiny negative values that
        # can come from floating-point cancellation.
        loss_variance = max(
            0.0,
            (total_squared_loss - (total_loss ** 2 / num_batches)) / (num_batches - 1),
        )
    else:
        # Sample variance is undefined for a single observation.
        loss_variance = float("nan")
    tqdm.write("Validation: loss mean: %7.4f, loss variance: %7.4f" % (loss_mean, loss_variance))

In [22]:
# Resume from the encoder/decoder checkpoints under PATH. Both checkpoint
# files must already exist for torch.load to succeed.
load_models()

In [25]:
# Train for up to NUM_EPOCHS epochs, reporting validation loss after each one.
NUM_EPOCHS = 100
for epoch_idx in tnrange(NUM_EPOCHS, desc="Epochs", unit="epoch"):
    train(train_loader)
    calculate_validation_loss(valid_loader)
    # Checkpoints are overwritten every epoch, whether or not the validation
    # loss improved.
    save_models()

Training: loss mean:  3.4017, loss variance:  0.0783


Validation: loss mean:  8.6416, loss variance:  7.0048


Training: loss mean:  3.2993, loss variance:  0.0672


Validation: loss mean:  8.7326, loss variance:  7.4038





KeyboardInterrupt: 

# Evaluating the model (ROUGE)

In [23]:
from rouge import ROUGE
from __future__ import print_function
# Scorer shared by show_selection_of_output(); its score(reference, generated)
# result is read there under the 'rouge-1' and 'rouge-2' keys.
rouge = ROUGE()

def get_string(summary):
    """Turn a sequence of vocabulary indices into a space-separated string.

    Indices 0 and 1 (<unk> and <pad>) are dropped, as is any index that is
    not below the vocabulary size. Every kept token is followed by one space,
    so a non-empty result ends with a trailing space.
    """
    vocab_tokens = TEXT.vocab.itos
    vocab_size = len(vocab_tokens)
    pieces = [
        vocab_tokens[token_id] + " "
        for token_id in summary
        if token_id not in (0, 1) and token_id < vocab_size
    ]
    return "".join(pieces)

def show_selection_of_output(encoder, decoder, loader, num_to_show = 5, num_to_calculate = 100):
    """Print sample summaries and the mean ROUGE-1/ROUGE-2 over a few batches.

    Only the first example of each batch is decoded and scored. The generated
    summary is the arg-max token at every decoder output position.

    Args:
        encoder: encoder model; switched to eval mode.
        decoder: decoder model; switched to eval mode.
        loader: iterable of batches whose `.input` unpacks to (inputs, lengths)
            and whose `.target` unpacks to (targets, _).
        num_to_show: how many of the first examples to print in full.
        num_to_calculate: maximum number of batches to score.

    Returns:
        None. The mean scores are printed.
    """
    metric_names = ("rouge-1", "rouge-2")
    stat_names = ("recall", "precision")
    total_rouge_score = {metric: {stat: 0.0 for stat in stat_names}
                         for metric in metric_names}
    encoder = encoder.eval()
    decoder = decoder.eval()
    start_idx = TEXT.vocab.stoi['<s>']

    num_scored = 0
    for i, batch in enumerate(loader):
        if i == num_to_calculate:
            break
        inputs, lengths = batch.input
        targets, _ = batch.target
        decoding_start = Variable(torch.LongTensor([start_idx] * targets.size(0))).unsqueeze(1)
        if USE_CUDA:
            inputs = inputs.cuda()
            targets = targets.cuda()
            decoding_start = decoding_start.cuda()

        output, hidden = encoder(inputs, lengths.tolist())
        score = decoder(decoding_start, hidden, targets.size(1), output, lengths)

        reference_article = inputs.data.cpu().numpy()[0]
        reference_summary = targets.data.cpu().numpy()[0]
        generated_summary = [np.argmax(word) for word in score.data.cpu().numpy()[0]]

        reference_article = get_string(reference_article)
        reference = get_string(reference_summary)
        generated = get_string(generated_summary)

        rouge_score = rouge.score(reference, generated)
        for metric in metric_names:
            for stat in stat_names:
                total_rouge_score[metric][stat] += rouge_score[metric][stat]
        num_scored += 1

        if i < num_to_show:
            print("\nReference article:\n{}".format(reference_article))
            print("\nReference summary:\n{}".format(reference))
            print("\nGenerated summary:\n{}".format(generated))
            print("\nROUGE score: {}\n".format(rouge_score))

    # Average over the examples actually scored. Dividing by num_to_show (as
    # before) inflated the mean whenever more examples were scored than shown.
    if num_scored:
        for metric in metric_names:
            for stat in stat_names:
                total_rouge_score[metric][stat] /= num_scored
    print("Mean ROUGE score: {}\n".format(total_rouge_score))

In [24]:
# Print sample summaries and mean ROUGE on the test split, using the default
# num_to_show=5 and num_to_calculate=100.
show_selection_of_output(encoder, decoder, test_loader)


Reference article:
`` cairo seven coptic egyptians living abroad were sentenced to death wednesday by a court in cairo for their connection to an inflammatory anti-islam film , the prosecutor 's office said . the suspects are accused of being involved with the production of the film in california , said adel al saeed , official spokesman for the prosecutor 's office . since the egyptian citizens were tried in absentia , the sentence would be applied only if they returned to egypt . protests against the `` '' innocence of muslims `` '' film erupted in september in many muslim countries , including egypt '' 

Reference summary:
prosecutor : 7 coptic egyptians are sentenced to death over involvement in an anti-islam film 

Generated summary:


ROUGE score: {'rouge-1': {'recall': 0.0, 'precision': 0.0}, 'rouge-2': {'recall': 0.0, 'precision': 0.0}}


Reference article:
washington the wall street journal reported friday that office rents in washington are poised to overtake rents in new yo