In [1]:
import sys
sys.path.insert(0, ".") 

In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Setup

In [21]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import nltk
import torchtext
from torchtext.data import Example, Field,BucketIterator, TabularDataset
from tqdm import tqdm, tnrange, tqdm_notebook, trange
import numpy as np
from __future__ import print_function

from model import Encoder, Decoder

In [4]:
torch.__version__

'0.3.0'

In [5]:
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
process = psutil.Process(os.getpid())
print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
!nvidia-smi

Gen RAM Free: 28.7 GB  | Proc size: 162.7 MB
GPU RAM Free: 12206MB | Used: 0MB | Util   0% | Total 12206MB
Mon Mar 26 14:29:53 2018       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.81                 Driver Version: 384.81                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce GTX TIT...  Off  | 00000000:03:00.0 Off |                  N/A |
|  0%   47C    P0    58W / 250W |      0MiB / 12206MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                 

In [6]:
PATH="/diskA/jethro/cnn"

In [7]:
TEXT = Field(tokenize=nltk.word_tokenize,use_vocab=True,lower=True, include_lengths=True, batch_first=True)

In [8]:
MINI_BATCH_SIZE = 10

In [9]:
USE_CUDA = 1

In [11]:
import pandas as pd

In [12]:
df = pd.read_feather(f'{PATH}/stories.feather')

In [13]:
df.head()

Unnamed: 0,story,summary
0,"Fort Hood , Texas The shooting at Fort Hood w...",NEW : Pfc. Lance Aviles says he was ordered to...
1,WikiLeaks founder Julian Assange is trying to ...,Julian Assange is seeking to avoid extradition...
2,The news that Scotland has rejected independen...,Hammond : UK allies will breathe sigh of relie...
3,BP plans to continue using a controversial sub...,"EPA says it "" will continue to work over the ..."
4,Barcelona may be licking their wounds after a ...,Barcelona 's Eric Abidal is given the all clea...


In [14]:
from sklearn.model_selection import train_test_split

In [15]:
train, test = train_test_split(df, test_size=0.2)

In [16]:
len(train), len(test)

(74063, 18516)

In [77]:
class CNNDataset(torchtext.data.Dataset):
    def __init__(self, path, text_field, label_field, dfs, **kwargs):
        fields = [("text", text_field), ("label", label_field)]
        examples = []
        num_examples = dfs[path].values[:,1].shape[0]
        for i in range(num_examples):
            text = dfs[path]["story"].iloc[i]
            label = dfs[path]["summary"].iloc[i]
            examples.append(Example.fromlist([text, label], fields))
        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(ex): return len(ex.text)
    
    @classmethod
    def splits(cls, text_field, label_field, train, validation=None, test=None, **kwargs):
        dfs = {'train': train}
        if validation is not None:
            dfs['validation'] = validation
            has_validation = 'validation'
        else:
            has_validation = None
        if test is not None:
            dfs['test'] = test
            has_test = 'test'
        else:
            has_test = None
                
        return super().splits('',
            text_field=text_field, label_field=label_field,
                              train='train', validation=has_validation, test=has_test,  dfs=dfs, **kwargs)

In [84]:
TEXT = Field(tokenize=nltk.word_tokenize,use_vocab=True,lower=True, include_lengths=True, batch_first=True)

In [85]:
TEXT = Field()

pytrain, pytest = CNNDataset.splits(text_field=TEXT,
                                    label_field=TEXT,
                                    train=train,
                                    test=test)

In [87]:
TEXT.build_vocab(pytrain, pytest, min_freq=2)

tqdm.write("Vocabulary size: {}".format(len(TEXT.vocab)))

Vocabulary size: 233358


In [88]:
BATCH_SIZE  = 1000

In [92]:
train_iter = BucketIterator(pytrain, 
                            batch_size = MINI_BATCH_SIZE,
                            device=None,
                            sort_key=lambda x: len(x.label),
                            sort_within_batch=True,
                            repeat=False,
                            shuffle=True)

In [94]:
HIDDEN = 100
EMBED = 50
VOCAB_SIZE = len(TEXT.vocab)
LR = 0.001

In [95]:
encoder = Encoder(VOCAB_SIZE,EMBED,HIDDEN,bidirec=True)
decoder = Decoder(VOCAB_SIZE,EMBED,HIDDEN*2)

In [96]:
if USE_CUDA:
    tqdm.write("Using CUDA")
    if torch.cuda.device_count() > 1:
        print("Using %d devices" % (torch.cuda.device_count()))
        encoder = nn.DataParallel(encoder)
        decoder = nn.DataParallel(decoder)
    encoder = encoder.cuda()
    decoder = decoder.cuda()
decoder.embedding = encoder.embedding

Using CUDA


In [97]:
loss_function = nn.CrossEntropyLoss(ignore_index=TEXT.vocab.stoi['<pad>'])
enc_optim = optim.Adam(encoder.parameters(),lr=LR)
dec_optim = optim.Adam(decoder.parameters(),lr=LR)

In [98]:
NUM_EPOCHS = 10

In [102]:
for epoch_idx in trange(NUM_EPOCHS, desc = "Epochs", unit = "epoch"):
    total_loss, total_squared_loss, num_batches = 0.0, 0.0, 0
    for batch in tqdm(train_iter, desc = "Batches", unit = "batch"):
        inputs, lengths = batch.text
        targets, _ = batch.label
        decoding_start = Variable(torch.LongTensor([TEXT.vocab.stoi['<s>']]*targets.size(0))).unsqueeze(1)
        if USE_CUDA:
            inputs = inputs.cuda()
            targets = targets.cuda()
            decoding_start = decoding_start.cuda()

        encoder.zero_grad()
        decoder.zero_grad()
        output,hidden = encoder(inputs,lengths.tolist())
        score = decoder(decoding_start,hidden,targets.size(1),output, len(inputs))

        loss = loss_function(score,targets.view(-1))
        total_loss += loss.data[0]
        total_squared_loss += loss.data[0]**2
        num_batches += 1
        loss.backward()
        enc_optim.step()
        dec_optim.step()
    loss_mean = total_loss / num_batches
    loss_variance = (total_squared_loss - (total_loss**2 / num_batches)) / (num_batches - 1)
    tqdm.write("loss mean: %7.4f, loss variance: %7.4f" % (loss_mean, loss_variance))


Epochs:   0%|          | 0/10 [00:00<?, ?epoch/s]
Batches:   0%|          | 0/7407 [00:00<?, ?batch/s][A
[A


ValueError: too many values to unpack (expected 2)

# Saving the model

In [21]:
torch.save(encoder.state_dict(), "./encoder.model")
torch.save(decoder.state_dict(), "./decoder.model")

In [22]:
encoder.load_state_dict(torch.load("./encoder.model"))
decoder.load_state_dict(torch.load("./decoder.model"))