# Using short QA pairs, we can achieve surprising results on the Cornell Dataset.

## Import train/val sets

In [None]:
import nltk 
# No resource id is passed, so NLTK starts its interactive downloader (the
# cell output shows it fetching the package index). Which NLTK resources the
# rest of the notebook needs is not visible from this cell.
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


In [1]:
from process_cornell import ENCODING
from process_cornell import TRAIN_PATH
from process_cornell import VAL_PATH
from utils import load_data

# Load the training split first, then the validation split, both decoded with
# the shared ENCODING constant from process_cornell.
train, val = (load_data(ENCODING, path) for path in (TRAIN_PATH, VAL_PATH))

## Create the vocabulary

In [2]:
from collections import Counter
from itertools import chain
from vocab import Vocab


# Upper bound on how many distinct tokens are registered in the vocabulary.
max_vocab_size = 20000

# Flatten every training question and answer into one token stream.
tokens = [
    token
    for question, answer in train
    for token in chain(question, answer)
]

# Rank tokens by frequency and keep only the `max_vocab_size` most frequent.
counts = Counter(tokens)
most_common = [entry[0] for entry in counts.most_common(max_vocab_size)]

# Register the kept tokens in descending-frequency order.
vocab = Vocab()
for frequent_token in most_common:
    vocab.add_token(frequent_token)

## Convert strings to label encoded sequences

In [3]:
# Apply the same framing and encoding to both splits. Each token list is
# extended in place (EOS after the question; SOS before and EOS after the
# answer) and the pair is then replaced by its label-encoded form.
for dataset in (train, val):
    for idx, (question, answer) in enumerate(dataset):
        question.append(Vocab.EOS_TOKEN)
        answer.append(Vocab.EOS_TOKEN)
        answer.insert(0, Vocab.SOS_TOKEN)
        dataset[idx] = (
            vocab.label_encode(question),
            vocab.label_encode(answer),
        )

## Train Model

In [4]:
import time
import random

import numpy as np
import torch.cuda
from torch.optim import Adam

from models import NCM
from vocab import Vocab
from train import get_loss


# Seed every RNG this cell can draw from. Seeding only CUDA left the batch
# sampling below (`random.choice`) and any CPU-side weight initialisation
# unseeded, so two runs of the notebook were not repeatable.
seed = 42
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Report averaged losses every `print_iters` iterations; run `iters` in total.
print_iters = 100
iters = 15000

# Batch and model dimensions passed to NCM below.
batch_size = 64
hidden_size = 256
embedding_size = 32
num_layers = 2


start_time = time.time()

model = NCM(len(vocab), embedding_size, hidden_size, num_layers).cuda()
optimizer = Adam(model.parameters())
# Snapshot of the weights before any optimisation step.
torch.save(model.state_dict(), 'chat.init')

# One entry per iteration; the plotting cell reads these histories.
train_losses = []
val_losses = []

iter_start_time = time.time()
for i in range(1, iters + 1):
    # Batches are drawn with replacement from the encoded pairs.
    train_batch = [random.choice(train) for _ in range(batch_size)]
    val_batch = [random.choice(val) for _ in range(batch_size)]

    train_loss = get_loss(model, train_batch)
    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()

    # Validation loss is measured after the update and never backpropagated.
    val_loss = get_loss(model, val_batch, inference_only=True)

    # `.item()` yields a plain Python float. Indexing a scalar loss with
    # `.data[0]` is deprecated since PyTorch 0.4 and rejected by later
    # releases, and it kept tensors (not numbers) in the history lists.
    # Assumes get_loss returns a single-element loss tensor.
    train_losses.append(train_loss.item())
    val_losses.append(val_loss.item())

    if i % print_iters == 0:
        iter_end_time = time.time()

        # Mean over the most recent reporting window.
        avg_train_loss = sum(train_losses[-print_iters:]) / print_iters
        avg_val_loss = sum(val_losses[-print_iters:]) / print_iters

        # Samples drawn so far, expressed as a fraction of the training set
        # size (approximate, since sampling is with replacement).
        epoch = (batch_size * i) / len(train)

        string = 'epoch: {}, iters: {}, train loss: {:.2f}, val loss: {:.2f}, time: {:.2f} s'
        print(string.format(epoch, i, avg_train_loss, avg_val_loss, iter_end_time - iter_start_time))

        iter_start_time = time.time()

    # Hard-coded checkpoint; the original note marks this iteration as the
    # validation-loss minimum (presumably observed in an earlier run).
    if i == 2000:  # val minimum
        torch.save(model.state_dict(), 'chat.min')

torch.save(model.state_dict(), 'chat.final')

end_time = time.time()
seconds_per_hour = 60.**2
print('\nTotal time: {:.2f} hours\n'.format((end_time - start_time) / seconds_per_hour))

  result[i] = self.softmax(linear_output[i])
  questions = Variable(torch.LongTensor(questions), volatile=inference_only)
  answers = Variable(torch.LongTensor(answers), volatile=inference_only)


KeyboardInterrupt: 

## Plot losses

In [58]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook

# Render bokeh output inline in the notebook.
output_notebook()

# NOTE(review): the y axis is labelled 'perplexity' but the series plotted are
# the per-iteration values returned by get_loss — confirm the label matches.
p = figure(y_axis_label='perplexity', x_axis_label='iters')

# Size each x range from the recorded history rather than from `iters`: when
# training is stopped early the histories are shorter than `iters`, and
# mismatched x/y lengths make the line glyph invalid. For a complete run the
# two are the same length, so the plot is unchanged.
p.line(range(len(train_losses)), train_losses, legend='train')
p.line(range(len(val_losses)), val_losses, legend='val', color='orange')
show(p)