In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# (Re)load the autoreload extension; mode 2 re-imports edited modules before
# each cell executes, so changes to the project's .py files are picked up
# without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math
import numpy as np
from nltk.translate.bleu_score import sentence_bleu
from allennlp.modules.elmo import Elmo

from encoder import EncoderRNN
from decoder import LuongAttnDecoderRNN
from movie_line_process import loadLines, loadConversations, extractSentencePairs
from voc import loadPrepareData, trimRareWords, normalizeString, makeVoc
from voc import MIN_COUNT, MAX_INPUT_LENGTH, MAX_OUTPUT_LENGTH, PAD_token, SOS_token, EOS_token
from prepare_data import indexesFromSentence, batch2TrainData
from train import trainIters
from model_config import model_name, attn_model, hidden_size
from model_config import encoder_n_layers, decoder_n_layers, dropout, batch_size, embedding_size
from model_config import device, loadFilename, checkpoint_iter
from model_config import save_dir, corpus_name, use_glove, use_elmo
from train_config import clip, learning_rate, decoder_learning_ratio, n_iteration
from train_config import print_every, save_every, evaluate_every
from evaluate import GreedySearchDecoder, evaluateInput, dev_evaluate

from squad_loader import prepare_par_pairs, prepare_sent_pairs
from squad_loader import prepare_ans_sent_pairs
from squad_loader import process_file

from glove_loader import make_weights

In [None]:
# corpus_name is imported from model_config; the override below is kept for reference.
#corpus_name = "squad"
# Directory for this corpus: "data/<corpus_name>".
corpus = os.path.join("data", corpus_name)

def printLines(file, n=10):
    """Print the first ``n`` lines of ``file`` as raw bytes.

    The file is opened in binary mode, so each line is printed as a ``bytes``
    repr (e.g. ``b'text\\n'``) including its line terminator.

    Args:
        file: Path of the file to preview.
        n: Number of leading lines to print (default 10). ``None`` prints the
            whole file; a negative value keeps list-slice semantics
            (all but the last ``abs(n)`` lines).
    """
    with open(file, 'rb') as datafile:
        if n is None or n >= 0:
            # Read only the requested head instead of loading the whole file.
            head = list(itertools.islice(datafile, n))
        else:
            # islice rejects negative stops; fall back to slicing all lines.
            head = datafile.readlines()[:n]
    for line in head:
        print(line)

#printLines(os.path.join(corpus, "train-v2.0.json"))

In [None]:
# Define path to new file
datafile = os.path.join(corpus, "formatted_train_squad_qa.txt")

delimiter = '\t'
# Unescape the delimiter
delimiter = str(codecs.decode(delimiter, "unicode_escape"))

# Write new csv file
print("\nWriting newly formatted file...")
data = process_file("train-v2.0.json")
# newline='' lets the csv writer control line endings itself (as the csv
# module documentation requires), so the '\n\n' terminator is written verbatim
# on every platform instead of being newline-translated by the text layer.
with open(datafile, 'w', encoding='utf-8', newline='') as outputfile:
    # Tab-separated rows; the '\n\n' terminator leaves a blank line after each row.
    writer = csv.writer(outputfile, delimiter=delimiter, lineterminator='\n\n')
    pairs = prepare_ans_sent_pairs(data)
    writer.writerows(pairs)

# Print a sample of lines
print("\nSample lines from file:")
printLines(datafile)

In [None]:
# Load/Assemble voc and pairs
pairs = loadPrepareData(corpus, corpus_name, datafile, save_dir)
voc = makeVoc(corpus_name)
# Print some pairs to validate.
# Kept as real comments: a bare string literal as the last expression of a
# cell is echoed by Jupyter as that cell's output.
# print("\npairs:")
# for pair in pairs[:10]:
#     print(pair)
# print(pairs[-1])

In [None]:
#print(voc.num_words)
# Trim voc using the MIN_COUNT threshold imported from voc
# (presumably drops words seen fewer than MIN_COUNT times — confirm in voc.py).
# NOTE(review): this mutates voc in place; check that re-running the cell is harmless.
#pairs = trimRareWords(voc, pairs, MIN_COUNT)
voc.trim(MIN_COUNT)
#print(voc.index2word[14274])

In [None]:
# Example for validation: build one small random batch and inspect its tensors.
small_batch_size = 5
test_pairs = [random.choice(pairs) for _ in range(small_batch_size)]
batches = batch2TrainData(voc, test_pairs)
input_variable, lengths, target_variable, mask, max_target_len, answer_mask = batches
print(input_variable.size())
# Extra debugging output (uncomment as needed):
# print("input_variable:", input_variable)
# print("answer mask:", answer_mask)
# print("lengths:", lengths)
# print("target_variable:", target_variable)
# print("mask:", mask)
# print("max_target_len:", max_target_len)
# print(input_variable.size())
input_variable = input_variable.to(device)
print(input_variable)
# `embedding` is only created in the model-building cell further down, so on a
# fresh kernel (Restart & Run All) it does not exist yet. Run the lookup only
# when it is available instead of failing with a NameError.
if "embedding" in globals():
    print(embedding(input_variable))
else:
    print("Skipping embedding lookup: `embedding` has not been built yet.")

In [None]:
# Load model if a loadFilename is provided
if loadFilename:
    # map_location=device remaps the saved tensors onto the configured device,
    # so a checkpoint written on a GPU machine also loads on a CPU-only one
    # (and vice versa) without editing this cell.
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    checkpoint = torch.load(loadFilename, map_location=device)
    # Saved state: encoder/decoder weights, their optimizer states, the
    # embedding weights and the vocabulary's attribute dict.
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    # Replace the freshly built vocabulary's attributes with the saved ones so
    # the word indices match the loaded weights.
    voc.__dict__ = checkpoint['voc_dict']


print('Building encoder and decoder ...')

# Word embeddings: start from a fresh lookup table sized to the vocabulary,
# then either restore saved weights, fill it with GloVe vectors, or swap it
# for an ELMo module, depending on the configuration.
embedding = nn.Embedding(voc.num_words, embedding_size)
if loadFilename:
    print("Loading model...")
    embedding.load_state_dict(embedding_sd)
elif use_glove:
    # NOTE(review): the GloVe dimension is hard-coded to 300 here — confirm it
    # matches embedding_size from model_config.
    embedding.weight.data = torch.Tensor(make_weights(300, "data/glove/glove.6B.300d.txt", voc))
elif use_elmo:
    print("Make ELMO embeddings...")
    options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
    weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
    embedding = Elmo(options_file, weight_file, 1, dropout=0)
    # Overrides the configured size for the ELMo module built above.
    embedding_size = 1024

# Encoder and decoder share the same embedding module.
encoder = EncoderRNN(embedding_size, hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(
    attn_model, embedding, embedding_size, hidden_size,
    voc.num_words, decoder_n_layers, dropout,
)
# Restore trained weights when resuming from a checkpoint.
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)

# Move both networks to the configured device.
encoder = encoder.to(device)
decoder = decoder.to(device)

# Greedy decoding wrapper around the encoder/decoder pair.
searcher = GreedySearchDecoder(encoder, decoder, voc)

print('Models built and ready to go!')

In [None]:
# Prepare development set: parse the dev file and build its pairs
dev_data = process_file("dev-v2.0.json")
dev_pairs = prepare_ans_sent_pairs(dev_data)
print(len(dev_pairs))
# Keep only the first 500 pairs.
# NOTE(review): the truncation happens before the shuffle below, so the subset
# is always the same leading 500 pairs, merely reordered — confirm this is
# intended rather than "shuffle, then take 500".
dev_pairs = dev_pairs[:500]

# In-place shuffle with a fixed seed (512) so the order is reproducible
random.Random(512).shuffle(dev_pairs)


#dev_evaluate(encoder, decoder, dev_pairs, searcher, voc)

In [None]:
# Switch both networks to training mode so dropout layers are active
encoder.train()
decoder.train()

# One Adam optimizer per network; the decoder's learning rate is the base
# rate scaled by decoder_learning_ratio
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(
    decoder.parameters(),
    lr=learning_rate * decoder_learning_ratio,
)
# When resuming from a checkpoint, restore the saved optimizer states too
if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# Run training iterations
print("Starting Training!")
trainIters(
    model_name, voc, pairs, dev_pairs,
    encoder, decoder, encoder_optimizer, decoder_optimizer,
    embedding, encoder_n_layers, decoder_n_layers,
    save_dir, n_iteration, batch_size,
    print_every, save_every, evaluate_every,
    clip, corpus_name, loadFilename, searcher,
)

In [None]:
# Set dropout layers to eval mode
encoder.eval()
decoder.eval()

# Begin chatting with the trained model.
# NOTE(review): evaluateInput presumably waits for typed input, which would
# stall a Restart & Run All — comment the call out for unattended runs.
evaluateInput(encoder, decoder, searcher, voc)

In [None]:
# Remove names matching "GreedySearchDecoder" from the interactive namespace.
# NOTE(review): looks like leftover debugging — without -f this magic asks for
# confirmation, and the name must be re-imported before the model-building
# cell can run again. Confirm it is still needed.
%reset_selective GreedySearchDecoder

In [None]:
# Sanity check of sentence-level BLEU on a toy example: text1 is the
# reference, text2 the (shorter) hypothesis.
text1 = "what river originally bounded the duchy"
text2 = "what river"

# Tokenize both sentences on single spaces
input1, input2 = text1.split(' '), text2.split(' ')

# weights=[1] restricts the score to unigram precision
print(sentence_bleu([input1], input2, weights=[1]))

In [None]:
# Spot-check which word the vocabulary maps index 10489 to.
# NOTE(review): 10489 is a hard-coded debugging index; the lookup presumably
# fails if the (trimmed) vocabulary has no such index — confirm or remove.
print(voc.index2word[10489])