In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import os
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from torch.utils.data import Dataset
from torch.optim import lr_scheduler
import itertools
import glob
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import os
import io
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
BATCH_SIZE = 32
SOS_token = 0
EOS_token = 1
PAD_token = 2
UNK_token = 3
EMB_DIM = 300
LR_RATE = 0.001

__Preprocess Data__

In [3]:
class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS", 2: "<pad>", 3: "<unk>"}
        self.n_words = 4  # Count SOS, EOS, pad and unk

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [4]:
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

In [5]:
MAX_LENGTH = 100

# eng_prefixes = (
#     "i am ", "i m ",
#     "he is", "he s ",
#     "she is", "she s",
#     "you are", "you re ",
#     "we are", "we re ",
#     "they are", "they re "
# )


def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH #and \
#         p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [6]:
def readLangs(dataset, lang1, lang2):
    chinese = os.getcwd()+'/iwslt-zh-en/{}.tok.{}'.format(dataset, lang1)
    english = os.getcwd()+'/iwslt-zh-en/{}.tok.{}'.format(dataset, lang2)

    chinese_lines = open(chinese, encoding='utf-8').read().strip().split('\n')
    english_lines = open(english, encoding='utf-8').read().strip().split('\n')
    length = len(chinese_lines)

    pairs = [[chinese_lines[i], normalizeString(english_lines[i])] for i in range(length)]
    pairs = filterPairs(pairs)
    
    input_lang = Lang(lang1)
    output_lang = Lang(lang2)

    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])

    return input_lang, output_lang, pairs

In [7]:
train_input_lang, train_output_lang, train_pairs = readLangs('train', 'zh', 'en')
val_input_lang, val_output_lang, val_pairs = readLangs('dev', 'zh', 'en')
test_input_lang, test_output_lang, test_pairs = readLangs('test', 'zh', 'en')

__Embedding__

In [8]:
def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    for index, line in enumerate(fin):
        if index > 50000:
            break
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = [float(i) for i in tokens[1:]]
    return data

fname_zh = '/Users/will/Desktop/courses/DS1011NLP/NYU/project/zh/zh.vec'
fname_eng = '/Users/will/Desktop/courses/DS1011NLP/NYU/project/zh/fasttext300d.vec'
f_zh = load_vectors(fname_zh)
f_eng = load_vectors(fname_eng)

In [9]:
'''
Create Chinese weight matrix
'''
emb_dim = EMB_DIM
unique_token_zh= train_input_lang.word2index.keys()
matrix_len_zh = len(unique_token_zh)
weight_zh = np.zeros((matrix_len_zh, emb_dim))
words_zh_f = 0
for i, word in enumerate(unique_token_zh):
    if word in f_zh.keys():
        weight_zh[i] = f_zh[word]
        words_zh_f += 1
    else:
        weight_zh[i] = np.zeros(shape=(emb_dim, ))

In [10]:
'''
Create English weight matrix
'''
unique_token_eng= train_output_lang.word2index.keys()
matrix_len_eng = len(unique_token_eng)
weight_eng = np.zeros((matrix_len_eng, emb_dim))
words_eng_f = 0
for i, word in enumerate(unique_token_eng):
    if word in f_eng.keys():
        weight_eng[i] = f_eng[word]
        words_eng_f += 1
    else:
        weight_eng[i] = np.zeros(shape=(emb_dim, ))

__Data Loader__

In [11]:
class NMTDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """

    def __init__(self, input_lang, output_lang, pairs):
        """
        @param data_list_1: list of sentence 1 tokens 
        @param data_list_2: list of sentence 2 tokens
        @param target_list: list of review targets 

        """
        self.input_lang = input_lang
        self.output_lang = output_lang
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        input_sentence = self.pairs[key][0]
        input_indexes = [self.input_lang.word2index[word] for word in input_sentence.split(' ')]
        input_indexes.append(EOS_token)
        input_length = len(input_indexes)

        output_sentence = self.pairs[key][1]
        output_indexes = [self.output_lang.word2index[word] for word in output_sentence.split(' ')]
        output_indexes.append(EOS_token)
        output_length = len(output_indexes)
        return [input_indexes, input_length, output_indexes, output_length]

    
def NMTDataset_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    input_ls = []
    output_ls = []
    input_length_ls = []
    output_length_ls = []
    
    for datum in batch:
        input_length_ls.append(datum[1])
        output_length_ls.append(datum[3])
    
    #find max length in each batch
    max_input = sorted(input_length_ls)[-1]
    max_output = sorted(output_length_ls)[-1]
#     print(output_length_ls)
    # padding
    for datum in batch:
        padded_vec_input = np.pad(np.array(datum[0]), 
                                  pad_width=((0,max_input-datum[1])), 
                                  mode="constant", constant_values=2)
        padded_vec_output = np.pad(np.array(datum[2]), 
                                   pad_width=((0,max_output-datum[3])), 
                                   mode="constant", constant_values=2)
       
        input_ls.append(padded_vec_input)
        output_ls.append(padded_vec_output)
    return [torch.tensor(torch.from_numpy(np.array(input_ls)), device=device), 
            torch.tensor(input_length_ls, device=device), 
            torch.tensor(torch.from_numpy(np.array(output_ls)), device=device), 
            torch.tensor(output_length_ls, device=device)]

In [12]:
# create pytorch dataloader
batch_size = BATCH_SIZE
train_dataset = NMTDataset(train_input_lang, train_output_lang, train_pairs)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=batch_size,
                                           collate_fn=NMTDataset_collate_func,
                                           shuffle=True)

val_dataset = NMTDataset(val_input_lang, val_output_lang, val_pairs)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                         batch_size=batch_size,
                                         collate_fn=NMTDataset_collate_func,
                                         shuffle=False)

In [13]:
def create_emb_layer(weights_matrix, non_trainable=False):
    num_embeddings, embedding_dim = weights_matrix.shape
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.weight.data.copy_(torch.from_numpy(weights_matrix))
    emb_layer = emb_layer.to(device)
    return emb_layer, num_embeddings, embedding_dim

__Encoder__

In [14]:
class EncoderRNN(nn.Module):
    def __init__(self, weights_matrix, hidden_size, batch_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
#         self.input_size = input_size
        self.batch_size = batch_size
        self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix, True)
        self.embedding.weight.requires_grad = False
        self.gru = nn.GRU(embedding_dim, hidden_size, batch_first=True, bidirectional = False)

    def forward(self, inputs):
        inputs = inputs.unsqueeze(1)
        self.hidden = self.initHidden(batch_size)
        embedded = self.embedding(inputs) 
        output, hidden = self.gru(embedded, self.hidden)
        return output, hidden

    def initHidden(self, batch_size):
        self.hidden = torch.randn(1, batch_size, self.hidden_size, device = device)
        return self.hidden


__Decoder Without Attention__

In [19]:

class DecoderRNN(nn.Module):
    def __init__(self, weights_matrix, hidden_size, batch_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
#         self.input_size = input_size
        self.batch_size = batch_size
        self.embedding, self.num_embeddings, self.embedding_dim = create_emb_layer(weights_matrix, True)
        self.embedding.weight.requires_grad = False
        self.gru = nn.GRU(self.embedding_dim, hidden_size, batch_first=True, bidirectional = False)
        self.out = nn.Linear(hidden_size, self.num_embeddings)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, inputs, hidden, encoder_puts):
        seq_len = len(inputs)
        output = self.embedding(inputs).view(self.batch_size, 1, self.embedding_dim)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output.squeeze(1)))
        weights = 0
        return output, hidden, weights

#     def initHidden(self):
#         return torch.zeros(1, batch_size, self.hidden_size, device=device)

In [20]:
# class DecoderRNN(nn.Module):
#     def __init__(self, hidden_size, output_size):
#         super(DecoderRNN, self).__init__()
#         self.hidden_size = hidden_size

#         self.embedding = nn.Embedding(output_size, hidden_size)
#         self.gru = nn.GRU(hidden_size, hidden_size)
#         self.out = nn.Linear(hidden_size, output_size)
#         self.softmax = nn.LogSoftmax(dim=1)

#     def forward(self, input, hidden):
#         output = self.embedding(input).view(1, 1, -1)
#         output = F.relu(output)
#         output, hidden = self.gru(output, hidden)
#         output = self.softmax(self.out(output[0]))
#         return output, hidden

#     def initHidden(self):
#         return torch.zeros(1, 1, self.hidden_size, device=device)

In [21]:
d_out

NameError: name 'd_out' is not defined

__Decoder With Attention__

In [22]:
class BahdanauAttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, n_layers=1, dropout_p=0.1):
        super(BahdanauAttnDecoderRNN, self).__init__()
        
        # Define parameters
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout_p = dropout_p
        
        # Define layers
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        self.attn = nn.Linear(hidden_size, hidden_size)
        self.gru = nn.GRU(hidden_size * 2, hidden_size, n_layers, dropout=dropout_p)
        self.out = nn.Linear(hidden_size, output_size)
    
    def forward(self, word_input, last_hidden, encoder_outputs):
      
        word_embedded = self.embedding(word_input).view(1, 1, -1) # S=1 x B x N
        word_embedded = self.dropout(word_embedded)

        attn_weights = self.attn(last_hidden[-1], encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
        
        rnn_input = torch.cat((word_embedded, context), 2)
        output, hidden = self.gru(rnn_input, last_hidden)
        # Final output layer
        output = output.squeeze(0) # B x N
        output = F.log_softmax(self.out(output))
        
        # Return final output, hidden state, and attention weights (for visualization)
        return output, hidden, attn_weights

__Training__

In [31]:

def train(batch_size, inputs, target, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    
    encoder_hidden = encoder.initHidden(batch_size)
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    sentence_length = inputs.size()[1]
    t_sentence_length = target.size()[1]
    print(inputs.size(), target.size())
    encoder_outputs = torch.zeros([batch_size, sentence_length, 256], device=device)
    
    loss = 0
    
    for i in range(sentence_length):
        output, encoder_hidden = encoder(inputs[:, i])
        encoder_outputs[:, i, :] = output[0, 0]

    decoder_input = torch.tensor(BATCH_SIZE * [[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
#     if use_teacher_forcing:
    for i in range(t_sentence_length):
        d_out, d_hidden, d_weights = decoder(decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(d_out.squeeze(1), target[:, i])
        decoder_input = target[:, i]

#     else:
#          for i in range(t_sentence_length):
#             decoder_output, decoder_hidden, decoder_attention = decoder(
#                 decoder_input, decoder_hidden, encoder_outputs)
#             topv, topi = decoder_output.topk(1)
#             decoder_input = topi.squeeze().detach()  # detach from history as input
#             loss += criterion(decoder_output, target[i])
#             if decoder_input.item() == EOS_token:
#                 break
    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / t_sentence_length

In [32]:
teacher_forcing_ratio = 0.5
criterion = nn.NLLLoss()
encoder = EncoderRNN(weight_zh, 256, BATCH_SIZE)
decoder = DecoderRNN(weight_eng, 256, BATCH_SIZE)
loss = 0
encoder_optimizer = optim.SGD(encoder.parameters(), lr=LR_RATE)
decoder_optimizer = optim.SGD(decoder.parameters(), lr=LR_RATE)
for i, (inputs, i_l, target, t_l) in enumerate(train_loader):
    loss = train(32, inputs, target, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
    print(loss)

torch.Size([32, 77]) torch.Size([32, 53])
10.80436850493809
torch.Size([32, 83]) torch.Size([32, 58])
10.736134496228448
torch.Size([32, 93]) torch.Size([32, 67])
10.693172625641324
torch.Size([32, 71]) torch.Size([32, 55])
10.675522682883523
torch.Size([32, 61]) torch.Size([32, 51])
10.624921013327207
torch.Size([32, 57]) torch.Size([32, 40])
10.663507843017578
torch.Size([32, 54]) torch.Size([32, 40])
10.650848388671875
torch.Size([32, 63]) torch.Size([32, 60])
10.557437133789062
torch.Size([32, 64]) torch.Size([32, 58])
10.521724306303879
torch.Size([32, 92]) torch.Size([32, 78])
10.474610157502003
torch.Size([32, 97]) torch.Size([32, 62])
10.478365005985383
torch.Size([32, 65]) torch.Size([32, 50])
10.467740478515625
torch.Size([32, 78]) torch.Size([32, 73])
10.369771879013271
torch.Size([32, 99]) torch.Size([32, 81])
10.275218068817516
torch.Size([32, 93]) torch.Size([32, 77])
10.208250367796266
torch.Size([32, 64]) torch.Size([32, 50])
10.29320068359375
torch.Size([32, 56]) torch

KeyboardInterrupt: 

In [None]:
# teacher_forcing_ratio = 0.5

# def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
#     encoder_hidden = encoder.initHidden()

#     encoder_optimizer.zero_grad()
#     decoder_optimizer.zero_grad()

#     input_length = input_tensor.size()[1]
#     target_length = target_tensor.size()[1]

#     encoder_outputs = torch.zeros(batch_size, input_length, encoder.hidden_size, device=device)

#     loss = 0

#     for i in range(input_length):
#         encoder_output, encoder_hidden = encoder(
#             input_tensor[:, i])
#         encoder_outputs[:, i, :] = encoder_output[0, 0]

#     decoder_input = torch.tensor([[SOS_token]], device=device)

#     decoder_hidden = encoder_hidden

#     use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

#     if use_teacher_forcing:
#         # Teacher forcing: Feed the target as the next input
#         for di in range(target_length):
#             decoder_output, decoder_hidden, decoder_attention = decoder(
#                 decoder_input, decoder_hidden, encoder_outputs)
#             loss += criterion(decoder_output, target_tensor[di])
#             decoder_input = target_tensor[di]  # Teacher forcing

#     else:
#         # Without teacher forcing: use its own predictions as the next input
#         for di in range(target_length):
#             decoder_output, decoder_hidden, decoder_attention = decoder(
#                 decoder_input, decoder_hidden, encoder_outputs)
#             topv, topi = decoder_output.topk(1)
#             decoder_input = topi.squeeze().detach()  # detach from history as input

#             loss += criterion(decoder_output, target_tensor[di])
#             if decoder_input.item() == EOS_token:
#                 break

#     loss.backward()

#     encoder_optimizer.step()
#     decoder_optimizer.step()

#     return loss.item() / target_length

In [None]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [None]:
def trainIters(data_loader, encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
#     training_pairs = [tensorsFromPair(random.choice(pairs))
#                       for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        for i, (inputs, inputs_length, targets, targets_length) in enumerate(data_loader):
#         training_pair = training_pairs[iter - 1]
#         input_tensor = training_pair[0]
#         target_tensor = training_pair[1]
            input_tensor = inputs
            target_tensor = targets
            loss = train(input_tensor, target_tensor, encoder,
                         decoder, encoder_optimizer, decoder_optimizer, criterion)
            print_loss_total += loss
            plot_loss_total += loss

            if iter % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                             iter, iter / n_iters * 100, print_loss_avg))

            if iter % plot_every == 0:
                plot_loss_avg = plot_loss_total / plot_every
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0

#     showPlot(plot_losses)

In [None]:
def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [None]:
hidden_size = 256
encoder1 = EncoderRNN(train_input_lang.n_words, hidden_size).to(device)
# attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)
#attn_decoder1 = BahdanauAttnDecoderRNN(hidden_size, output_lang.n_words, n_layers=1, dropout_p=0.1).to(device)
decoder1 = DecoderRNN(hidden_size, train_output_lang.n_words).to(device)
##UNCOMMENT TO TRAIN THE MODEL
# trainIters(encoder1, attn_decoder1, 75000, print_every=5000)
trainIters(train_loader, encoder1, decoder1, n_iters = 10, print_every=1000, plot_every=100, learning_rate=0.01)
# encoder1.load_state_dict(torch.load("encoder.pth"))
# attn_decoder1.load_state_dict(torch.load("attn_decoder.pth"))

In [None]:
# torch.save(encoder1.state_dict(), "encoder.pth")
# torch.save(attn_decoder1.state_dict(), "attn_decoder.pth")

In [None]:
evaluateRandomly(encoder1, noattn_decoder1)