In [1]:
from __future__ import unicode_literals, print_function, division

import glob
import itertools
import os
import random
import re
import string
import unicodedata
from io import open

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset

%matplotlib inline
plt.switch_backend('agg')
import matplotlib.ticker as ticker

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#### Preprocess Data

In [2]:
# Reserved vocabulary indices shared by every Lang instance:
# start-of-sentence, end-of-sentence, padding and unknown-word markers.
SOS_token, EOS_token, PAD_token, UNK_token = 0, 1, 2, 3

class Lang:
    """
    Vocabulary of a single language.

    Keeps three lookup tables in sync: word -> index, word -> occurrence
    count, and index -> word. Indices 0-3 are pre-assigned to the special
    markers "SOS", "EOS", "<pad>" and "<unk>", so the first real word
    receives index 4.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        # Special markers occupy the first four slots of the vocabulary.
        self.index2word = dict(enumerate(["SOS", "EOS", "<pad>", "<unk>"]))
        self.n_words = len(self.index2word)  # Count SOS, EOS, pad and unk

    def addSentence(self, sentence):
        """Register every token of `sentence`, tokenised on single spaces."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Bump the count of a known word, or assign the next free index to a new one."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.index2word[new_index] = word
        self.word2count[word] = 1
        self.n_words = new_index + 1

In [3]:
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """
    Strip combining marks (accents) from a Unicode string.

    The string is decomposed with NFD normalisation and every character in
    Unicode category 'Mn' is dropped. Characters that have no decomposition
    are returned unchanged, so the result is not guaranteed to be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars)

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    """
    Lowercase a sentence, strip accents, and keep only letters and `.!?`.

    Steps:
      1. lowercase, trim, and remove combining marks via `unicodeToAscii`;
      2. put a space in front of each `.`, `!` or `?` so punctuation becomes
         its own token;
      3. collapse every run of other non-letter characters into one space;
      4. trim again, because step 3 can leave a space at either end (for
         example when the sentence starts or ends with a digit). Without
         this, splitting on ' ' produces empty-string tokens.

    Returns the normalised string.
    """
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s.strip()

In [4]:
# Upper bound (exclusive) on the number of space-separated tokens allowed
# on either side of a sentence pair.
MAX_LENGTH = 9999

# An extra filter on English sentence prefixes
# (p[1].startswith(eng_prefixes)) used to be sketched here; it is disabled.


def filterPair(p):
    """
    Return True when both sentences of pair `p` are shorter than MAX_LENGTH.

    `p[0]` and `p[1]` are strings; their length is the number of pieces
    obtained by splitting on a single space.
    """
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    return len(p[1].split(' ')) < MAX_LENGTH


def filterPairs(pairs):
    """Return a new list with only the pairs accepted by `filterPair`, in order."""
    return list(filter(filterPair, pairs))

In [5]:
def readLangs(dataset, lang1, lang2):
    """
    Load one split of the parallel corpus and build its vocabularies.

    Reads `<cwd>/iwslt-zh-en/<dataset>.tok.<lang1>` (source side) and
    `<cwd>/iwslt-zh-en/<dataset>.tok.<lang2>` (target side), one sentence per
    line. Line i of the source file is paired with line i of the target file.
    Only the target sentence goes through `normalizeString`; the source
    sentence is kept as read.

    @param dataset: split name used in the file names (e.g. 'train')
    @param lang1: file extension / name of the source language
    @param lang2: file extension / name of the target language
    @return: (input_lang, output_lang, pairs), where pairs is a list of
             [source_sentence, normalised_target_sentence]
    @raise ValueError: if the two files have a different number of lines,
                       since the sentences could then no longer be aligned
    """
    data_dir = os.path.join(os.getcwd(), 'iwslt-zh-en')
    source_path = os.path.join(data_dir, '{}.tok.{}'.format(dataset, lang1))
    target_path = os.path.join(data_dir, '{}.tok.{}'.format(dataset, lang2))

    # Context managers make sure the file handles are closed right away.
    with open(source_path, encoding='utf-8') as source_file:
        source_lines = source_file.read().strip().split('\n')
    with open(target_path, encoding='utf-8') as target_file:
        target_lines = target_file.read().strip().split('\n')

    if len(source_lines) != len(target_lines):
        raise ValueError(
            'Parallel files are not aligned: {} has {} lines but {} has {}'.format(
                source_path, len(source_lines), target_path, len(target_lines)))

    pairs = [[source, normalizeString(target)]
             for source, target in zip(source_lines, target_lines)]
    pairs = filterPairs(pairs)

    input_lang = Lang(lang1)
    output_lang = Lang(lang2)

    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])

    return input_lang, output_lang, pairs

In [6]:
# Load the three splits of the zh -> en corpus.
# NOTE(review): readLangs creates fresh Lang objects on every call, so each
# split gets its own vocabulary and the same word can map to different
# indices in train/dev/test -- confirm this is intended before evaluating.
train_input_lang, train_output_lang, train_pairs = readLangs(dataset='train', lang1='zh', lang2='en')
val_input_lang, val_output_lang, val_pairs = readLangs(dataset='dev', lang1='zh', lang2='en')
test_input_lang, test_output_lang, test_pairs = readLangs(dataset='test', lang1='zh', lang2='en')

#### Data Loader

In [7]:
class NMTDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """

    def __init__(self, input_lang, output_lang, pairs):
        """
        @param input_lang: vocabulary object exposing `word2index` for the source side
        @param output_lang: vocabulary object exposing `word2index` for the target side
        @param pairs: list of [source_sentence, target_sentence] strings
        """
        self.input_lang = input_lang
        self.output_lang = output_lang
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    @staticmethod
    def _encode(lang, sentence):
        """
        Convert a sentence into a list of vocabulary indices ending in EOS_token.

        Tokens are obtained by splitting on single spaces, as in
        Lang.addSentence. Words missing from `lang.word2index` map to
        UNK_token instead of raising KeyError, so a dataset can be encoded
        with a vocabulary that was built on different data.
        """
        indexes = [lang.word2index.get(word, UNK_token) for word in sentence.split(' ')]
        indexes.append(EOS_token)
        return indexes

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]

        @return: [input_indexes, input_length, output_indexes, output_length],
                 where both lengths include the trailing EOS_token
        """
        pair = self.pairs[key]
        input_indexes = self._encode(self.input_lang, pair[0])
        output_indexes = self._encode(self.output_lang, pair[1])
        return [input_indexes, len(input_indexes), output_indexes, len(output_indexes)]

    
def NMTDataset_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length

    @param batch: list of items shaped like NMTDataset.__getitem__ output, i.e.
                  [input_indexes, input_length, output_indexes, output_length]
    @return: list of four tensors created on `device`:
             padded inputs (batch, max_input_len), input lengths (batch,),
             padded outputs (batch, max_output_len), output lengths (batch,).
             Index tensors are int64; padding uses PAD_token.
    """
    input_length_ls = [datum[1] for datum in batch]
    output_length_ls = [datum[3] for datum in batch]

    # find max length in each batch
    max_input = max(input_length_ls)
    max_output = max(output_length_ls)

    # Right-pad every sequence with PAD_token up to the batch maximum.
    input_ls = [list(datum[0]) + [PAD_token] * (max_input - datum[1])
                for datum in batch]
    output_ls = [list(datum[2]) + [PAD_token] * (max_output - datum[3])
                 for datum in batch]

    # Build the tensors straight from Python lists: wrapping an existing
    # tensor in torch.tensor() triggers a copy-construct UserWarning, and
    # going through numpy makes the integer dtype platform dependent.
    return [torch.tensor(input_ls, dtype=torch.long, device=device), 
            torch.tensor(input_length_ls, device=device), 
            torch.tensor(output_ls, dtype=torch.long, device=device), 
            torch.tensor(output_length_ls, device=device)]

In [16]:
# Wrap the train and validation pairs in datasets, then in shuffling
# DataLoaders that pad each batch through NMTDataset_collate_func.
batch_size = 1

train_dataset = NMTDataset(train_input_lang, train_output_lang, train_pairs)
val_dataset = NMTDataset(val_input_lang, val_output_lang, val_pairs)

train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True,
                                           collate_fn=NMTDataset_collate_func)
val_loader = torch.utils.data.DataLoader(val_dataset,
                                         batch_size=batch_size,
                                         shuffle=True,
                                         collate_fn=NMTDataset_collate_func)

In [12]:
len(train_pairs)

213376

In [13]:
len(test_pairs)

1397

In [14]:
len(val_pairs)

1261

In [15]:
train_pairs[0]

['深海 海中 的 生命   大卫   盖罗 ', 'life in the deep oceans']

In [9]:
# Sanity check: print only the first batch produced by the training loader.
for first_batch in itertools.islice(train_loader, 1):
    print(first_batch)


[tensor([[   17,   195,  6773,  ...,     2,     2,     2],
        [  335,    77,    78,  ...,     2,     2,     2],
        [  111, 11461,   910,  ...,     2,     2,     2],
        ...,
        [   50,   181,   858,  ...,     2,     2,     2],
        [   50,  1104,  6158,  ...,     2,     2,     2],
        [37395,     8,     8,  ...,     2,     2,     2]], device='cuda:0'), tensor([12, 17, 18, 11,  9, 33, 30, 22, 14, 15, 15, 23,  8, 13, 17, 26, 27, 27,
        65, 25, 47, 14, 33, 21, 24,  6, 15, 49, 11, 11, 36, 24],
       device='cuda:0'), tensor([[   50,   290,   300,  ...,     2,     2,     2],
        [   30,     6,  3012,  ...,     2,     2,     2],
        [    5,    97,   942,  ...,     2,     2,     2],
        ...,
        [   47,    84,   115,  ...,     2,     2,     2],
        [   30,    47,   422,  ...,     2,     2,     2],
        [10318,   174, 38190,  ...,     2,     2,     2]], device='cuda:0'), tensor([13, 12, 11, 11, 13, 26, 19, 19, 13, 14, 16, 20, 10, 10, 15, 2

#### CNN Encoder

In [6]:
class EncoderCNN(nn.Module):
    """
    One-layer convolutional encoder over frozen pretrained word embeddings.

    @param embeddings: 2-D float tensor handed to nn.Embedding.from_pretrained
    @param emb_size: number of input channels of the convolution; it has to
                     match the embedding dimension
    @param hidden_size: number of output channels of the convolution
    @param kernel_dim: kernel size of the convolution (padding is fixed at 1,
                       so the sequence length is only preserved for kernel_dim == 3)
    @param batch_size: stored on the instance for callers; forward() does not
                       depend on it
    """

    def __init__(self, embeddings, emb_size, hidden_size, kernel_dim, batch_size):

        super(EncoderCNN, self).__init__()
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.embedding = nn.Embedding.from_pretrained(embeddings, freeze=True)
        self.conv1 = nn.Conv1d(emb_size, hidden_size, kernel_size=kernel_dim, padding=1)

    def forward(self, inputs):
        """
        @param inputs: integer index tensor of shape (batch, seq_len)
        @return: tensor of shape (batch, conv_out_len, hidden_size) after ReLU
        """
        # get embedding of words
        embedded = self.embedding(inputs).float()

        # Conv1d expects (batch, channels, length), so swap the last two
        # axes going in and swap them back coming out.
        hidden = self.conv1(embedded.transpose(1, 2)).transpose(1, 2)

        # ReLU is element-wise, so no reshape is needed around it. The
        # previous reshape read a global `batch_size` and broke on batches
        # of any other size.
        return F.relu(hidden)

In [5]:

class DecoderRNN(nn.Module):
    """
    Single-step GRU decoder over a frozen embedding layer.

    @param weights_matrix: passed to `create_emb_layer`, which is defined
                           outside this cell. It is unpacked here as
                           (embedding layer, number of embeddings, embedding
                           dimension) -- TODO confirm against its definition
    @param hidden_size: hidden size of the GRU
    @param batch_size: stored on the instance for callers; forward() infers
                       the batch size from its input instead
    """

    def __init__(self, weights_matrix, hidden_size, batch_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.embedding, self.num_embeddings, self.embedding_dim = create_emb_layer(weights_matrix, True)
        # Keep the embedding weights fixed during training.
        self.embedding.weight.requires_grad = False
        self.gru = nn.GRU(self.embedding_dim, hidden_size, batch_first=True, bidirectional = False)
        self.out = nn.Linear(hidden_size, self.num_embeddings)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, inputs, hidden):
        """
        Decode one time step.

        @param inputs: index tensor holding one token per batch element,
                       e.g. shape (batch, 1) or (batch,)
        @param hidden: GRU hidden state, shape (1, batch, hidden_size)
        @return: (log-probabilities of shape (batch, num_embeddings),
                  new hidden state, 0). The third element is always the
                  constant 0; it looks like a placeholder for weights that
                  are not computed here -- TODO confirm
        """
        # -1 lets the batch dimension follow the input, so a batch of a
        # different size than self.batch_size still works.
        output = self.embedding(inputs).view(-1, 1, self.embedding_dim)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output.squeeze(1)))
        weights = 0
        return output, hidden, weights


In [11]:
# class DecoderRNN(nn.Module):
#     def __init__(self, hidden_size, output_size):
#         super(DecoderRNN, self).__init__()
#         self.hidden_size = hidden_size

#         self.embedding = nn.Embedding(output_size, hidden_size)
#         self.gru = nn.GRU(hidden_size, hidden_size)
#         self.out = nn.Linear(hidden_size, output_size)
#         self.softmax = nn.LogSoftmax(dim=1)

#     def forward(self, inputs, hidden):
#         output = self.embedding(inputs).view(1, 1, -1)
#         output = F.relu(output)
#         output, hidden = self.gru(output, hidden)
#         output = self.softmax(self.out(output[0]))
#         return output, hidden

#     def initHidden(self):
#         return torch.zeros(1, 1, self.hidden_size, device=device)

#### Training

In [7]:

def train_cnn(batch_size, inputs, target, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    """
    Run one teacher-forced training step on a single batch.

    @param batch_size: number of sequences in the batch; sizes the initial
                       SOS decoder input
    @param inputs: source index tensor, shape (batch, source_len)
    @param target: target index tensor, shape (batch, target_len)
    @param encoder: module called as encoder(inputs)
    @param decoder: module called as decoder(decoder_input, decoder_hidden)
                    and returning (output, hidden, weights)
    @param encoder_optimizer: optimizer for the encoder parameters
    @param decoder_optimizer: optimizer for the decoder parameters
    @param criterion: loss called as criterion(decoder_output, target_column)
    @return: summed loss over all target positions divided by target_len

    Teacher forcing is always applied: the gold token of step i is fed as
    the decoder input of step i + 1. Free-running decoding is not implemented.
    """
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    t_sentence_length = target.size()[1]

    encoder_hidden = encoder(inputs)

    decoder_input = torch.tensor(batch_size * [[SOS_token]], device=device)
    # NOTE(review): the encoder output is used as the initial decoder state
    # as-is. A GRU decoder expects (num_layers, batch, hidden), so confirm
    # the encoder really returns that shape.
    decoder_hidden = encoder_hidden

    loss = 0
    for i in range(t_sentence_length):
        # Carry the hidden state forward; otherwise every step would restart
        # from the encoder state and the recurrence would be lost.
        d_out, decoder_hidden, d_weights = decoder(decoder_input, decoder_hidden)
        loss += criterion(d_out.squeeze(1), target[:, i])
        decoder_input = target[:, i]

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / t_sentence_length