In [15]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import os
import pickle as pkl

import numpy as np
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

from torchnlp.datasets import imdb_dataset
from torchnlp.datasets import penn_treebank_dataset
%matplotlib inline

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
SOS_token = 0
EOS_token = 1
MASKED_token = 2
MAX_LENGTH = 42

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {"SOSTOKEN": 0, "EOSTOKEN": 1, "MASKEDTOKEN": 2}
        self.index2word = {0: "SOSTOKEN", 1: "EOSTOKEN", 2: "MASKEDTOKEN"}
        self.word2count = {"SOSTOKEN": 0, "EOSTOKEN": 0, "MASKEDTOKEN": 0}
        
        self.n_words = 3  # Count SOS and EOS and Masked token

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [3]:
def unicodeToAscii(s):
    """
    Turn a Unicode string to plain ASCII, thanks to
    https://stackoverflow.com/a/518232/2809427
    """
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

def normalizeString(s):  # Lowercase, trim, and remove non-letter characters
    s = unicodeToAscii(s.lower().strip())
    #s = re.sub(r"([.!?])", r" \1", s)
    #s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = re.sub(r"[^a-zA-Z]+", r" ", s)
    s = " ".join(s.split()[:40])
    return s

In [4]:
def readLang(dataset_title):
    """
    Args:
        dataset_title: either 'imdb' or 'ptb'
    """
    print("Reading lines...")
    if dataset_title == 'imdb':
        train = imdb_dataset(train=True, directory='../data/')
        # Read the dataset and split into lines
        lines = [train[ind]['text'].strip() for ind, doc in enumerate(train)]
        # Normalize lines
        lines = [' '.join(["SOSTOKEN", normalizeString(s), "EOSTOKEN"]) for s in lines]
        lang = Lang(dataset_title)
    elif dataset_title == 'ptb':
        raise NotImplementedError
    return lang, lines

In [5]:
def prepareData(dataset_title):
    lang, lines = readLang(dataset_title)
    print("Read %s sentence pairs" % len(lines))
    print("Counting words...")
    for l in lines:
        lang.addSentence(l)
    print("Counted words:")
    print(lang.name, lang.n_words)
    return lang, lines

In [6]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    #indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

def tensorsForTrain(lang, sentence):
    # mask = generate_mask(len(sentence))
    target_tensor = tensorFromSentence(lang, sentence)
    # transformed_sentence = " ".join(transform_input_with_is_missing_token(sentence.split(), mask))
    #input_tensor = tensorFromSentence(lang, transformed_sentence)
    return target_tensor # , target_tensor

def indexFromTensor(lang, decoder_output):
    return decoder_output.max(0)[1]

In [7]:
class pretrainLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers=num_layers)
        self.fc = nn.Linear(hidden_size, input_size)
        
    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.lstm(output, hidden)
        output = self.fc(output)
        return output, hidden

    def initHidden(self):
        return (torch.zeros(1, 1, self.hidden_size, device=device),
                torch.zeros(1, 1, self.hidden_size, device=device))

In [8]:
MAX_LENGTH = 42 # max(map(lambda x: len(x.split()), imdb_lines)) == 2516

def train(input_tensor, model, model_optimizer, criterion, max_length=MAX_LENGTH):
    model_hidden = model.initHidden()

    model_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    
    model_outputs = torch.zeros(max_length, model.input_size, device=device)

    loss = 0

    for ei in range(input_length - 1):
        model_output, model_hidden = model(
            input_tensor[ei], model_hidden)
        #print(model_output, input_tensor.shape, input_tensor[0].shape)
        loss += criterion(model_output[0], input_tensor[ei + 1])
        model_outputs[ei] = model_output[0]
    
    loss.backward()

    
    model_optimizer.step()

    return loss.item() / input_length

In [9]:
from time import time

In [19]:
def trainIters(model, lang, lines, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    #start = time.time()
    start = time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    model_optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    training_sentences = [tensorFromSentence(lang, lines[0]) for i in range(n_iters)]
    
    criterion = nn.CrossEntropyLoss() 
    
    for number in range(1, n_iters + 1):
        #c_ = time()
        input_tensor = training_sentences[number - 1]
        print(input_tensor.shape)
        #print('Pairs created ...', time() - c_)
        #c_ = time()
        loss = train(input_tensor, model,
                     model_optimizer, criterion)
        #print('Loss is done...', time() - c_)
        #c_ = time()
        print_loss_total += loss
        plot_loss_total += loss

        if number % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, number / n_iters),
                                         number, number / n_iters * 100, print_loss_avg))

        if number % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0
    
    
    showPlot(plot_losses)
    return plot_losses

In [11]:
from time import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [12]:
import matplotlib.pyplot as plt
#plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np
%matplotlib inline

def showPlot(points):
    #print(points)
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)
    plt.grid()
    plt.show();
    print('plot was shown')

In [16]:
%%time
dataset = 'imdb'
lang_filename = './data/' + dataset + '_lang.pkl'
if os.path.exists(lang_filename):
    with open(lang_filename, 'rb') as file:
        (lang, lines) = pkl.load(file)
else:
    lang, lines = prepareData(dataset)
    with open(lang_filename, 'wb') as file:
        pkl.dump((lang, lines), file)

Wall time: 100 ms


In [18]:
hidden_size = 64
lstm = pretrainLSTM(lang.n_words, hidden_size).to(device)

In [21]:
%%time
trainIters(lstm, lang, lines, 100, print_every=10, plot_every=1)

torch.Size([42, 1])
torch.Size([42, 1])
torch.Size([42, 1])
torch.Size([42, 1])


KeyboardInterrupt: 

In [15]:
result = dict()

In [None]:
from IPython.display import clear_output
from tqdm import tqdm
numbers = [2 ** i for i in range(5, 10)]
for hidden_size in numbers:
    #hidden_size = 64
    print(hidden_size)
    lstm = pretrainLSTM(imdb_lang.n_words, hidden_size).to(device)
    result[hidden_size] = trainIters(lstm, imdb_lang, imdb_lines, 1000, print_every=100, plot_every=1)
    clear_output()

256
0m 58s (- 8m 48s) (100 10%) 1.8451
1m 57s (- 7m 49s) (200 20%) 0.0191
2m 56s (- 6m 51s) (300 30%) 0.0094
3m 55s (- 5m 52s) (400 40%) 0.0061
4m 53s (- 4m 53s) (500 50%) 0.0045
5m 52s (- 3m 54s) (600 60%) 0.0036
6m 52s (- 2m 56s) (700 70%) 0.0029


In [25]:
result.keys()

dict_keys([32, 64, 128, 256, 512])

In [33]:
import pickle as pkl
pkl.dump(result, open('losses.pkl', 'wb'))

In [26]:
new_res = []
number = []
for key in result.keys():
    new_res.append(result[key][-1])
    number.append(key)
    print(key, result[key][-1])

32 0.017981847127278645
64 0.0068881625220889135
128 0.0032731464930943082
256 0.0017632983979724702
512 0.001079922630673363


In [38]:
import numpy as np

In [40]:
number[np.argmin(new_res)]

1024

In [11]:
showPlot(result[512][50:200])

NameError: name 'showPlot' is not defined