In [1]:
import numpy as np
from tqdm import tqdm
from pandas import read_csv
from tqdm import tnrange, tqdm_notebook
from nltk import word_tokenize

import torch
from torch import optim
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F

In [2]:
with open('tolstoy_anna.txt', 'r') as f:
    txt = f.readlines()
txt = [x.strip() for x in txt]
txt_str = ''
for t in txt:
    txt_str += ' {}'.format(t.lower())

In [3]:
tokens = word_tokenize(txt_str)

In [4]:
# word2idx and idx2word setup
unique_tokens = set(tokens)
w2x = {word: idx for (idx, word) in enumerate(unique_tokens)}
x2w = {idx: word for (idx, word) in enumerate(unique_tokens)}
indices = [w2x[w] for w in tokens]

In [5]:
vocab_size = len(unique_tokens)

In [6]:
# generate training data
window = 2
train_data = []
for idx in range(len(indices)):
    for r in range(-window, window + 1):
        cxt = idx + r
        if not ((cxt < 0) or (cxt >= len(indices)) or (idx == cxt)):
            train_data.append([indices[idx], indices[cxt]])
train_data = np.array(train_data)
train_data = torch.LongTensor(train_data)

In [7]:
# sanity check
for [x, y] in train_data[2100:2120]:
    print(x2w[int(x)], x2w[int(y)])
# clean memory
# del indices
# del tokens

serge ,
curtains the
curtains serge
curtains ,
curtains he
, serge
, curtains
, he
, cheerfully
he curtains
he ,
he cheerfully
he dropped
cheerfully ,
cheerfully he
cheerfully dropped
cheerfully his
dropped he
dropped cheerfully
dropped his


In [36]:
# Continuous Bag-of-Words Model
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim,
                 context_size, batch_size):
        super(CBOW, self).__init__()
        self.batch_size = batch_size
        self.embed = nn.Embedding(vocab_size, embedding_dim)
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, vocab_size)
        self.out = nn.Softmax(dim=2)

    def forward(self, x):
        x = self.embed(x).view(self.batch_size, 1, -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return self.out(x).squeeze()

model = CBOW(vocab_size=vocab_size, embedding_dim=100, hidden_dim=128,
             context_size=2, batch_size=128)

In [41]:
def one_hot(idx_batch):
    one_hot_mat = torch.zeros((len(idx_batch), vocab_size)).float()
    indices = torch.LongTensor(idx_batch).view(-1, 1)
    one_hot_mat.scatter_(1, indices, 1.0)
    return one_hot_mat


def mat_loss(pred, gt):
    delta = torch.exp(pred - Variable(gt))
    norm = torch.norm(delta, p=2, dim=1)
    return (torch.sum(norm) / gt.shape[1])


def batchify(data, batch_size, use_cuda=False):
    rm_size = len(data) % batch_size
    x = data[:-rm_size, 0].contiguous()
    y = data[:-rm_size, 1].contiguous()
    if use_cuda:
        x = x.view(-1, batch_size).cuda()
    else:
        x = x.view(-1, batch_size)
    y = y.view(-1, batch_size)
    return x, y

In [42]:
x, y = batchify(train_data, batch_size=128, use_cuda=False)

In [43]:
def train(x_train, y_train, num_epochs, use_cuda=False):
    loss_fn = mat_loss
    optimizer = optim.SGD(model.parameters(), lr=1e-3)
    scheduler = optim.lr_scheduler.StepLR(optimizer,
                                          step_size=1000,
                                          gamma=0.9)
    x_train = Variable(x_train)
    for epoch in tnrange(num_epochs, desc='epoch'):
        total_loss = 0
        for batch_idx in tqdm_notebook(range(x_train.shape[0]),
                                       desc='batch', leave=False):
            x = x_train[batch_idx, :]
            y = y_train[batch_idx, :]
            model.zero_grad()
            log_prob = model(x)
            gt = one_hot(y)
            loss = loss_fn(log_prob, gt)
            loss.backward()
            scheduler.step()
            total_loss += loss.data
            if batch_idx % 2000 == 0:
                num_seen = (batch_idx + 1) * x_train.shape[1]
                l = np.log(float(total_loss / num_seen))
                print("BATCH: {}/{} | AVG LOG LOSS: {}".format(batch_idx + 1,
                                                           x_train.shape[0],
                                                           l))
            if batch_idx % (x_train.shape[0] // 4) == 0:
                torch.save(model.state_dict(), 'models/model_{}.pt'.format(epoch+5))
                print("Successfully saved model")

In [44]:
train(x, y, num_epochs=10, use_cuda=False)

HBox(children=(IntProgress(value=0, description='epoch', max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, description='batch', max=13463), HTML(value='')))

BATCH: 1/13463 | AVG LOG LOSS: -4.810524642346785
Successfully saved model
BATCH: 2001/13463 | AVG LOG LOSS: -4.810536536139335
Successfully saved model
BATCH: 4001/13463 | AVG LOG LOSS: -4.8104819859658265
BATCH: 6001/13463 | AVG LOG LOSS: -4.810463803235908
Successfully saved model
BATCH: 8001/13463 | AVG LOG LOSS: -4.810463002745963
BATCH: 10001/13463 | AVG LOG LOSS: -4.810549573728305
Successfully saved model
BATCH: 12001/13463 | AVG LOG LOSS: -4.810607215632338
Successfully saved model


HBox(children=(IntProgress(value=0, description='batch', max=13463), HTML(value='')))

BATCH: 1/13463 | AVG LOG LOSS: -4.810524642346785
Successfully saved model



KeyboardInterrupt: 

In [35]:
ls models

model_0.0.pt  model_10.pt   model_13.pt   model_3.pt    model_7.pt
model_0.pt    model_11.pt   model_14.pt   model_5.pt    model_8.pt
model_1.pt    model_12.pt   model_2.pt    model_6.pt    model_9.pt
