In [1]:
from collections import defaultdict
import math
import time
import random
import os, sys

import torch
import torch.nn as nn
from torch.autograd import Variable

In [2]:

# --- Hyperparameters ---
N = 2           # context window: N words are taken on each side of the target word
EMB_SIZE = 128  # dimensionality of every word embedding

# --- Output files ---
embeddings_location = "embeddings.txt"  # receives the learned word embeddings
labels_location = "labels.txt"          # receives the vocabulary labels

# Run on the GPU whenever one is visible to torch.
USE_CUDA = torch.cuda.is_available()

# --- Vocabulary ---
# NOTE: The data comes from the Penn Treebank, which is already converted into
#       an easy-to-use format with "<unk>" symbols. With other data we would
#       have to do pre-processing and decide how to choose unknown words, etc.
# While this default factory is active, every previously unseen word is given
# the next free integer id on first lookup.
w2i = defaultdict(lambda: len(w2i))
# Evaluated left to right, so "<s>" gets the first id and "<unk>" the second.
S, UNK = w2i["<s>"], w2i["<unk>"]
def read_dataset(filename):
    """Lazily yield each line of `filename` as a list of word ids.

    Every line is stripped and split on single spaces; each token is looked up
    in the module-level `w2i` mapping, so what happens for an unseen token
    depends on the default factory `w2i` has at the time of iteration.
    """
    with open(filename, "r") as corpus:
        for raw_line in corpus:
            tokens = raw_line.strip().split(" ")
            yield [w2i[token] for token in tokens]

# Read in the training data; this pass also builds the vocabulary in w2i.
train = list(read_dataset("../data/ptb/train.txt"))

# Snapshot the id -> word table and the vocabulary size right after the
# training pass, *before* the dev set is read. Looking up an unseen word in
# the UNK-defaulting dict below inserts that word as a new key (with value
# UNK). Taking these afterwards would inflate len(w2i) beyond the number of
# distinct ids, could overwrite i2w[UNK] with a dev word, and would make
# i2w[i] raise KeyError in the labels loop.
i2w = {v: k for k, v in w2i.items()}
nwords = len(w2i)

# From here on, unseen words map to UNK instead of receiving a fresh id.
w2i = defaultdict(lambda: UNK, w2i)
dev = list(read_dataset("../data/ptb/valid.txt"))

# Write one vocabulary word per line, in word-id order.
with open(labels_location, 'w') as labels_file:
    for i in range(nwords):
        labels_file.write(i2w[i] + '\n')

In [3]:

class CBOW(nn.Module):
    """Continuous bag-of-words model.

    The embeddings of the context words are summed and the sum is projected
    onto one score per vocabulary word.

    Args:
        nwords: vocabulary size (number of embedding rows and of output scores).
        emb_size: size of each embedding vector.
        num_context: accepted for the caller's convenience; not used here.
    """

    def __init__(self, nwords, emb_size, num_context):
        super(CBOW, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=nwords, embedding_dim=emb_size)
        self.linear = nn.Linear(in_features=emb_size, out_features=nwords)

    def forward(self, words):
        """Score every vocabulary word for each row of context word ids.

        `words` is expected to be a 2D LongTensor [batch_size x context_len];
        the result is a 2D tensor [batch_size x nwords] of unnormalised scores.
        """
        context_emb = self.embedding(words)  # [batch_size x context_len x emb_size]
        bag = context_emb.sum(1)             # [batch_size x emb_size]
        return self.linear(bag)              # [batch_size x nwords]

In [4]:
# Build the CBOW model (moved to the GPU when available) and its Adam optimizer.
model = CBOW(nwords, EMB_SIZE, N)
if USE_CUDA:
    model = model.cuda()
ADAM_LR = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=ADAM_LR)

In [5]:
# Wrap a (nested) list of ints in a pytorch Variable holding a LongTensor
def convert_to_variable(words):
    """Return `words` as a Variable over a LongTensor, on the GPU if USE_CUDA is set."""
    wrapped = Variable(torch.LongTensor(words))
    return wrapped.cuda() if USE_CUDA else wrapped

# Score a batch of context windows with the global model
def calc_score_of_histories(words):
    """Convert the list of context windows to a LongTensor Variable and return the model's logits."""
    return model(convert_to_variable(words))

# Compute the summed loss over every word of one sentence
def calc_sent_loss(sent):
    """Return the summed cross-entropy of predicting each word of `sent` from its context.

    The sentence is padded with N copies of S on both sides so that every
    position has N context words to its left and N to its right. The target
    word itself is left out of its own context.
    """
    pad = [S] * N
    padded_sent = pad + sent + pad
    # Indices of the real (non-padding) words inside padded_sent.
    positions = range(N, N + len(sent))
    all_histories = [padded_sent[i - N:i] + padded_sent[i + 1:i + N + 1] for i in positions]
    all_targets = [padded_sent[i] for i in positions]
    logits = calc_score_of_histories(all_histories)
    targets_var = convert_to_variable(all_targets)
    # size_average=False: sum the per-word losses instead of averaging them.
    return nn.functional.cross_entropy(logits, targets_var, size_average=False)

In [6]:

# Dev loss of the previous iteration and best dev loss seen so far.
last_dev = 1e20
best_dev = 1e20

for ITER in range(2):
    # Perform training
    random.shuffle(train)
    # set the model to training mode
    model.train()
    train_words, train_loss = 0, 0.0
    start = time.time()
    for sent_id, sent in enumerate(train):
        if len(sent)==0:
            continue
        my_loss = calc_sent_loss(sent)
        # NOTE(review): `.data[0]` is the old-style way of reading a scalar
        # loss; newer PyTorch releases use `.item()` -- confirm the version
        # this notebook targets before upgrading.
        train_loss += my_loss.data[0]
        train_words += len(sent)
        optimizer.zero_grad()
        my_loss.backward()
        optimizer.step()
        if (sent_id+1) % 5000 == 0:
            print("--finished %r sentences (word/sec=%.2f)" % (sent_id+1, train_words/(time.time()-start)))
    print("iter %r: train loss/word=%.4f, ppl=%.4f (word/sec=%.2f)" % (ITER, train_loss/train_words, math.exp(train_loss/train_words), train_words/(time.time()-start)))

    # Evaluate on dev set
    # set the model to evaluation mode
    model.eval()
    dev_words, dev_loss = 0, 0.0
    start = time.time()
    for sent_id, sent in enumerate(dev):
        my_loss = calc_sent_loss(sent)
        dev_loss += my_loss.data[0]
        dev_words += len(sent)

    # Keep track of the development loss and halve the learning rate if it got worse.
    # A torch optimizer has no `learning_rate` attribute; the rate is stored
    # per parameter group under the 'lr' key.
    if last_dev < dev_loss:
        for param_group in optimizer.param_groups:
            param_group['lr'] /= 2
    last_dev = dev_loss

    # Keep track of the best development loss, and write the embeddings only
    # when this iteration is the best one so far (one tab-separated row per word id).
    if best_dev > dev_loss:
        print("saving embedding files")
        with open(embeddings_location, 'w') as embeddings_file:
            W_w_np = model.embedding.weight.cpu().data.numpy()
            for i in range(nwords):
                ith_embedding = '\t'.join(map(str, W_w_np[i]))
                embeddings_file.write(ith_embedding + '\n')
        best_dev = dev_loss

    print("iter %r: dev loss/word=%.4f, ppl=%.4f (word/sec=%.2f)" % (ITER, dev_loss/dev_words, math.exp(dev_loss/dev_words), dev_words/(time.time()-start)))



--finished 5000 sentences (word/sec=3498.61)
--finished 10000 sentences (word/sec=3560.98)
--finished 15000 sentences (word/sec=3582.08)
--finished 20000 sentences (word/sec=3573.47)
--finished 25000 sentences (word/sec=3570.96)
--finished 30000 sentences (word/sec=3571.83)
--finished 35000 sentences (word/sec=3571.88)
--finished 40000 sentences (word/sec=3596.20)
iter 0: train loss/word=5.9816, ppl=396.0550 (word/sec=3592.03)
saving embedding files
iter 0: dev loss/word=5.7494, ppl=313.9921 (word/sec=13824.89)
--finished 5000 sentences (word/sec=3598.42)
--finished 10000 sentences (word/sec=3620.76)
--finished 15000 sentences (word/sec=3617.77)
--finished 20000 sentences (word/sec=3610.23)
--finished 25000 sentences (word/sec=3598.95)
--finished 30000 sentences (word/sec=3596.11)
--finished 35000 sentences (word/sec=3590.87)
--finished 40000 sentences (word/sec=3590.84)
iter 1: train loss/word=5.4123, ppl=224.1407 (word/sec=3591.32)
saving embedding files
iter 1: dev loss/word=5.5907,