In [1]:
from collections import defaultdict
import math
import time
import random
import os, sys

import torch
import torch.nn as nn
from torch.autograd import Variable

from torch.utils.data import Dataset, DataLoader,TensorDataset
# Seed PyTorch's global random number generator so repeated runs are reproducible.
torch.manual_seed(1)


<torch._C.Generator at 0x1fce098c350>

In [2]:

# ---- Hyper-parameters ----
N = 2             # number of context words taken on EACH side of the target word
EMB_SIZE = 128    # dimensionality of every word embedding
BATCH_SIZE = 64   # examples per mini-batch

# ---- Output files ----
embeddings_location = "embeddings.txt"  # the learned word embeddings are written here
labels_location = "labels.txt"          # the vocabulary words are written here

# Use the GPU whenever PyTorch reports that CUDA is available.
USE_CUDA = torch.cuda.is_available()

# ---- Vocabulary / corpus reading ----
# NOTE: The data comes from the Penn Treebank, which has already been converted
#       into an easy-to-use format with "<unk>" symbols. With any other corpus
#       we would have to pre-process it and decide how to choose unknown
#       words, etc.
# Looking up a word that has not been seen yet assigns it the next free id.
w2i = defaultdict(lambda: len(w2i))
# Evaluated left to right, so "<s>" receives id 0 and "<unk>" receives id 1.
S, UNK = w2i["<s>"], w2i["<unk>"]
def read_dataset(filename):
    """Lazily yield each line of `filename` as a list of word ids.

    Words are mapped through the module-level `w2i` dictionary, so what
    happens to a previously unseen word depends on how `w2i` is configured at
    the time the generator is consumed.

    Args:
        filename: path of a text file holding one whitespace-tokenised
            sentence per line.

    Yields:
        list[int]: the ids of the words on one line. A blank line yields an
        empty list.
    """
    with open(filename, "r") as f:
        for line in f:
            # split() with no argument collapses runs of whitespace and returns
            # [] for a blank line. split(" ") would instead produce "" tokens,
            # which would be given a vocabulary id of their own and would make
            # a blank line look like a one-word sentence.
            yield [w2i[x] for x in line.split()]

# Read in the training data; every new word is assigned a fresh id here.
train = list(read_dataset("../data/ptb/train.txt"))

# Snapshot the vocabulary BEFORE touching the dev set. The frozen defaultdict
# created below still *inserts* every unseen key it is asked for (mapped to
# UNK), so computing these afterwards would inflate nwords and let an
# out-of-vocabulary dev word overwrite the "<unk>" entry of i2w.
nwords = len(w2i)
i2w = {v: k for k, v in w2i.items()}

# Freeze the vocabulary: from now on unknown words map to UNK.
w2i = defaultdict(lambda: UNK, w2i)
dev = list(read_dataset("../data/ptb/valid.txt"))

# Write one vocabulary word per line, in id order.
with open(labels_location, 'w') as labels_file:
    for i in range(nwords):
        labels_file.write(i2w[i] + '\n')

In [3]:

class CBOW(nn.Module):
    """Continuous bag-of-words model.

    The embeddings of the context words are summed and the sum is passed
    through a linear layer that produces one score per vocabulary word.

    Args:
        nwords: vocabulary size (rows of the embedding table and number of
            output scores).
        emb_size: dimensionality of each word embedding.
        num_context: accepted for interface compatibility; the model does not
            use it.
    """

    def __init__(self, nwords, emb_size, num_context):
        super(CBOW, self).__init__()
        # The layers are created in this order (embedding, then linear) so a
        # seeded initialisation stays the same.
        self.embedding = nn.Embedding(num_embeddings=nwords, embedding_dim=emb_size)
        self.linear = nn.Linear(in_features=emb_size, out_features=nwords)

    def forward(self, words):
        """Return the [batch_size x nwords] logits for a batch of context word ids."""
        context_vectors = self.embedding(words)
        # Collapse the context dimension -> [batch_size x emb_size].
        bag_of_words = context_vectors.sum(dim=1)
        return self.linear(bag_of_words)

In [4]:
# Build the CBOW network, move it to the GPU when one is in use, and attach a
# plain SGD optimizer to its parameters.
model = CBOW(nwords=nwords, emb_size=EMB_SIZE, num_context=N)
model = model.cuda() if USE_CUDA else model
optimizer = torch.optim.SGD(model.parameters(), lr=0.05)

In [5]:
def convert_to_tensor(words):
    """Build a LongTensor from a (possibly nested) list of ints.

    No device transfer happens here; `convert_to_variable` moves individual
    batches to the GPU when CUDA is in use.
    """
    return torch.LongTensor(words)

def convert_to_variable(tensor):
    """Wrap `tensor` in an autograd Variable, moving it to the GPU when USE_CUDA is set."""
    wrapped = Variable(tensor)
    return wrapped.cuda() if USE_CUDA else wrapped

def convert_to_loader(train):
    """Turn a corpus of word-id sentences into a shuffled DataLoader of CBOW examples.

    For every word position in every sentence, one example is produced: the
    context is the N ids before the word followed by the N ids after it (the
    sentence is padded with S on both sides), and the target is the word's
    own id.

    Args:
        train: iterable of sentences, each a list of int word ids. Empty
            sentences are skipped.

    Returns:
        A DataLoader over (context, target) pairs that shuffles the examples
        and serves them in mini-batches of BATCH_SIZE.
    """
    all_histories = []
    all_targets = []
    for sent in train:
        if len(sent) == 0:
            continue
        # Pad both ends with the sentence-boundary symbol so that the first
        # and last words also have a full context window.
        padded_sent = [S] * N + sent + [S] * N
        # Step through the real (unpadded) word positions only.
        for i in range(N, N + len(sent)):
            all_histories.append(padded_sent[i - N:i] + padded_sent[i + 1:i + N + 1])
            all_targets.append(padded_sent[i])

    # Pass the tensors positionally: the data_tensor= / target_tensor= keyword
    # names were removed when TensorDataset became TensorDataset(*tensors),
    # whereas positional arguments work with both signatures.
    train_dataset = TensorDataset(convert_to_tensor(all_histories),
                                  convert_to_tensor(all_targets))
    train_loader = DataLoader(
        dataset=train_dataset,      # torch TensorDataset format
        batch_size=BATCH_SIZE,      # mini batch size
        shuffle=True,
        num_workers=0
    )
    return train_loader

# Build the mini-batch loaders for the training and development splits.
train_loader, dev_loader = convert_to_loader(train), convert_to_loader(dev)


In [6]:

# Train for a fixed number of epochs. After each epoch the model is evaluated
# on the dev set; the learning rate is halved whenever the dev loss got worse
# than in the previous epoch, and the embeddings are written to disk whenever
# the dev loss is the best seen so far.
NUM_EPOCHS = 20

last_dev = 1e20   # dev loss of the previous epoch
best_dev = 1e20   # lowest dev loss seen so far

for ITER in range(NUM_EPOCHS):
    # ---- Training pass ----
    model.train()
    train_words, train_loss = 0, 0.0
    start = time.time()
    for batch_x, batch_y in train_loader:
        batch_x = convert_to_variable(batch_x)
        batch_y = convert_to_variable(batch_y)
        logits = model(batch_x)
        # Summed (not averaged) loss, so that dividing by the word count below
        # gives the per-word loss.
        my_loss = nn.functional.cross_entropy(logits, batch_y, reduction='sum')
        # .item() extracts the Python float from the scalar loss tensor.
        train_loss += my_loss.item()
        train_words += len(batch_y)
        optimizer.zero_grad()
        my_loss.backward()
        optimizer.step()
    print("iter %r: train loss/word=%.4f, ppl=%.4f (word/sec=%.2f)" % (ITER, train_loss/train_words, math.exp(train_loss/train_words), train_words/(time.time()-start)))

    # ---- Evaluation pass on the dev set ----
    model.eval()
    dev_words, dev_loss = 0, 0.0
    start = time.time()
    # No gradients are needed for evaluation, so do not record them.
    with torch.no_grad():
        for batch_x, batch_y in dev_loader:
            batch_x = convert_to_variable(batch_x)
            batch_y = convert_to_variable(batch_y)
            logits = model(batch_x)
            my_loss = nn.functional.cross_entropy(logits, batch_y, reduction='sum')
            dev_loss += my_loss.item()
            dev_words += len(batch_y)
    # Measure the time now, so the reported speed does not include the time
    # spent writing the embeddings file below.
    dev_elapsed = time.time() - start

    # Halve the learning rate if the dev loss got worse than in the previous epoch
    if last_dev < dev_loss:
        for g in optimizer.param_groups:
            g['lr'] /= 2
        print('lr decay')
    last_dev = dev_loss

    # Keep track of the best dev loss, and save the embeddings only when it improves
    if best_dev > dev_loss:
        print("saving embedding files")
        with open(embeddings_location, 'w') as embeddings_file:
            W_w_np = model.embedding.weight.detach().cpu().numpy()
            # One line per word id: tab-separated embedding components.
            for i in range(nwords):
                ith_embedding = '\t'.join(map(str, W_w_np[i]))
                embeddings_file.write(ith_embedding + '\n')
        best_dev = dev_loss

    print("iter %r: dev loss/word=%.4f, ppl=%.4f (word/sec=%.2f)" % (ITER, dev_loss/dev_words, math.exp(dev_loss/dev_words), dev_words/dev_elapsed))



iter 0: train loss/word=8.3928, ppl=4415.3643 (word/sec=24490.70)
saving embedding files
iter 0: dev loss/word=7.1321, ppl=1251.4689 (word/sec=29014.93)
iter 1: train loss/word=6.4828, ppl=653.7773 (word/sec=20865.78)
saving embedding files
iter 1: dev loss/word=6.6759, ppl=793.0255 (word/sec=25129.78)
iter 2: train loss/word=5.9186, ppl=371.8960 (word/sec=21381.67)
saving embedding files
iter 2: dev loss/word=6.4141, ppl=610.3669 (word/sec=26296.27)
iter 3: train loss/word=5.6017, ppl=270.8930 (word/sec=21153.32)
saving embedding files
iter 3: dev loss/word=6.3974, ppl=600.2663 (word/sec=24867.28)
iter 4: train loss/word=5.3891, ppl=219.0147 (word/sec=20940.79)
saving embedding files
iter 4: dev loss/word=6.2867, ppl=537.3678 (word/sec=24920.16)
iter 5: train loss/word=5.2350, ppl=187.7259 (word/sec=20936.03)
saving embedding files
iter 5: dev loss/word=6.2610, ppl=523.7229 (word/sec=25189.31)
iter 6: train loss/word=5.1154, ppl=166.5752 (word/sec=20524.67)
lr decay
iter 6: dev loss/w