In [1]:
from collections import defaultdict
import math
import time
import random
import os, sys

import torch
import torch.nn as nn
from torch.autograd import Variable

from torch.utils.data import Dataset, DataLoader,TensorDataset
torch.manual_seed(1)    # reproducible

<torch._C.Generator at 0x20775221310>

In [2]:
# Hyper-parameters and output locations shared by the cells below.
BATCH_SIZE=64 # Number of (center word, context word) pairs per mini-batch
N = 2 # Context window radius: each word is paired with its neighbours up to N positions away on each side
EMB_SIZE = 128 # The size of the embedding

embeddings_location = "embeddings.txt" #the file to write the word embeddings to
labels_location = "labels.txt" #the file to write the labels to

# True when a CUDA device is available; the model and batches are then moved to the GPU.
USE_CUDA = torch.cuda.is_available()

# Functions to read in the corpus
# NOTE: We are using data from the Penn Treebank, which is already converted
#       into an easy-to-use format with "<unk>" symbols. If we were using other
#       data we would have to do pre-processing and consider how to choose
#       unknown words, etc.
# Word -> integer id. Looking up an unseen word assigns it the next free id
# (the current size of the dictionary) and stores it.
w2i = defaultdict(lambda: len(w2i))
# The order of these two lookups matters: "<s>" receives id 0 and "<unk>" id 1.
S = w2i["<s>"]
UNK = w2i["<unk>"]
def read_dataset(filename):
    """Lazily read a corpus file, yielding one list of word ids per line.

    Each line is stripped and split on single spaces; every token is mapped
    through the module-level ``w2i`` dictionary (so, depending on how ``w2i``
    is configured at call time, unseen tokens may receive new ids or a
    default id).

    Args:
        filename: path of the text file to read.

    Yields:
        A list of integer word ids for each line of the file.
    """
    with open(filename, "r") as corpus:
        for raw_line in corpus:
            ids = []
            for token in raw_line.strip().split(" "):
                ids.append(w2i[token])
            yield ids

# Read in the training data; this is what populates the vocabulary.
train = list(read_dataset("../data/ptb/train.txt"))

# Snapshot the vocabulary *before* the dev set is read. Once w2i is frozen
# below, looking up an unseen dev word still inserts that word as a new key
# (mapped to UNK). Computing these afterwards would inflate nwords and leave
# i2w with missing ids and a clobbered entry for UNK.
i2w = {v: k for k, v in w2i.items()}
nwords = len(w2i)

# Freeze the vocabulary: from here on, unknown words map to UNK instead of
# receiving fresh ids.
w2i = defaultdict(lambda: UNK, w2i)
dev = list(read_dataset("../data/ptb/valid.txt"))


# Write one word per line, ordered by id, so line k labels embedding row k.
with open(labels_location, 'w') as labels_file:
    for i in range(nwords):
        labels_file.write(i2w[i] + '\n')

In [3]:

class SKIP(nn.Module):
    """Skip-gram style predictor: embed a word, then score every vocabulary word.

    Attributes:
        embedding: lookup table with ``nwords`` rows of size ``emb_size``.
        linear: projection from an embedding to ``nwords`` output scores.
    """

    def __init__(self, nwords, emb_size):
        """Create the embedding table and the output projection.

        Args:
            nwords: vocabulary size (number of embedding rows and output scores).
            emb_size: dimensionality of each word embedding.
        """
        super().__init__()
        self.embedding = nn.Embedding(nwords, emb_size)
        self.linear = nn.Linear(emb_size, nwords)

    def forward(self, words):
        """Compute unnormalised scores over the vocabulary for each input row.

        Args:
            words: integer tensor of word ids; the loaders in this notebook
                supply it with shape [batch_size x 1].

        Returns:
            Tensor of shape [batch_size x nwords] holding the logits.
        """
        embedded = self.embedding(words)
        # Collapse everything after the batch dimension into one feature axis.
        batch = embedded.size(0)
        flat = embedded.view(batch, -1)
        return self.linear(flat)

In [4]:
# Build the network (moved to the GPU when one is available) and attach a
# plain SGD optimizer to its parameters.
model = SKIP(nwords, EMB_SIZE)
if USE_CUDA:
    model = model.cuda()
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.05)

In [5]:
def convert_to_tensor(words):
    """Turn a (possibly nested) list of ints into a CPU ``torch.LongTensor``.

    Args:
        words: list, or list of lists, of integer word ids.

    Returns:
        A LongTensor holding the same values; it is not moved to the GPU here.
    """
    return torch.LongTensor(words)

def convert_to_variable(tensor):
    """Wrap a tensor in an autograd ``Variable``, on the GPU when enabled.

    Args:
        tensor: the tensor to wrap (for example a batch taken from a DataLoader).

    Returns:
        The wrapped Variable; ``.cuda()`` is applied when ``USE_CUDA`` is true.
    """
    wrapped = Variable(tensor)
    return wrapped.cuda() if USE_CUDA else wrapped

def convert_to_loader(train):
    """Build a shuffled DataLoader of (center word, context word) pairs.

    For every position ``i`` in every sentence and every offset ``j`` in
    ``1..N``, two pairs are produced: (sent[i], sent[i-j]) and
    (sent[i], sent[i+j]). A neighbour that falls outside the sentence is
    replaced by the sentence-boundary id ``S``.

    Args:
        train: iterable of sentences, each a list of integer word ids.

    Returns:
        A ``DataLoader`` yielding batches ``(x, y)`` of at most ``BATCH_SIZE``
        pairs, where ``x`` has shape [batch x 1] (center word ids) and ``y``
        has shape [batch] (context word ids).
    """
    all_histories = []
    all_targets = []
    for sent in train:
        if not sent:
            continue
        sent_len = len(sent)
        for i in range(sent_len):
            for j in range(1, N + 1):
                left, right = i - j, i + j
                # Left neighbour first, then right neighbour, both paired
                # with the same center word.
                all_histories.append([sent[i]])
                all_targets.append(sent[left] if left >= 0 else S)
                all_histories.append([sent[i]])
                all_targets.append(sent[right] if right < sent_len else S)

    # Pass the tensors positionally: the old ``data_tensor=`` / ``target_tensor=``
    # keyword names were removed from TensorDataset in PyTorch 0.4, while the
    # positional form is accepted by both the old and the current signature.
    train_dataset = TensorDataset(convert_to_tensor(all_histories),
                                  convert_to_tensor(all_targets))
    train_loader = DataLoader(
        dataset=train_dataset,      # torch TensorDataset format
        batch_size=BATCH_SIZE,      # mini batch size
        shuffle=True,
        num_workers=0
    )
    return train_loader

# Build the mini-batch loaders for the training and validation splits.
train_loader = convert_to_loader(train)
dev_loader = convert_to_loader(dev)

In [6]:

# last_dev: dev loss of the previous epoch (drives learning-rate decay).
# best_dev: lowest dev loss seen so far (drives saving of the embeddings).
last_dev = 1e20
best_dev = 1e20

for ITER in range(5):
    # Perform training
    model.train()
    train_words, train_loss = 0, 0.0
    start = time.time()
    for batch_x, batch_y in train_loader:
        batch_x = convert_to_variable(batch_x)
        batch_y = convert_to_variable(batch_y)
        logits = model(batch_x)
        # Sum (rather than average) over the batch so the totals below can be
        # divided by the exact number of predicted words.
        my_loss = nn.functional.cross_entropy(logits, batch_y, reduction='sum')
        # .item() extracts the Python float; indexing a 0-dim loss with
        # .data[0] raises on current PyTorch releases.
        train_loss += my_loss.item()
        train_words += len(batch_y)
        optimizer.zero_grad()
        my_loss.backward()
        optimizer.step()
    print("iter %r: train loss/word=%.4f, ppl=%.4f (word/sec=%.2f)" % (ITER, train_loss/train_words, math.exp(train_loss/train_words), train_words/(time.time()-start)))

    # Evaluate on dev set
    # set the model to evaluation mode
    model.eval()
    dev_words, dev_loss = 0, 0.0
    start = time.time()
    # No gradients are needed for evaluation; skip building the autograd graph.
    with torch.no_grad():
        for batch_x, batch_y in dev_loader:
            batch_x = convert_to_variable(batch_x)
            batch_y = convert_to_variable(batch_y)
            logits = model(batch_x)
            my_loss = nn.functional.cross_entropy(logits, batch_y, reduction='sum')
            dev_loss += my_loss.item()
            dev_words += len(batch_y)
    # Take the timing now so the reported throughput does not include the
    # time spent writing the embeddings file below.
    dev_elapsed = time.time() - start

    # Keep track of the development loss and halve the learning rate if it got worse
    if last_dev < dev_loss:
        for g in optimizer.param_groups:
            g['lr'] /= 2
        print('lr decay')
    last_dev = dev_loss

    # Keep track of the best development loss, and write the embeddings only
    # when this epoch is the best one so far
    if best_dev > dev_loss:
        print("saving embedding files")
        with open(embeddings_location, 'w') as embeddings_file:
            W_w_np = model.embedding.weight.detach().cpu().numpy()
            # One tab-separated row per word id, matching the labels file order.
            for i in range(nwords):
                ith_embedding = '\t'.join(map(str, W_w_np[i]))
                embeddings_file.write(ith_embedding + '\n')
        best_dev = dev_loss

    print("iter %r: dev loss/word=%.4f, ppl=%.4f (word/sec=%.2f)" % (ITER, dev_loss/dev_words, math.exp(dev_loss/dev_words), dev_words/dev_elapsed))


iter 0: train loss/word=6.6550, ppl=776.6416 (word/sec=21484.70)
saving embedding files
iter 0: dev loss/word=6.4265, ppl=617.9775 (word/sec=40085.39)
iter 1: train loss/word=6.2485, ppl=517.2380 (word/sec=22325.43)
saving embedding files
iter 1: dev loss/word=6.2913, ppl=539.8292 (word/sec=42527.15)
iter 2: train loss/word=6.1147, ppl=452.4713 (word/sec=22567.33)
saving embedding files
iter 2: dev loss/word=6.2490, ppl=517.5196 (word/sec=48141.58)
iter 3: train loss/word=6.0355, ppl=418.0194 (word/sec=24687.85)
saving embedding files
iter 3: dev loss/word=6.2259, ppl=505.6799 (word/sec=44144.88)
iter 4: train loss/word=5.9814, ppl=395.9856 (word/sec=21844.80)
saving embedding files
iter 4: dev loss/word=6.2139, ppl=499.6622 (word/sec=42013.13)
