In [1]:
from collections import defaultdict
import math
import time
import random
import os, sys

import torch
import torch.nn as nn
from torch.autograd import Variable
import numpy as np

In [2]:
K=3 #number of negative samples
N = 2 # The length of the n-gram
EMB_SIZE = 128 # The size of the embedding

embeddings_location = "embeddings.txt" #the file to write the word embeddings to
labels_location = "labels.txt" #the file to write the labels to

USE_CUDA = torch.cuda.is_available()

# Functions to read in the corpus
# NOTE: We are using data from the Penn Treebank, which is already converted
#       into an easy-to-use format with "<unk>" symbols. If we were using other
#       data we would have to do pre-processing and consider how to choose
#       unknown words, etc.
w2i = defaultdict(lambda: len(w2i))
S = w2i["<s>"]
UNK = w2i["<unk>"]

#word counts for negative sampling
word_counts = defaultdict(int)

def read_dataset(filename):
    with open(filename, "r") as f:
        for line in f:
            for word in line:
                word_counts[w2i[word]] += 1
            yield [w2i[x] for x in line.strip().split(" ")]

# Read in the data
train = list(read_dataset("../data/ptb/train.txt"))
w2i = defaultdict(lambda: UNK, w2i)
dev = list(read_dataset("../data/ptb/valid.txt"))
i2w = {v: k for k, v in w2i.items()}
nwords = len(w2i)


# take the word counts to the 3/4, normalize
counts =  np.array([list(x) for x in word_counts.items()])[:,1]**.75
normalizing_constant = sum(counts)
word_probabilities = np.zeros(nwords)
for word_id in word_counts:
    word_probabilities[word_id] = word_counts[word_id]**.75/normalizing_constant


with open(labels_location, 'w') as labels_file:
    for i in range(nwords):
        labels_file.write(i2w[i] + '\n')

In [3]:

class SKIP_NS(nn.Module):
    def __init__(self, nwords, emb_size):
        super(SKIP_NS, self).__init__()
        self.embedding = nn.Embedding(nwords, emb_size)

    def forward(self, words,contexts):
        #print(words)
        emb1 = self.embedding(words)    # 3D Tensor of size [batch_size x x emb_size]
        emb2 = self.embedding(contexts)    # 3D Tensor of size [batch_size x x  emb_size]
        #print(emb1*emb2)
        return (emb1*emb2).sum(1)

In [4]:
# Initialize the model and the optimizer
model = SKIP_NS(nwords=nwords, emb_size=EMB_SIZE)
if USE_CUDA:
    model = model.cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [5]:
# convert a (nested) list of int into a pytorch Variable
def convert_to_variable(words):
    var = Variable(torch.LongTensor(words))
    if USE_CUDA:
        var = var.cuda()

    return var

# A function to calculate scores for one value
def calc_score_of_histories(words,contexts):
    # This will change from a list of histories, to a pytorch Variable whose data type is LongTensor
    words_var = convert_to_variable(words)
    contexts_var=convert_to_variable(contexts)
    emb = model(words_var,contexts_var)
    return emb

# Calculate the loss value for the entire sentence
def calc_sent_loss(sent):
    # The initial history is equal to end of sentence symbols
    # Step through the sentence, including the end of sentence token
    loss=[]
    all_neg_words = np.random.choice(nwords, size=2*N*K*len(sent), replace=True, p=word_probabilities).tolist()
    all_histories = []
    all_contexts = []
    all_targets = []

    for i in range(0,len(sent)):
        neg_words = all_neg_words[i*K*2*N:(i+1)*K*2*N]
        pos_words = ([sent[x] if x >= 0 else S for x in range(i-N,i)] +
                 [sent[x] if x < len(sent) else S for x in range(i+1,i+N+1)])
        for pos_word in pos_words:
            all_histories.append(sent[i])
            all_contexts.append(pos_word)
            all_targets.append(1)
        for neg_word in neg_words:
            all_histories.append(sent[i])
            all_contexts.append(neg_word)          
            all_targets.append(0)
    all_targets=Variable(torch.FloatTensor(all_targets))
    if USE_CUDA:
        all_targets = all_targets.cuda()    
    predictions=calc_score_of_histories(all_histories,all_contexts)
    loss=nn.functional.binary_cross_entropy_with_logits(predictions, all_targets,
                                                        size_average=False)
    return loss

In [6]:
last_dev = 1e20
best_dev = 1e20

for ITER in range(2):
  # Perform training
    random.shuffle(train)
    # set the model to training mode
    model.train()
    train_words, train_loss = 0, 0.0
    start = time.time()
    for sent_id, sent in enumerate(train):
        if len(sent)<=1:
            continue
        #print(sent)
        my_loss = calc_sent_loss(sent)
        train_loss += my_loss.data[0]
        train_words += len(sent)
        optimizer.zero_grad()
        my_loss.backward()
        optimizer.step()
        if (sent_id+1) % 5000 == 0:
            print("--finished %r sentences (word/sec=%.2f)" % (sent_id+1, train_words/(time.time()-start)))
    print("iter %r: train loss/word=%.4f, ppl=%.4f (word/sec=%.2f)" % (ITER, train_loss/train_words, math.exp(train_loss/train_words), train_words/(time.time()-start)))

    # Evaluate on dev set
    # set the model to evaluation mode
    model.eval()
    dev_words, dev_loss = 0, 0.0
    start = time.time()
    for sent_id, sent in enumerate(dev):
        if len(sent)<=1:
            continue
        my_loss = calc_sent_loss(sent)
        dev_loss += my_loss.data[0]
        dev_words += len(sent)

    # Keep track of the development accuracy and reduce the learning rate if it got worse
    if last_dev < dev_loss:
        optimizer.learning_rate /= 2
    last_dev = dev_loss

    # Keep track of the best development accuracy, and save the model only if it's the best one
    if best_dev > dev_loss:
        print("saving embedding files")
        with open(embeddings_location, 'w') as embeddings_file:
            W_w_np = model.embedding.weight.cpu().data.numpy()
            for i in range(nwords):
                ith_embedding = '\t'.join(map(str, W_w_np[i]))
                embeddings_file.write(ith_embedding + '\n')
        best_dev = dev_loss
        
        # Save the model
    print("iter %r: dev loss/word=%.4f, ppl=%.4f (word/sec=%.2f)" % (ITER, dev_loss/dev_words, math.exp(dev_loss/dev_words), dev_words/(time.time()-start)))


--finished 5000 sentences (word/sec=5091.30)
--finished 10000 sentences (word/sec=4980.98)
--finished 15000 sentences (word/sec=5033.31)
--finished 20000 sentences (word/sec=5165.99)
--finished 25000 sentences (word/sec=5264.72)
--finished 30000 sentences (word/sec=5334.61)
--finished 35000 sentences (word/sec=5367.16)
--finished 40000 sentences (word/sec=5406.64)
iter 0: train loss/word=7.5499, ppl=1900.6468 (word/sec=5421.99)
saving embedding files
iter 0: dev loss/word=1.8474, ppl=6.3434 (word/sec=12475.24)
--finished 5000 sentences (word/sec=5410.35)
--finished 10000 sentences (word/sec=5436.65)
--finished 15000 sentences (word/sec=5427.73)
--finished 20000 sentences (word/sec=5387.49)
--finished 25000 sentences (word/sec=5351.41)
--finished 30000 sentences (word/sec=5370.89)
--finished 35000 sentences (word/sec=5355.06)
--finished 40000 sentences (word/sec=5373.57)
iter 1: train loss/word=1.1102, ppl=3.0351 (word/sec=5374.96)
saving embedding files
iter 1: dev loss/word=1.2780, pp