In [1]:
from collections import defaultdict
import math
import time
import random
import os, sys

import torch
import torch.nn as nn
from torch.autograd import Variable
import numpy as np


from torch.utils.data import Dataset, DataLoader,TensorDataset
torch.manual_seed(1)    # reproducible

<torch._C.Generator at 0x1db070813f0>

In [2]:

N = 2 # The length of the n-gram
EMB_SIZE = 64 # The size of the embedding
BATCH_SIZE=512


embeddings_location = "embeddings.txt" #the file to write the word embeddings to
labels_location = "labels.txt" #the file to write the labels to

USE_CUDA = torch.cuda.is_available()

# Functions to read in the corpus
# NOTE: We are using data from the Penn Treebank, which is already converted
#       into an easy-to-use format with "<unk>" symbols. If we were using other
#       data we would have to do pre-processing and consider how to choose
#       unknown words, etc.
w2i = defaultdict(lambda: len(w2i))
S = w2i["<s>"]
UNK = w2i["<unk>"]
def read_dataset(filename):
    with open(filename, "r") as f:
        for line in f:
            yield [w2i[x] for x in line.strip().split(" ")]

# Read in the data
train = list(read_dataset("../data/ptb/train.txt"))
w2i = defaultdict(lambda: UNK, w2i)
dev = list(read_dataset("../data/ptb/valid.txt"))
i2w = {v: k for k, v in w2i.items()}
nwords = len(w2i)
nbits = len(np.binary_repr(nwords-1))

with open(labels_location, 'w') as labels_file:
    for i in range(nwords):
        labels_file.write(i2w[i] + '\n')


In [3]:

class SKIP_BINARY(nn.Module):
    def __init__(self, nwords, emb_size,nbits):
        super(SKIP_BINARY, self).__init__()
        self.embedding = nn.Embedding(nwords, emb_size)
        self.linear = nn.Linear(emb_size, nbits)

    def forward(self, words):
        #print(words)
        emb = self.embedding(words)    # 2D Tensor of size [batch_size x emb_size]
        feat = emb.view(emb.size(0), -1) # 2D Tensor of size [batch_size x (2 x num_context*emb_size)]
        logit = self.linear(feat)           # 2D Tensor of size [batch_size x nbits]
        #print(logit)
        return logit

In [9]:
# Initialize the model and the optimizer
model = SKIP_BINARY(nwords=nwords, emb_size=EMB_SIZE,nbits=nbits)
if USE_CUDA:
    model = model.cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

In [5]:
# convert a (nested) list of int into a pytorch tensor
def convert_to_tensor(words):
    var = torch.LongTensor(words)
#     if USE_CUDA:
#         var = var.cuda()
    return var

# convert a (nested) list of int into a pytorch Variable
def convert_to_variable(tensor):
    var = Variable(tensor)
    if USE_CUDA:
        var = var.cuda()

    return var

def convert_to_loader(train):
    
    all_histories = []
    all_targets = []
    for sent in train:
        if len(sent)==0:
            continue

        for i in range(0,len(sent)):
            pos_words = ([sent[x] if x >= 0 else S for x in range(i-N,i)] +
                     [sent[x] if x < len(sent) else S for x in range(i+1,i+N+1)])
            word_repr = [[float(y) for y in np.binary_repr(x).zfill(nbits)] for x in pos_words]
            all_histories.extend(pos_words)
            all_targets.extend(word_repr)


    train_dataset = TensorDataset(data_tensor=convert_to_tensor(all_histories).view(-1,1),target_tensor=torch.FloatTensor(all_targets))
    train_loader = DataLoader(
        dataset=train_dataset,      # torch TensorDataset format
        batch_size=BATCH_SIZE,      # mini batch size
        shuffle=True,               
        num_workers=0
    )
    return train_loader

train_loader=convert_to_loader(train)
dev_loader=convert_to_loader(dev)


In [10]:

last_dev = 1e20
best_dev = 1e20

for ITER in range(5):
    # Perform training
    model.train()
    train_words, train_loss = 0, 0.0
    start = time.time()
    for step, (batch_x, batch_y) in enumerate(train_loader):
        
        batch_x=convert_to_variable(batch_x)
        batch_y=convert_to_variable(batch_y.float())
        #print(sent)
        predictions=model(batch_x)
        my_loss = nn.functional.binary_cross_entropy_with_logits(predictions, batch_y,size_average=False)
        train_loss += my_loss.data[0]
        train_words += len(batch_y)
        optimizer.zero_grad()
        my_loss.backward()
        optimizer.step()
    print("iter %r: train loss/word=%.4f, ppl=%.4f (word/sec=%.2f)" % (ITER, train_loss/train_words, math.exp(train_loss/train_words), train_words/(time.time()-start)))

    # Evaluate on dev set
    # set the model to evaluation mode
    model.eval()
    dev_words, dev_loss = 0, 0.0
    start = time.time()
    for step,(batch_x, batch_y) in enumerate(dev_loader):
        batch_x=convert_to_variable(batch_x)
        batch_y=convert_to_variable(batch_y.float())
        predictions=model(batch_x)
        my_loss = nn.functional.binary_cross_entropy_with_logits(predictions, batch_y,size_average=False)
        dev_loss += my_loss.data[0]
        dev_words += len(batch_y)

    # Keep track of the development accuracy and reduce the learning rate if it got worse
    if last_dev < dev_loss:
        for g in optimizer.param_groups:
            g['lr']/=2
        print('lr decay')
    last_dev = dev_loss

    # Keep track of the best development accuracy, and save the model only if it's the best one
    if best_dev > dev_loss:
        print("saving embedding files")
        with open(embeddings_location, 'w') as embeddings_file:
            W_w_np = model.embedding.weight.cpu().data.numpy()
            for i in range(nwords):
                ith_embedding = '\t'.join(map(str, W_w_np[i]))
                embeddings_file.write(ith_embedding + '\n')
        best_dev = dev_loss
        
        # Save the model
    print("iter %r: dev loss/word=%.4f, ppl=%.4f (word/sec=%.2f)" % (ITER, dev_loss/dev_words, math.exp(dev_loss/dev_words), dev_words/(time.time()-start)))



iter 0: train loss/word=1.9138, ppl=6.7785 (word/sec=97757.87)
saving embedding files
iter 0: dev loss/word=0.7790, ppl=2.1793 (word/sec=96056.53)
iter 1: train loss/word=0.2681, ppl=1.3075 (word/sec=105249.64)
saving embedding files
iter 1: dev loss/word=0.2234, ppl=1.2503 (word/sec=84385.74)
iter 2: train loss/word=0.0476, ppl=1.0488 (word/sec=98198.59)
saving embedding files
iter 2: dev loss/word=0.0798, ppl=1.0831 (word/sec=85353.18)
iter 3: train loss/word=0.0073, ppl=1.0074 (word/sec=90068.11)
saving embedding files
iter 3: dev loss/word=0.0249, ppl=1.0253 (word/sec=87540.20)
iter 4: train loss/word=0.0011, ppl=1.0011 (word/sec=92659.46)
saving embedding files
iter 4: dev loss/word=0.0089, ppl=1.0090 (word/sec=102317.87)
