# Learning Word Representations

In [1]:
import numpy as np
import scipy as sc
import matplotlib.pyplot as plt
import string
import io
from collections import defaultdict
from collections import Counter
import itertools
import os
import subprocess
import itertools
import sys
import time
import pickle
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Volumes/Macintosh_HD/Users/Aron/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
import torch
from torch.autograd import Variable
from torch.nn import functional as F
from torch.optim import Adam, SGD
from torch.distributions.multivariate_normal import MultivariateNormal
from torch.distributions.kl import kl_divergence
import torch.optim as optim

print(torch.__version__)
print(torch.cuda.is_available())

0.4.0
False


In [0]:
def read_sentences(file):
    '''Read text in file and tokenize sentences'''
    sentences = []

    stopWords = set(stopwords.words('english'))
    
    with open(file) as f:
        for line in f.readlines():
            tokens = line.split()
            tokens = [token.lower() for token in tokens if token not in stopWords]
            tokens = list(filter(lambda x: x not in string.punctuation, tokens))
            sentences.append(tokens)

    return sentences

def make_window_vectors(sentence, idx, half_window, w2i):
    # Boundaries of winodow
    start_idx = max(idx - half_window, 0)
    end_idx = min(len(sentence), idx + half_window + 1)

    # Target
    target_word = sentence[idx]

    # Context
    context = sentence[start_idx:idx] + sentence[idx+1:end_idx]

    # Wrap data in Variable
    target_tensor = Variable(torch.LongTensor([w2i[target_word]]))
    context_tensor = Variable(torch.LongTensor([w2i[word] for word in context]))
    
    return target_tensor, context_tensor
    

def generate_bsg_data(tokenized_sentences, window_size, w2i):
    
    assert window_size % 2 == 0, "Window size should be an even number"
    half_window = int(window_size / 2)
    
    for sentence in tokenized_sentences:  
                
        # Skip single token sentences as they have no context and break the
        # model
        if len(sentence) == 1: continue
            
        sentence = half_window * ["<s>"] + sentence + half_window * ["</s>"]
        
        for i in range(half_window, len(sentence) - half_window):

            target_tensor, context_tensor = make_window_vectors(sentence, i, half_window, w2i)

            yield target_tensor, context_tensor
            
def get_bsg_batch(generator, batch_size):
    
    targets = []
    contexts = []
    
    for target, context in itertools.islice(generator, batch_size):
        
        targets.append(target)
        contexts.append(context)
        
    targets = torch.stack(targets, 0)
    context = torch.stack(contexts, 0)
            
    return targets, context
            

def create_vocabulary(corpus, n=10000):
    
    all_words = list(itertools.chain(*corpus))
    top_n = set([i[0] for i in Counter(all_words).most_common(n)])
    vocabulary = set(all_words).intersection(top_n)
 

    # Placeholder for unknown words, start en end of senctence
    vocabulary.add("<unk>")
    vocabulary.add("<s>")
    vocabulary.add("</s>")

    word2idx = {}
    idx2word = {}

    for (idx, word) in enumerate(list(vocabulary)):
        word2idx[word] = idx
        idx2word[idx] = word

    # Return the ID for <unk> for new words
    word2idx = defaultdict(lambda: word2idx["<unk>"], word2idx)

    vocabulary_size = len(vocabulary)

    return word2idx, idx2word, vocabulary_size

In [0]:
dev_en = read_sentences(os.path.join('wa', 'dev.en'))
training_en = read_sentences(os.path.join('hansards', 'training.en'))

# Bayesian Skip Gram

In [0]:
class BayesianSkipGram(torch.nn.Module):
    def __init__(self, vocab_size, embeddings_dim):
        super(BayesianSkipGram, self).__init__()
                
        # Map from vocab_size to embedding space
        # These are internal embeddings which we will
        # not inspect for analysis
        self.embed = torch.nn.Embedding(vocab_size, embeddings_dim, max_norm=1.0)
        
        # Linear mapping to mean
        self.linear_mu = torch.nn.Linear(2*embeddings_dim, embeddings_dim)
        
        # Linear mapping to sigma
        self.linear_sigma = torch.nn.Linear(2*embeddings_dim, embeddings_dim)
        
        # Linear mapping from latent space to vocabulary
        self.linear_out = torch.nn.Linear(embeddings_dim, vocab_size)
        
        # Embeddings for priors
        self.mu_prior = torch.nn.Embedding(vocab_size, embeddings_dim, max_norm=1.0)
        self.sigma_prior = torch.nn.Embedding(vocab_size, embeddings_dim, max_norm=1.0)
        
    def reparameterize(self, mu, std):
        # Sample from standard normal
        eps = torch.randn_like(std)
        
        # Multiply by scale, add location
        z = eps.mul(std.data).add(mu.data)
        
        return z
    
        
    def encode(self, target, context):
        
        batch_size = target.size(0)
        
        # Lookup internal embeddings
        target_emb = self.embed(target)
        context_emb = self.embed(context)
                
        # Repeat target to number of context words
        window_size = context.size(1)
        
        target_emb = target_emb.repeat(1, window_size, 1).view(batch_size, window_size, -1)
        
        # Concatenate and sum context
        combined = torch.cat((target_emb, context_emb), 2)
        combined = F.relu(combined)
        summed = torch.sum(combined, 1)
                                
        # Estimate mu and sigma transformations
        mu = self.linear_mu(summed)
        sigma = torch.sqrt(F.softplus(self.linear_sigma(summed)))
        
        return mu, sigma
        
    def decode(self, z):
        return F.log_softmax(self.linear_out(z), 1)
     
    def forward(self, target, context):
        mu, sigma = self.encode(target, context)
        z = Variable(self.reparameterize(mu, sigma), requires_grad = True)
        return self.decode(z), mu, sigma

In [0]:
def kl_loss(mu_prior, sigma_prior, mu_data, sigma_data):
    term1 = torch.log(sigma_prior / sigma_data)
    term2 = (sigma_data.pow(2) + (mu_data - mu_prior).pow(2)) / (2 * sigma_prior.pow(2))
    
    kl = torch.sum(term1 + term2, 1) - 0.5
    
    return kl

def bsg_score(prediction, context, mu_prior, sigma_prior, mu_data, sigma_data):
    
    # Take probablities from prediction matrix (reach row is one member of batch)
    probs = Variable(torch.empty_like(context).float())
    for i in range(prediction.size(0)):
        probs[i, :] = prediction[i, context[i]]
                
    # Calculate the loss for each member of the batch
    batch_loss = torch.sum(probs, 1) - kl_loss(mu_prior, sigma_prior, mu_data, sigma_data)
           
    # Average the loss
    return torch.mean(batch_loss).squeeze()

In [0]:
def init_model_dir(model_name, lr, embedding_dim, window_size, batch_size, vocab_size):
    label = "{}_lr-{}_emb-{}_window-{}_batch-{}_vocab-{}".format(model_name, lr, embedding_dim, window_size, batch_size, vocab_size)
    
    # Create directory to save model and save each epoch
    if not os.path.exists(label):
        os.makedirs(label)
        print("First time running hyperparamter settings, created directory.")
            
    return label

def train(n_epochs, batch_size, embedding_dim, corpus, vocab_size, test_corpus, window_size, lr, label):
    w2i, i2w, vocab_size = create_vocabulary(corpus, n=vocab_size)
    model = BayesianSkipGram(vocab_size, embedding_dim)
    obj_func = bsg_score
    optimizer = Adam(model.parameters(), lr)
    
    print(vocab_size)
    
    # Create a directory to save model data (epochs and w2i mapping)
    model_dir = init_model_dir(label, lr, embedding_dim, window_size, batch_size, vocab_size)
    
    try:
        with open(os.path.join(model_dir, sorted(os.listdir(model_dir))[-1]), 'rb') as f:
            w2i = pickle.load(f)
            w2i = defaultdict(lambda: w2i['<unk>'], w2i)
            
        epoch_file_name = sorted(os.listdir(model_dir))[-2]
        epoch_num = epoch_file_name.split('.')[0][3:]
        with open(os.path.join(model_dir, epoch_file_name), 'rb') as f:       
            model = pickle.load(f)
            
        print("Continuing from epoch {}".format(epoch_num))
    except Exception as e:
        print("No trained model found, starting from epoch 0")
        print(e)
        epoch_num = 0
    
    # Save corresponding w2i dict. The ordering can be different every time
    # because vocabulary is created using sets. This way the model and ordering
    # are always paired.
    with open(os.path.join(model_dir, 'w2i.pickle'), 'wb') as f:
        pickle.dump(dict(w2i), f)
    
    for epoch in range(n_epochs):
        if epoch < int(epoch_num): continue
            
        epoch_loss = 0
        examples = 0
        start = time.time()
        
        data = generate_bsg_data(corpus, window_size, w2i)
        
        while True:
            try:
                targets, contexts = get_bsg_batch(data, batch_size)
            except:
                print("Reached end of data")
                break
                
            examples += 1

            # Forward pass
            pred, mu, sigma = model(targets, contexts)
            
            # Calculate loss
            s_prior = F.softplus(model.sigma_prior(targets).squeeze())
            m_prior = model.mu_prior(targets).squeeze()
            
            loss = -obj_func(pred, contexts, m_prior, s_prior, mu, sigma)
            epoch_loss += loss

            # Do SGD step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
                        
            if examples % 1000 == 0:
                sys.stdout.write("\r{:.2f} target number {}".format(time.time() - start, examples * batch_size))
                print()
                            
        with open(os.path.join(model_dir, 'ep-{}.pickle'.format(epoch)), 'wb') as f:
            pickle.dump(model.cpu(), f)

        print("{:.2f} sec: EPOCH {}/{} \t\t LOSS".format(time.time() - start, epoch, n_epochs),
              epoch_loss.item() / examples)
        
        if test_corpus:
            validate_bsg(model, test_corpus, window_size, w2i)
        
    
    return model_dir

def validate_bsg(trained_model, corpus, window_size, w2i):
    
    obj_func = bsg_score
    validation_loss = 0
    examples = 0
    
    data = generate_bsg_data(corpus, window_size, w2i)
    
    trained_model = trained_model.cpu()

    while True:
        try:
            targets, contexts = get_bsg_batch(data, 1)
        except:
            print("Reached end of data")
            break
        
        examples += 1
        # Forward pass
        
        pred, mu, sigma = trained_model(targets, contexts)
        
        # Calculate loss
        s_prior = F.softplus(trained_model.sigma_prior(targets).squeeze())
        m_prior = trained_model.mu_prior(targets).squeeze()
        loss = -obj_func(pred, contexts, m_prior, s_prior, mu, sigma)
        validation_loss += loss
        
        
    print("v \t\t\t\t VALIDATION LOSS {}".format(validation_loss.item() / examples))

In [70]:
dev_en_dir = train(n_epochs=100, 
                   batch_size=20, 
                   embedding_dim=100, 
                   corpus=dev_en,
                   vocab_size=10000,
                   test_corpus=None, 
                   window_size=6, 
                   lr=1e-3, 
                   label="relu-bsg-dev")

251
First time running hyperparamter settings, created directory.
No trained model found, starting from epoch 0
list index out of range
Reached end of data
0.23 sec: EPOCH 0/100 		 LOSS 89.73727213541666
Reached end of data
0.52 sec: EPOCH 1/100 		 LOSS 86.76612955729166
Reached end of data
0.25 sec: EPOCH 2/100 		 LOSS 85.81471354166666
Reached end of data
0.24 sec: EPOCH 3/100 		 LOSS 85.25535481770834
Reached end of data
0.22 sec: EPOCH 4/100 		 LOSS 84.82249348958334
Reached end of data
0.24 sec: EPOCH 5/100 		 LOSS 84.53392740885417
Reached end of data
0.22 sec: EPOCH 6/100 		 LOSS 84.17923177083334
Reached end of data
0.20 sec: EPOCH 7/100 		 LOSS 83.95336100260417
Reached end of data
0.21 sec: EPOCH 8/100 		 LOSS 83.773291015625
Reached end of data
0.24 sec: EPOCH 9/100 		 LOSS 83.63318684895833
Reached end of data
0.22 sec: EPOCH 10/100 		 LOSS 83.52569986979167
Reached end of data
0.22 sec: EPOCH 11/100 		 LOSS 83.41309407552083
Reached end of data
0.20 sec: EPOCH 12/100 		 LO

Reached end of data
0.22 sec: EPOCH 52/100 		 LOSS 81.50603841145833
Reached end of data
0.22 sec: EPOCH 53/100 		 LOSS 81.3439453125
Reached end of data
0.24 sec: EPOCH 54/100 		 LOSS 81.40604654947917
Reached end of data
0.25 sec: EPOCH 55/100 		 LOSS 81.309765625
Reached end of data
0.23 sec: EPOCH 56/100 		 LOSS 81.01993001302084
Reached end of data
0.23 sec: EPOCH 57/100 		 LOSS 81.306103515625
Reached end of data
0.24 sec: EPOCH 58/100 		 LOSS 81.25594075520833
Reached end of data
0.25 sec: EPOCH 59/100 		 LOSS 81.10419108072917
Reached end of data
0.23 sec: EPOCH 60/100 		 LOSS 81.09396158854166
Reached end of data
0.24 sec: EPOCH 61/100 		 LOSS 80.94571126302084
Reached end of data
0.24 sec: EPOCH 62/100 		 LOSS 81.17535807291667
Reached end of data
0.25 sec: EPOCH 63/100 		 LOSS 81.024951171875
Reached end of data
0.24 sec: EPOCH 64/100 		 LOSS 81.138330078125
Reached end of data
0.25 sec: EPOCH 65/100 		 LOSS 80.9467529296875
Reached end of data
0.22 sec: EPOCH 66/100 		 LOSS

# Evaluation

In [0]:
def get_candidates(fileName='lst/lst.gold.candidates'):
    candidates = {}
    
    with open(fileName, 'r') as f:
        for line in f.readlines():
            target, options = line.split('::')
            candidates[target] = [x.strip() for x in options.split(';')]
            
    
    return candidates

def get_test_sentences(fileName='lst/lst_test.preprocessed'):
    
    with open (fileName,'r') as testFile:
        lines = testFile.readlines()
    
    return [line.split() for line in lines]    

In [0]:
def bsg_embedding(model, target, context):
    '''model: trained BayesianSkipGram
       target: 1x1 LongTensor
       context: (S-1)x1 LongTensor'''
    
    mu, sigma = model.encode(target, context)
    qz = MultivariateNormal(mu, torch.diag(sigma.squeeze()))
       
    return qz
    
def evaluate_bsg(model, targ_idx, candidates, context=None, w2i=None):
    
    scored_candidates = []
    
    candidate_lookup = torch.LongTensor([w2i[x] for x in candidates]).unsqueeze(0)
    target, context = make_window_vectors(context, targ_idx, 1, w2i)
    target = target.unsqueeze(0)
    context = context.unsqueeze(0)
    
    qz_target = bsg_embedding(model, target, context)
    
    for i in range(len(candidates)):
        
        qz_cand = bsg_embedding(model, candidate_lookup[0, i].unsqueeze(0), context)
        
        score = kl_divergence(qz_target, qz_cand)
        scored_candidates.append((candidates[i], score.data[0].item()))
        
    scored_candidates = sorted(scored_candidates, key = lambda x: x[1])
    return scored_candidates
        
def run_lst(model, w2i, eval_func):
    
    # Load evaluation data
    sentences = get_test_sentences('lst/lst_test.preprocessed')
    all_candidates = get_candidates('lst/lst.gold.candidates')
    
    result = []
    
    for sentence in sentences:        
        # Encode the target withouth the POS-tag
        target = sentence[0]
        target_stripped = target[:-2]
        
        # Extract id's
        sent_id = sentence[1]
        targ_idx = int(sentence[2])
        
        # Remove target word from sentence to obtain context
        context = sentence[3:].copy()
        
        scores = eval_func(model, targ_idx, all_candidates[target], context, w2i)
        result.append(format_ranking_string(target, sent_id, scores))
    
    # save output to file
    with open(os.path.join('lst', 'lst.out'), 'w') as f:
        f.writelines(result)
        
    # call evaluation scripts
    subprocess.Popen('python lst/lst_gap.py lst/lst_test.gold lst/lst.out out no-mwe'.split())
        
def format_ranking_string(target, sentence_id, scores):
        
    base = "RANKED\t{} {}".format(target, sentence_id)
    candidates = "".join("\t{} {}".format(x[0], x[1]) for x in scores)
    
    return base + candidates + "\n"
        
def load_model(model_dir, model_file):
    '''model_dir: path of DIRECTORY were model is stored,
       model_file: filename of epoch you want to load'''
    
    with open(os.path.join(model_dir, 'w2i.pickle'), 'rb') as f:
        w2i = pickle.load(f)
        w2i = defaultdict(lambda: w2i['<unk>'], w2i)
    
    with open(os.path.join(model_dir, model_file), 'rb') as f:
        model = pickle.load(f)
        
    return model, w2i

In [0]:
# RUN LST ON TRAINED BSG
model_dir = os.path.join('bsg-dev_lr-0.001_emb-100_window-6_batch-20_vocab-251')
trained_model, w2i = load_model(model_dir, 'ep-99.pickle')
i2w = {i : w for (w, i) in w2i.items()}
run_lst(trained_model, w2i, evaluate_bsg)

In [72]:
'''Rank the (context agnositc) embeddings of the trained model on proximity to
the first entry'''

street = torch.LongTensor([0])
embs = trained_model.mu_prior.weight
sims = []

for i, emb in enumerate(embs):
    word = torch.LongTensor([w2i[i]])
    sim = F.cosine_similarity(embs[street], emb.unsqueeze(0))
    sims.append((i2w[i], sim.item()))

[('little', 1.0), ('feeling', 0.30543777346611023), ('mechanism', 0.2735344469547272), ('simply', 0.2567277252674103), ('work', 0.2533694803714752), ('equity', 0.24713867902755737), ('ought', 0.23902331292629242), ('link', 0.22124700248241425), ('fee', 0.22064056992530823), ('particular', 0.2205219566822052), ('wanted', 0.21807803213596344), ('around', 0.21661607921123505), ('conclude', 0.21308422088623047), ('unlawful', 0.19297336041927338), ('look', 0.19210302829742432), ('i', 0.19145900011062622), ('prison', 0.18539828062057495), ('28', 0.18450623750686646), ('understandably', 0.17495976388454437), ('became', 0.17223027348518372), ('purposes', 0.17209811508655548), ('speaker', 0.17059090733528137), ('hard', 0.16529786586761475), ('effective', 0.16327127814292908), ('responsible', 0.16303321719169617), ('cdc', 0.16204969584941864), ('services', 0.16060471534729004), ('within', 0.15602844953536987), ('creation', 0.1512129306793213), ('minister', 0.15026213228702545), ('something', 0.1