# Learning Word Representations

In [2]:
import numpy as np
import scipy as sc
import matplotlib.pyplot as plt
import string
import io
from collections import defaultdict
from collections import Counter
import itertools
import os
import subprocess
import itertools
import sys
import cProfile

try:
    from google.colab import files
except:
    print("Google not needed on local runtime")
    

import time
import pickle
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

Google not needed on local runtime
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
import torch
from torch.autograd import Variable
from torch.nn import functional as F
from torch.optim import Adam, SGD
from torch.distributions.multivariate_normal import MultivariateNormal
from torch.distributions.kl import kl_divergence
import torch.optim as optim

print(torch.__version__)
print(torch.cuda.is_available())

0.4.0
False


In [4]:
def read_sentences(file):
    '''Read text in file and tokenize sentences'''
    sentences = []

    stopWords = set(stopwords.words('english'))
    
    with open(file,encoding='utf8') as f:
        for line in f.readlines():
            tokens = line.split()
            tokens = [token.lower() for token in tokens if token not in stopWords]
            tokens = list(filter(lambda x: x not in string.punctuation, tokens))
            sentences.append(tokens)

    return sentences

def generate_emba_data(lang_1, lang_2, w2i_1, w2i_2):
    
    for i in range(len(lang_1)):
        sentence1 = lang_1[i]
        sentence2 = lang_2[i]
        
        # Skip if any of the sentences has length 0
        if min(len(sentence1), len(sentence2)) == 0 or len(sentence1) == 1:
            continue
        
        
        sentence1_tensor = torch.LongTensor([w2i_1[x] for x in sentence1])
        sentence2_tensor = torch.LongTensor([w2i_2[x] for x in sentence2])
        
        yield sentence1_tensor, sentence2_tensor

def create_vocabulary(corpus, n=10000):
    
    all_words = list(itertools.chain(*corpus))
    top_n = set([i[0] for i in Counter(all_words).most_common(n)])
    vocabulary = set(all_words).intersection(top_n)
 

    # Placeholder for unknown words, start en end of senctence
    vocabulary.add("<unk>")
    vocabulary.add("<s>")
    vocabulary.add("</s>")

    word2idx = {}
    idx2word = {}

    for (idx, word) in enumerate(list(vocabulary)):
        word2idx[word] = idx
        idx2word[idx] = word

    # Return the ID for <unk> for new words
    word2idx = defaultdict(lambda: word2idx["<unk>"], word2idx)

    vocabulary_size = len(vocabulary)

    return word2idx, idx2word, vocabulary_size

In [5]:
dev_en = read_sentences('wa/dev.en')
dev_fr = read_sentences('wa/dev.fr')
test_en = read_sentences('wa/test.en')
test_fr = read_sentences('wa/test.fr')
training_en = read_sentences('hansards/training.en')
training_fr = read_sentences('hansards/training.fr')

# Embed Align

In [6]:
class EmbedAlign(torch.nn.Module):
    
    def __init__(self, vocab_1, vocab_2, embeddings_dim):
        super(EmbedAlign, self).__init__()
        
        # Encoder paramters
        self.embed = torch.nn.Embedding(vocab_1, embeddings_dim)
        self.linear_mu_1 = torch.nn.Linear(2*embeddings_dim, embeddings_dim)
        self.linear_mu_2 = torch.nn.Linear(embeddings_dim, embeddings_dim)
        self.linear_sig_1 = torch.nn.Linear(2*embeddings_dim, embeddings_dim)
        
        # Decode l1 language
        self.linear_l1_1 = torch.nn.Linear(embeddings_dim, vocab_1)
        
        # Decode l2 language
        self.linear_l2_1 = torch.nn.Linear(embeddings_dim, vocab_2)
        
    def reparameterize(self, mu, std):
        # Sample from standard normal
        eps = torch.randn_like(std)
        
        # Multiply by scale, add location
        z = eps.mul(std.data).add(mu.data)
        
        return z
    
    def encode(self, sentence):
        m = sentence.size(0)
        
        h_mu = self.embed(sentence)
        
        # Encode context by averaging over neighbours
        context = []
        for i in range(m):
            other_idx = list(range(m))
            other_idx.remove(i)
            idx = torch.LongTensor(other_idx)
            others = torch.index_select(h_mu, 0, idx)
            avg = torch.mean(others, 0)
            context.append(avg)
            
        # Concatenate target and context
        context = torch.stack(context, 0)
        h_mu = torch.cat((h_mu, context), 1)
        
        # Create a copy for mu and sigma MLPs
        h_sig = torch.empty_like(h_mu)
        h_sig.copy_(h_mu)
        
        # calculate mu
        mu = self.linear_mu_1(h_mu)
        mu = F.relu(mu)
                
        # calculate sigma
        sig = self.linear_sig_1(h_sig)
        sig = F.softplus(sig)
        
        return mu, sig
    
    def decode(self, l1_z):
        l2_z = torch.empty_like(l1_z)
        l2_z.copy_(l1_z)
        
        # l1 lang
        l1_probs = self.linear_l1_1(l1_z)
        l1_probs = F.relu(l1_probs)
        l1_probs = F.log_softmax(l1_probs, dim=1)
        
        # l2 lang
        l2_probs = self.linear_l2_1(l2_z)
        l2_probs = F.relu(l2_probs)
        l2_probs = F.log_softmax(l2_probs, dim=1)
        
        return l1_probs, l2_probs
    
    def forward(self, sentence):
        mu, sigma = self.encode(sentence)
        z = self.reparameterize(mu, sigma)
        l1_out, l2_out = self.decode(z)
        return l1_out, l2_out, mu, sigma

In [7]:
def embalign_loss(l1_sent, l2_sent, l1_probs, l2_probs, mu, sigma):
    
    m = l1_sent.size(0)
    n = l2_sent.size(0)
    
    l1_loss = F.nll_loss(l1_probs, l1_sent)
    
    l2_loss = 0
    for j in range(n):
        l2_loss += F.nll_loss(l2_probs, l2_sent[j].repeat(m)) / m

    # Add epsilon to prevent nan values in log
    sigma = sigma + 1e-6
    kl_loss = torch.sum(-torch.log(sigma) + (sigma.pow(2) + mu.pow(2)) / 2 - 0.5, 1)
        
    return l1_loss + l2_loss + torch.sum(kl_loss, 0)

In [8]:
def init_model_dir(model_name, lr, embedding_dim):
    label = "{}_lr-{}_emb-{}".format(model_name, lr, embedding_dim)
    
    # Create directory to save model and save each epoch
    if not os.path.exists(label):
        os.makedirs(label)
        print("First time running hyperparamter settings, created directory.")
            
    return label

def train(n_epochs, embedding_dim, l1_corpus, l2_corpus, vocab_size, lr, label):
    l1_w2i, _, l1_vocab = create_vocabulary(l1_corpus, n=vocab_size)
    l2_w2i, _, l2_vocab = create_vocabulary(l2_corpus, n=vocab_size)
    model = EmbedAlign(l1_vocab, l2_vocab, embedding_dim)
    optimizer = Adam(model.parameters(), lr)
        
    # Create a directory to save model data (epochs and w2i mapping)
    model_dir = init_model_dir(label, lr, embedding_dim)
    
    try:
        with open(os.path.join(model_dir, 'l1_w2i.pickle'), 'rb') as f:
            l1_w2i = pickle.load(f)
            l1_w2i = defaultdict(lambda: l1_w2i['<unk>'], l1_w2i)
        
        with open(os.path.join(model_dir, 'l2_w2i.pickle'), 'rb') as f:
            l2_w2i = pickle.load(f)
            l2_w2i = defaultdict(lambda: l1_w2i['<unk>'], l1_w2i)
            
        epoch_file_name = sorted(os.listdir(model_dir))[-3]
        epoch_num = epoch_file_name.split('.')[0][3:]
        
        with open(os.path.join(model_dir, epoch_file_name), 'rb') as f:       
            model = pickle.load(f)
            
        print("Continuing from epoch {}".format(epoch_num))
    except Exception as e:
        print("No trained model found, starting from epoch 0")
        print(e)
        epoch_num = 0
    
    # Save corresponding w2i dicts. The ordering can be different every time
    # because vocabulary is created using sets. This way the model and ordering
    # are always paired.
    with open(os.path.join(model_dir, 'l1_w2i.pickle'), 'wb') as f:
        pickle.dump(dict(l1_w2i), f)
    with open(os.path.join(model_dir, 'l2_w2i.pickle'), 'wb') as f:
        pickle.dump(dict(l2_w2i), f)
    
    for epoch in range(n_epochs):
        if epoch < int(epoch_num): continue
            
        epoch_loss = 0
        examples = 0
        start = time.time()
        
        data = generate_emba_data(l1_corpus, l2_corpus, l1_w2i, l2_w2i)
        
        for s1, s2 in data:
  
            examples += 1
            
            # Forward pass
            l1_probs, l2_probs, mu, sigma = model(s1)
            
            # Calculate loss
            loss = embalign_loss(s1, s2, l1_probs, l2_probs, mu, sigma)
            epoch_loss += loss

            # Do SGD step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
                        
            if examples % 1000 == 0:
                sys.stdout.write("\r{:.2f} target number {}".format(time.time() - start, examples))
                print()
                            
        with open(os.path.join(model_dir, 'ep-{}.pickle'.format(epoch)), 'wb') as f:
            pickle.dump(model, f)

        print("{:.2f} sec: EPOCH {}/{} \t\t LOSS".format(time.time() - start, epoch, n_epochs),
              epoch_loss.item() / examples)
        
    
    return model_dir

In [None]:
trained_model_dir = train(n_epochs=3, 
                          embedding_dim=100, 
                          l1_corpus=training_en, 
                          l2_corpus=training_fr, 
                          vocab_size=10000, 
                          lr=1e-3,
                          label="embalign-training")

Continuing from epoch 0
22.82 target number 1000
44.23 target number 2000
64.81 target number 3000
86.27 target number 4000
107.05 target number 5000
127.49 target number 6000
148.43 target number 7000
168.77 target number 8000
189.99 target number 9000
212.22 target number 10000
233.75 target number 11000
255.55 target number 12000
278.34 target number 13000
301.80 target number 14000
324.79 target number 15000
348.53 target number 16000
369.33 target number 17000
391.50 target number 18000
412.90 target number 19000
434.66 target number 20000
458.94 target number 21000
480.61 target number 22000
504.03 target number 23000
528.08 target number 24000
552.18 target number 25000
578.21 target number 26000
603.00 target number 27000
629.13 target number 28000
655.20 target number 29000
682.24 target number 30000
712.17 target number 31000
740.95 target number 32000
770.48 target number 33000
799.86 target number 34000
830.75 target number 35000
861.32 target number 36000
892.76 target num

# Evaluation

In [0]:
def get_candidates(fileName='lst/lst.gold.candidates'):
    candidates = {}
    
    with open(fileName, 'r') as f:
        for line in f.readlines():
            target, options = line.split('::')
            candidates[target] = [x.strip() for x in options.split(';')]
            
    
    return candidates

def get_test_sentences(fileName='lst/lst_test.preprocessed'):
    
    with open (fileName,'r') as testFile:
        lines = testFile.readlines()
    
    return [line.split() for line in lines]    

In [0]:
def run_lst(model, w2i, eval_func):
    
    # Load evaluation data
    sentences = get_test_sentences('lst/lst_test.preprocessed')
    all_candidates = get_candidates('lst/lst.gold.candidates')
    
    result = []
    
    for sentence in sentences:        
        # Encode the target withouth the POS-tag
        target = sentence[0]
        target_stripped = target[:-2]
        
        # Extract id's
        sent_id = sentence[1]
        targ_idx = int(sentence[2])
        
        # Remove target word from sentence to obtain context
        context = sentence[3:].copy()
        
        scores = eval_func(model, targ_idx, all_candidates[target], context, w2i)
        result.append(format_ranking_string(target, sent_id, scores))
    
    # save output to file
    with open(os.path.join('lst', 'lst.out'), 'w') as f:
        f.writelines(result)
        
    # call evaluation scripts
    subprocess.Popen('python lst/lst_gap.py lst/lst_test.gold lst/lst.out out no-mwe'.split())
        
def format_ranking_string(target, sentence_id, scores):
        
    base = "RANKED\t{} {}".format(target, sentence_id)
    candidates = "".join("\t{} {}".format(x[0], x[1]) for x in scores)
    
    return base + candidates + "\n"
        
def load_model(model_dir, model_file):
    '''model_dir: path of DIRECTORY were model is stored,
       model_file: filename of epoch you want to load'''
    
    with open(os.path.join(model_dir, 'w2i.pickle'), 'rb') as f:
        w2i = pickle.load(f)
        w2i = defaultdict(lambda: w2i['<unk>'], w2i)
    
    with open(os.path.join(model_dir, model_file), 'rb') as f:
        model = pickle.load(f)
        
    return model, w2i

In [2]:
def evaluate_embalign(model, targ_idx, candidates, context, w2i):
    # Function to evaluate the Embed Align model
    # INPUT
    # - model, the model which is alreadly loaded from a .pickle file
    # - targ_idx, an integer corresponding to the idx of the target word
    # - candidates, a list of suitable candidates
    # - context, the entire sentence
    # - w2i, word2index dict
    #
    # OUTPUT
    # - scores, a list containing tuples (candidate_word,score)
    
    scores = []
    
    # Convert sentence to idx and LongTensor
    idx_sent = [w2i[word] for word in context]
    idx_sent = torch.LongTensor(idx_sent)
    
    # Run a forward pass
    _, _, mu_targ, sigma_targ = model.forward(idx_sent)
        
    # Retrieve mu[target_idx] and sigma[target_idx] 
    mu_target_idx = mu_targ[targ_idx,:]
    sigma_target_idx = sigma_targ[targ_idx,:]
    
    # Retrieve the target distribution
    target_dist = MultivariateNormal(mu_target_idx,torch.diag(sigma_target_idx))
    
    for c_word in candidates:
        
        # Create the new sentece, and replace a word
        new_sent = copy.copy(context)
        new_sent[targ_idx] = c_word
        
        # Convert to idx and LongTensor
        idx_new_sent = [w2i[word] for word in new_sent]
        idx_new_sent = torch.LongTensor(idx_new_sent)
        
        # Run a forward pass
        _, _, mu_cand, sigma_cand = model.forward(idx_new_sent)
        
        # Retrieve mu[target_idx] and sigma[target_idx] 
        mu_cand_idx = mu_cand[targ_idx,:]
        sigma_cand_idx = sigma_cand[targ_idx,:]
        
        cand_dist = MultivariateNormal(mu_cand_idx,torch.diag(sigma_cand_idx))
    
        # Calculate kl_divergence and convert to float
        kl_div = float(kl_divergence(target_dist,cand_dist))
        
        scores.append((c_word,kl_div))
        
    return scores

In [0]:
# RUN LST ON TRAINED MODEL
model_dir = os.path.join('dev_en_lr-0.01_emb-100_window-6_batch-50_vocab-251')
trained_model, w2i = load_model(model_dir, 'ep-9.pickle')
run_lst(trained_model, w2i, evaluate_embalign)

In [0]:
bank = torch.LongTensor([w2i['street']])
embs = trained_model.mu_prior.weight
sims = []
for i in itertools.chain(*dev_en):
    word = torch.LongTensor([w2i[i]])
    sim = F.cosine_similarity(embs[bank], embs[word])
    sims.append((i, sim.item()))

unk = "<unk>"
word = torch.LongTensor([w2i[unk]])
sim = F.cosine_similarity(embs[bank], embs[word])
sims.append((unk, sim.item()))
print(sorted(sims, key=lambda x: x[1], reverse=True))

## AER

In [1]:
def predict_alignment(model, idx, sentence1, sentence2):
    '''Sentence1: LongTensor corresponding to sentence in l1 language
       sentence2: LongTensor corresponding to sentence in l2 language
       model: EmbedAlign instance to evaluate
       
       return: list of alignments [(sent_idx, w1_idx, w2_idx, prob)]'''
    
    alignment = []
    
    # Do forward pass and take (arg)max
    _, l2_probs, _, _ = model(sentence1)
    
    # make a list of the results
    for l2_idx, lookup in enumerate(sentence2):
        l2_word_probs = l2_probs[:, lookup.item()]
        log_prob, l1_idx = l2_word_probs.max(0)
        prob = torch.exp(log_prob)
        
        # correct to 1-indexation
        l1_idx = l1_idx.item() + 1
        l2_idx += 1
        alignment_data = [idx, l1_idx, l2_idx, prob.item()]
        alignment_data = [str(x) for x in alignment_data]
        alignment_data += ['\n']
        
        alignment.append(' '.join(alignment_data))
        
    return alignment

def save_alignments(model, data):
    
    alignments = []
    i = 1
    for s1, s2 in data:
        
        # Get all alignments for these sentences
        sentence_alignment = predict_alignment(model, i, s1, s2)
        alignments += sentence_alignment
        
        i += 1
   
    # write list to file
    with open('aer_out.txt', 'w') as f:
        f.writelines(alignments)
        

test_data = generate_emba_data(test_en, test_fr, w2i_l1, w2i_l2)
save_alignments(trained_model, test_data)
subprocess.Popen('python wa/aer.py wa/test.naacl wa/aer_out.txt'.split())

NameError: name 'generate_emba_data' is not defined

In [0]:
w2i_en['poep']

In [0]:
# test_model = BayesianSkipGram(10, 3)
torch.max(trained_model.embed.weight)