In [106]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from random import randint
import matplotlib
from collections import defaultdict

import numpy as np
import torch

import nltk
nltk.download('punkt')

import pickle
import dill
import pprint

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Load InferSent

In [2]:
import torch
# Load the pickled InferSent model. The map_location callable returns every
# storage unchanged, which keeps all tensors on the CPU even when the
# checkpoint was saved from a GPU.
# (Throughput quoted by the original notes: ~1000 sentences/s on GPU, ~40 sentences/s on CPU.)
# NOTE(review): torch.load unpickles the file and can therefore execute arbitrary
# code; only load checkpoints that come from a trusted source.
infersent = torch.load('infersent.allnli.pickle', map_location=lambda storage, loc: storage)
# Point the model at the GloVe vectors and build its vocabulary.
# K is far larger than the reported vocab size (2196016), so presumably the
# whole GloVe file ends up in the vocabulary — TODO confirm.
glove_path = 'SentEval/pretrained/glove.840B.300d.txt'
infersent.set_glove_path(glove_path)
infersent.build_vocab_k_words(K=5000000)



Vocab size : 2196016


In [60]:
# Load the unigram table that sif_weighted_batcher indexes by word
# (presumably word -> unigram probability estimated on Europarl; verify against
# the script that produced the pickle).
# NOTE(review): dill.load, like pickle, can execute arbitrary code; only open trusted files.
with open('europarl_unigram.pickle', 'rb') as f:
    unigram = dill.load(f)

In [None]:
# Quick manual check of InferSent's visualize() on one example sentence.
# Only the third return value is kept; infersent_batcher uses that same value
# as the per-word weights.
my_sent = 'Mark rutte is the president of the netherlands'
_, _,y = infersent.visualize(my_sent,tokenize=True,visualize=True)

In [40]:
# Text file with the skip-gram vectors: one word per line followed by its
# space-separated vector components (the format load_skipgram parses).
skipgram_path = 'SkipGram/SG/skipgram.300d.txt'

def load_skipgram(path, encoding=None):
    '''Load word vectors from a text file into a word -> vector lookup.

    Every line must contain a word, a single space, and then the
    space-separated components of its vector.

    path:     location of the vector file.
    encoding: text encoding passed to open(); None keeps the platform default.
              NOTE(review): pass it explicitly (e.g. 'utf-8') if the file
              contains non-ASCII tokens and is read on another platform.

    Returns a defaultdict mapping each word to a 1-D numpy array. Looking up a
    word that is not in the file yields the vector stored under 'UNK'; because
    the result is a defaultdict, that lookup also stores the unknown word.

    Raises KeyError on lookup of an unknown word when the file has no 'UNK'
    entry.'''
    vectors = {}

    with open(path, 'r', encoding=encoding) as f:
        for line in f:
            word, vec = line.split(' ', 1)
            vectors[word] = np.fromstring(vec, sep=' ')

    # Resolve the fallback vector once, up front. The default factory must not
    # look 'UNK' up in the defaultdict itself: if 'UNK' were absent, that
    # lookup would call the factory again and recurse without end.
    unk_vec = vectors.get('UNK')

    def unknown_word_vector():
        if unk_vec is None:
            raise KeyError('UNK')
        return unk_vec

    return defaultdict(unknown_word_vector, vectors)

# Module-level word -> vector lookup read by generic_batcher.
skipgram_word_vec = load_skipgram(skipgram_path)

In [101]:
def generic_batcher(params, batch, func):
    '''Implements the main loop of turning a batch of sentences into sentence
    embeddings built from the skip-gram word vectors.

         params: object handed over by SentEval; not read here, it is only part
         of the batcher signature.

         batch: list of sentences, each a list of words.

         _func_: should be a function that takes a [1, sentence_length, embeddings_dim] matrix
         and a list with the (lower-cased) words in the sentence as arguments: func(z_batch, sentence).
         This function should return a [1, embeddings_dim] matrix with the sentence representation.

         Returns a [batch_size, embeddings_dim] matrix, one row per sentence.'''
    # An empty sentence is replaced by a single '.' token
    batch = [sent if sent != [] else ['.'] for sent in batch]
    embeddings = []
    for sent in batch:

        # Keep only words known to InferSent, lower-cased for the skip-gram lookup.
        # NOTE(review): membership is tested on the original casing, yet the
        # lower-cased token is what gets kept — confirm this is intended.
        sent = [word.lower() for word in sent if word in infersent.word_vec]

        if len(sent) == 0:
            # Nothing survived the filter: represent the sentence by the UNK vector
            sent_vec = skipgram_word_vec['UNK']
        else:
            # Stack the word vectors; the embedding size follows the loaded
            # vectors instead of being fixed to 300.
            z_batch1 = np.array([skipgram_word_vec[word] for word in sent], dtype=float)

            # [1, sentence_length, embeddings_dim]
            z_batch1 = np.expand_dims(z_batch1, 0)

            # Sentence embedding is a function of the words in a sentence
            # [1, embeddings_dim]
            sent_vec = func(z_batch1, sent)

        # Replace NaNs so a single bad sentence cannot poison the classifier input
        if np.isnan(sent_vec.sum()):
            sent_vec = np.nan_to_num(sent_vec)

        embeddings.append(sent_vec)
    embeddings = np.vstack(embeddings)
    return embeddings

def simple_mean_batcher(params, batch):
    '''Batcher whose sentence embedding is the unweighted mean of the word vectors.'''

    def average_words(z_batch1, _sent):
        # Mean over the word axis: [1, sentence_length, dim] -> [1, dim]
        return np.mean(z_batch1, axis=1)

    return generic_batcher(params, batch, average_words)

def sif_weighted_batcher(params, batch, a=1e-3):
    '''Batcher using smooth-inverse-frequency (SIF) weights.

    Each word vector is scaled by a / (a + unigram[word]) and the scaled
    vectors are then averaged over the sentence.

    a: smoothing hyperparameter. Defaults to 1e-3, the value used in
       https://openreview.net/pdf?id=SyK00v5xx. Note that the literal `10e-3`
       is 1e-2, i.e. ten times larger than 10^-3.'''

    def sif(z_batch1, sent):

        # Gather the weights for this sentence (https://openreview.net/pdf?id=SyK00v5xx)
        weights = np.array([a / (a + unigram[word]) for word in sent])

        # Reshape to [1, sentence_length, 1] so the weights broadcast over the
        # embedding dimension of z_batch1
        weights = weights.reshape(-1, z_batch1.shape[1], 1)

        # Sentence embedding is the average of the weighted word vectors
        # [1, z_dim]
        return np.mean(weights * z_batch1, 1)

    return generic_batcher(params, batch, sif)

def infersent_batcher(params, batch):
    '''Batcher that weights every word vector by the per-word value InferSent
    reports for the sentence and sums the weighted vectors.'''

    def infersent_weighting(z_batch1, sent):
        # The third value returned by visualize is used as the word weights.
        # NOTE(review): redistribute_start_stop is not applied here, so this
        # relies on visualize returning exactly one weight per word in `sent`
        # — confirm against the visualize implementation.
        joined = ' '.join(sent)
        _first, _second, importance = infersent.visualize(joined, tokenize=False, visualize=False)

        # [1, sentence_length, 1] so the weights broadcast over the embedding dimension
        importance = np.array(importance).reshape(-1, z_batch1.shape[1], 1)

        # Weighted sum over the words of the sentence
        # [1, z_dim]
        return np.sum(importance * z_batch1, 1)

    return generic_batcher(params, batch, infersent_weighting)


def redistribute_start_stop(y):
    '''Function to redistribute the start and stop words importance over the rest.

    y: sequence of importances (numbers or numeric strings) whose first and
       last entries belong to the start and stop symbols.

    Returns a new list of floats for the remaining words, rescaled so that
    they sum to 1. The input sequence is not modified.

    Sequences with no words between start and stop (length <= 2) give [].
    If all remaining importances are 0 there is nothing to rescale, so the
    mass is spread uniformly over the words.'''

    # Slice a copy rather than `del`-ing in place, so the caller's list keeps
    # its start and stop entries.
    inner = list(y)[1:-1]

    # Convert to float and make it sum to 1.0 instead of 100.0
    scaled = [float(value) / 100.0 for value in inner]
    if not scaled:
        return []

    # Compute the normaliser once instead of once per element
    total = sum(scaled)
    if total == 0:
        return [1.0 / len(scaled)] * len(scaled)

    # Divide by the sum, so the mass of start/stop is spread proportionally
    return [value / total for value in scaled]

In [None]:
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#

from __future__ import absolute_import, division, unicode_literals

import sys
import numpy as np
import logging
import sklearn
import SentEval.examples.data as data
import os

# Set PATHs (all relative to the notebook's working directory)
# path to the local SentEval checkout
PATH_TO_SENTEVAL = 'SentEval'
# path to the NLP datasets, passed to SentEval as 'task_path'
PATH_TO_DATA = os.path.join('SentEval','data')
# path to skipgram embeddings (same file as skipgram_path used by load_skipgram)
PATH_TO_VEC = os.path.join('SkipGram','SG','skipgram.300d.txt')


# import SentEval: the checkout directory is put first on sys.path so that
# `import senteval` resolves to it
sys.path.insert(0, PATH_TO_SENTEVAL)
import senteval


def prepare(params, samples):
    """
    SentEval `prepare` hook: stores on `params` what a model needs for a task.

    Builds the word -> id dictionary from `samples` and loads the word vectors
    found at PATH_TO_VEC (glove/word2vec text format) for that dictionary.
    Sets params.word2id, params.word_vec and params.wvec_dim; returns None.

    Note: the batchers defined in this notebook never read `params`; they look
    words up in the module-level skipgram_word_vec instead.
    """
    # dimensionality of the embeddings
    params.wvec_dim = 300

    # only the second element of the returned pair is kept
    _unused, params.word2id = data.create_dictionary(samples)

    params.word_vec = data.get_wordvec(PATH_TO_VEC, params.word2id)


# Set params for SentEval
# We use logistic regression (usepytorch: False) with 10-fold cross-validation.
# Extra information that your model needs for initialization can be added to this
# dictionary, for example the path to a dictionary of indices or hyperparameters.
# This dictionary is passed to the batcher and the prepare functions.
params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': False, 'kfold': 10}
# Config for the NN classifier, left disabled because scikit-learn logistic
# regression with 10-fold cross-validation is used instead (usepytorch = False):
#params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128,
#                                 'tenacity': 3, 'epoch_size': 2}

# Set up logger: timestamped messages, DEBUG level and above
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)

if __name__ == "__main__":

    # NLP tasks on which every embedding model is evaluated; the selection
    # follows https://arxiv.org/abs/1802.05883.
    # Author's note: SICKRelatedness (Sick-R) needs torch cuda to work (even when
    # using logistic regression); STS14 (semantic textual similarity) is a
    # similar type of semantic task.
    transfer_tasks = ['MPQA','MR', 'CR',  'SUBJ', 'SST2', 'TREC',
                      'MRPC', 'SICKEntailment']

    # One result dictionary per batcher, in the order the batchers are run
    all_results = []
    for batcher in [simple_mean_batcher,sif_weighted_batcher,infersent_batcher]:
        se = senteval.engine.SE(params_senteval, batcher, prepare)

        # senteval prints the results and returns a dictionary with the scores;
        # each run receives its own copy of the task list, as before
        results = se.eval(list(transfer_tasks))

        all_results.append(results)
        pprint.pprint(results)

2018-05-24 16:44:35,498 : ***** Transfer task : MPQA *****


2018-05-24 16:44:37,361 : Found 5454 words with word vectors, out of         6241 words
2018-05-24 16:44:37,596 : Generating sentence embeddings
2018-05-24 16:44:38,122 : Generated sentence embeddings
2018-05-24 16:44:38,122 : Training sklearn-LogReg with (inner) 10-fold cross-validation
2018-05-24 16:45:15,945 : Best param found at split 1: l2reg = 0.25                 with score 82.66
2018-05-24 16:45:58,765 : Best param found at split 2: l2reg = 0.5                 with score 82.94
2018-05-24 16:46:56,713 : Best param found at split 3: l2reg = 0.5                 with score 83.02
2018-05-24 16:48:01,877 : Best param found at split 4: l2reg = 4                 with score 83.09
2018-05-24 16:48:47,792 : Best param found at split 5: l2reg = 0.5                 with score 82.9
2018-05-24 16:49:40,236 : Best param found at split 6: l2reg = 2                 with score 82.91
