In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from random import randint
import matplotlib
from collections import defaultdict

import numpy as np
import torch

import nltk
nltk.download('punkt')

import pickle
import dill
import pprint

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Load InferSent

In [None]:
import torch
# if you are on GPU (encoding ~1000 sentences/s, default)
# if you are on CPU (~40 sentences/s)
infersent = torch.load('infersent.allnli.pickle', map_location=lambda storage, loc: storage)
glove_path = 'SentEval/pretrained/glove.840B.300d.txt'
infersent.set_glove_path(glove_path)
infersent.build_vocab_k_words(K=5000000)



In [3]:
with open('europarl_unigram.pickle', 'rb') as f:
    unigram = dill.load(f)

In [None]:
my_sent = 'Mark rutte is the president of the netherlands'
_, _,y = infersent.visualize(my_sent,tokenize=True,visualize=True)

In [6]:
skipgram_path = 'SkipGram/SG/skipgram.100d.txt'

def load_skipgram(path):
    
    word_vec = {}
    
    with open(path,'r') as f:
        for line in f:
            word, vec = line.split(' ', 1)
            word_vec[word] = np.fromstring(vec, sep=' ')
     
    # Convert to defaultdict to return 0 if a word is unknown
    word_vec = defaultdict(lambda: word_vec['UNK'], word_vec)
                
    return word_vec

skipgram_word_vec = load_skipgram(skipgram_path)

In [7]:
def generic_batcher(params, batch, func):
    '''Implements the main loop of getting a batch of embeddings from the EmbedAlign model.
    
         _func_: should be a function that takes a [batch_size, sentence_length, embeddings_dim] matrix
         and a list with the words in the sentence as arguments: func(z_batch, sentence). This function
         should return a [1, embeddings_dim] matrix with the sentence representation'''
    batch = [sent if sent != [] else ['.'] for sent in batch]
    embeddings = []
    for sent in batch:
        
        sent = [word.lower() for word in sent if word in infersent.word_vec]
        
        if len(sent) == 0:
            sent_vec = skipgram_word_vec['UNK']
        else:
            z_batch1 = np.zeros([len(sent),100])

            for i,word in enumerate(sent):
                z_batch1[i,:] = skipgram_word_vec[word]

            z_batch1 = np.expand_dims(z_batch1,0)

            # Sentence embedding is a function of the words in a sentence
            # [1, z_dim]
            sent_vec = func(z_batch1, sent)
        
        # check if there is any NaN in vector (they appear sometimes when there's padding)
        if np.isnan(sent_vec.sum()):
            sent_vec = np.nan_to_num(sent_vec)   
            
        embeddings.append(sent_vec)
    embeddings = np.vstack(embeddings)
    return embeddings

def simple_mean_batcher(params, batch):
    out = generic_batcher(params, batch, lambda x, _: np.mean(x, axis=1))
    return out

def sif_weighted_batcher(params, batch):  
    
    def sif(z_batch1, sent):
        
        # Hyperparameter value taken from https://openreview.net/pdf?id=SyK00v5xx
        a = 10e-3
        
        # Gather the weights for this sentence (https://openreview.net/pdf?id=SyK00v5xx)
        weights = [a / (a + unigram[word]) for word in sent]
        
        # every sequence starts with 2, which is the id for -NULL- (assuming that is padding)
        # so it gets a weight of 0
        #weights = [0] + weights
        weights = np.array(weights)
        
        # Add dimension to weight array to match shape of z_batch1
        weights = weights.reshape(-1, z_batch1.shape[1], 1)
        
        # Sentence embedding is the average of the words in a sentence (weighted by their unigram prob)
        # [1, z_dim]
        return np.mean(weights * z_batch1, 1)
    

    return generic_batcher(params, batch, sif)

def infersent_batcher(params, batch):
    
    def infersent_weighting(z_batch1, sent):
        # Calculate importance of words, and remove start and stop symbol
        _,_,weights = infersent.visualize(' '.join(sent), tokenize=False, visualize=False)

        #weights = redistribute_start_stop(weights)
        
        # every sequence starts with 2, which is the id for -NULL- (assuming that is padding)
        # so it gets a weight of 0
        weights = np.array(weights)
        # Add dimensions to weight array to match shape of z_batch1
        weights = weights.reshape(-1, z_batch1.shape[1], 1)
        # Sentence embedding is the average of the words in a sentence
        # [1, z_dim]
        out = np.sum(weights * z_batch1, 1)
        return out
        
    return generic_batcher(params, batch, infersent_weighting)


def redistribute_start_stop(y):
    '''Function to redistribute the start and stop words importance over the rest'''
    
    # Remove first and last element (start and stop symbol)
    del y[0]
    del y[-1]
    
    # convert to float
    y = [float(i) for i in y]
    
    # Make it sum to 1.0 instead of 100.0
    y = [i/100.0 for i in y]
    
    # Divide by the sum, to give all other words equall extra mass
    y = [i/sum(y) for i in y]

    return y

In [8]:
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#

from __future__ import absolute_import, division, unicode_literals

import sys
import numpy as np
import logging
import sklearn
import SentEval.examples.data as data
import os

# Set PATHs
# path to senteval
PATH_TO_SENTEVAL = 'SentEval'
# path to the NLP datasets 
PATH_TO_DATA = os.path.join('SentEval','data')
# path to skipgram embeddings
PATH_TO_VEC = os.path.join('SkipGram','SG','skipgram.100d.txt')


# import SentEval
sys.path.insert(0, PATH_TO_SENTEVAL)
import senteval


def prepare(params, samples):
    """
    In this example we are going to load Glove, 
    here you will initialize your model.
    remember to add what you model needs into the params dictionary
    """
    _, params.word2id = data.create_dictionary(samples)
    # load glove/word2vec format 
    params.word_vec = data.get_wordvec(PATH_TO_VEC, params.word2id)
    #params.word_vec = defaultdict(lambda: params.word_vec['UNK'], params.word_vec)
    # dimensionality of glove embeddings
    params.wvec_dim = 100
    return


# Set params for SentEval
# we use logistic regression (usepytorch: Fasle) and kfold 10
# In this dictionary you can add extra information that you model needs for initialization
# for example the path to a dictionary of indices, of hyper parameters
# this dictionary is passed to the batched and the prepare fucntions
params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': False, 'kfold': 10}
# this is the config for the NN classifier but we are going to use scikit-learn logistic regression with 10 kfold
# usepytorch = False 
#params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128,
#                                 'tenacity': 3, 'epoch_size': 2}

# Set up logger
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)

if __name__ == "__main__":
    
    # senteval prints the results and returns a dictionary with the scores
    all_results = []
    for batcher in [simple_mean_batcher,sif_weighted_batcher,infersent_batcher]:
        se = senteval.engine.SE(params_senteval, batcher, prepare)

        # here you define the NLP taks that your embedding model is going to be evaluated
        # in (https://arxiv.org/abs/1802.05883) we use the following :
        # SICKRelatedness (Sick-R) needs torch cuda to work (even when using logistic regression), 
        # but STS14 (semantic textual similarity) is a similar type of semantic task
    #     transfer_tasks = ['MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'TREC',
    #                       'MRPC', 'SICKEntailment']
        transfer_tasks = ['STS14']
        # senteval prints the results and returns a dictionary with the scores
        results = se.eval(transfer_tasks)
        all_results.append(results)
        pprint.pprint(results)

2018-05-31 13:24:34,134 : ***** Transfer task : STS14 *****


2018-05-31 13:24:35,649 : Found 6593 words with word vectors, out of         9119 words
2018-05-31 13:24:35,980 : deft-forum : pearson = 0.2630, spearman = 0.2720
2018-05-31 13:24:36,160 : deft-news : pearson = 0.4839, spearman = 0.5006
2018-05-31 13:24:36,406 : headlines : pearson = 0.3499, spearman = 0.3721
2018-05-31 13:24:36,521 : images : pearson = 0.3121, spearman = 0.3442
2018-05-31 13:24:36,642 : OnWN : pearson = 0.3692, spearman = 0.4319
2018-05-31 13:24:36,861 : tweet-news : pearson = 0.5347, spearman = 0.5407
2018-05-31 13:24:36,862 : ALL (weighted average) : Pearson = 0.3835,             Spearman = 0.4105
2018-05-31 13:24:36,863 : ALL (average) : Pearson = 0.3855,             Spearman = 0.4103

2018-05-31 13:24:36,896 : ***** Transfer task : STS14 *****




{'STS14': {'OnWN': {'nsamples': 750,
                    'pearson': (0.36923874653674565, 1.2208690052185553e-25),
                    'spearman': SpearmanrResult(correlation=0.43193794125032814, pvalue=1.9346667701511436e-35)},
           'all': {'pearson': {'mean': 0.3854760968788589,
                               'wmean': 0.38345644721263866},
                   'spearman': {'mean': 0.4102771480702317,
                                'wmean': 0.4104968263756487}},
           'deft-forum': {'nsamples': 450,
                          'pearson': (0.26303287783165336,
                                      1.4762935258267863e-08),
                          'spearman': SpearmanrResult(correlation=0.27200039826885647, pvalue=4.494368164603215e-09)},
           'deft-news': {'nsamples': 300,
                         'pearson': (0.4839353234621651, 5.114639356433924e-19),
                         'spearman': SpearmanrResult(correlation=0.5006309953926736, pvalue=1.9514274240698047e-20)},
  

2018-05-31 13:24:37,491 : Found 6593 words with word vectors, out of         9119 words
2018-05-31 13:24:37,593 : deft-forum : pearson = 0.2770, spearman = 0.2818
2018-05-31 13:24:37,660 : deft-news : pearson = 0.5348, spearman = 0.5393
2018-05-31 13:24:37,796 : headlines : pearson = 0.3871, spearman = 0.4000
2018-05-31 13:24:37,969 : images : pearson = 0.5026, spearman = 0.5069
2018-05-31 13:24:38,090 : OnWN : pearson = 0.4439, spearman = 0.4880
2018-05-31 13:24:38,225 : tweet-news : pearson = 0.5140, spearman = 0.5161
2018-05-31 13:24:38,226 : ALL (weighted average) : Pearson = 0.4456,             Spearman = 0.4592
2018-05-31 13:24:38,227 : ALL (average) : Pearson = 0.4432,             Spearman = 0.4554

2018-05-31 13:24:38,256 : ***** Transfer task : STS14 *****




{'STS14': {'OnWN': {'nsamples': 750,
                    'pearson': (0.44391567187022796, 1.4671180263956508e-37),
                    'spearman': SpearmanrResult(correlation=0.4880091686896775, pvalue=3.9186532413790175e-46)},
           'all': {'pearson': {'mean': 0.44324004632893915,
                               'wmean': 0.4455511366195573},
                   'spearman': {'mean': 0.4553572265869041,
                                'wmean': 0.45916562364087893}},
           'deft-forum': {'nsamples': 450,
                          'pearson': (0.2770147505575314,
                                      2.2673018729385472e-09),
                          'spearman': SpearmanrResult(correlation=0.2818335226815887, pvalue=1.1594003611406394e-09)},
           'deft-news': {'nsamples': 300,
                         'pearson': (0.5347978244213928,
                                     1.3582498698017286e-23),
                         'spearman': SpearmanrResult(correlation=0.539303053740657,

2018-05-31 13:24:38,822 : Found 6593 words with word vectors, out of         9119 words
  batch = Variable(self.get_batch(sent), volatile=True)
2018-05-31 13:26:58,301 : deft-forum : pearson = 0.2209, spearman = 0.2052
2018-05-31 13:29:42,667 : deft-news : pearson = 0.5565, spearman = 0.5677
2018-05-31 13:32:31,425 : headlines : pearson = 0.4096, spearman = 0.4277
2018-05-31 13:36:04,330 : images : pearson = 0.4406, spearman = 0.4462
2018-05-31 13:39:36,063 : OnWN : pearson = 0.3775, spearman = 0.4306
2018-05-31 13:44:13,479 : tweet-news : pearson = 0.5702, spearman = 0.5570
2018-05-31 13:44:13,479 : ALL (weighted average) : Pearson = 0.4306,             Spearman = 0.4423
2018-05-31 13:44:13,479 : ALL (average) : Pearson = 0.4292,             Spearman = 0.4391



{'STS14': {'OnWN': {'nsamples': 750,
                    'pearson': (0.3774551422729747, 8.305303635897595e-27),
                    'spearman': SpearmanrResult(correlation=0.430554103434827, pvalue=3.358414710554437e-35)},
           'all': {'pearson': {'mean': 0.42920897876402503,
                               'wmean': 0.4305956548700482},
                   'spearman': {'mean': 0.43906238124150976,
                                'wmean': 0.4423328632846735}},
           'deft-forum': {'nsamples': 450,
                          'pearson': (0.2209048071089363,
                                      2.2270822157449945e-06),
                          'spearman': SpearmanrResult(correlation=0.20518448608946446, pvalue=1.1477705876368855e-05)},
           'deft-news': {'nsamples': 300,
                         'pearson': (0.5565227923172251, 8.527213471786064e-26),
                         'spearman': SpearmanrResult(correlation=0.5677269609831752, pvalue=5.37766050511063e-27)},
        

In [12]:
for result in all_results:
    pprint.pprint(result)

{'CR': {'acc': 72.71, 'devacc': 72.9, 'ndev': 3775, 'ntest': 3775},
 'MPQA': {'acc': 81.72, 'devacc': 81.84, 'ndev': 10606, 'ntest': 10606},
 'MR': {'acc': 65.4, 'devacc': 65.49, 'ndev': 10662, 'ntest': 10662},
 'MRPC': {'acc': 70.32,
          'devacc': 70.61,
          'f1': 79.68,
          'ndev': 4076,
          'ntest': 1725},
 'SICKEntailment': {'acc': 74.65, 'devacc': 73.4, 'ndev': 500, 'ntest': 4927},
 'SST2': {'acc': 66.28, 'devacc': 68.35, 'ndev': 872, 'ntest': 1821},
 'SUBJ': {'acc': 84.27, 'devacc': 84.28, 'ndev': 10000, 'ntest': 10000},
 'TREC': {'acc': 78.6, 'devacc': 70.22, 'ndev': 5452, 'ntest': 500}}
{'CR': {'acc': 72.31, 'devacc': 72.26, 'ndev': 3775, 'ntest': 3775},
 'MPQA': {'acc': 81.79, 'devacc': 81.84, 'ndev': 10606, 'ntest': 10606},
 'MR': {'acc': 65.51, 'devacc': 65.85, 'ndev': 10662, 'ntest': 10662},
 'MRPC': {'acc': 69.33,
          'devacc': 70.83,
          'f1': 79.69,
          'ndev': 4076,
          'ntest': 1725},
 'SICKEntailment': {'acc': 72.97, 'de