<a href="https://colab.research.google.com/github/jsedoc/ConceptorDebias/blob/ACL-cleanup/Conceptors/Intrinsic_Evaluation_Results.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Unsupervised Post-processing of Word Vectors via Conceptor Negation

In this notebook, we presents the experiment results reported in [1]/

[1] Unsupervised Post-processing of Word Vectors via Conceptor Negation. Tianlin Liu, Lyle Ungar, and João Sedoc, Unsupervised Post-processing of Word Vectors via Conceptor Negation, AAAI 2019.





In [1]:

import numpy as np
import scipy, requests, codecs, os, re, nltk, itertools, csv
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering, KMeans
import tensorflow as tf
from scipy.stats import spearmanr
import pandas as pd
import functools as ft

import numpy as np
from itertools import combinations, filterfalse
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models.keyedvectors import KeyedVectors
import pandas as pd
import random
import sys
import os
import pickle

# resourceFile = '/Users/liutianlin/Desktop/Academics/NLP/data/' 
resourceFile = '/data/' # the address of the datasets


paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress


In [0]:
!pip install -q gdown
!gdown https://drive.google.com/uc?id=1U_UGB2vyTuTIcbV_oeDtJCtAtlFMvXOM # download a small subset of glove
!gdown https://drive.google.com/uc?id=1j_b4TRpL3f0HQ8mV17_CtOXp862YjxxB   # download a small subset of word2vec

## Load Word2Vec and GloVe word embeddings
 
We provide a small word2vec and small glove word embedding in this repository -- their words appear at least 200 times in wikipedia (see the list provided by Arora et al https://github.com/PrincetonML/SIF/blob/master/auxiliary_data/enwiki_vocab_min200.txt

In [2]:
# our code for debiasing -- also includes word lists
!rm -r ConceptorDebias
!git clone https://github.com/jsedoc/ConceptorDebias
!cd ConceptorDebias; git checkout ACL-cleanup

sys.path.append('/content/ConceptorDebias')

from Conceptors.conceptor_fxns import *

def process_cn_matrix(subspace, alpha = 2):
  """Returns the conceptor negation matrix
  Arguments
           subspace : n x d matrix of word vectors from a oarticular subspace
           alpha : Tunable parameter
  """
  # Compute the conceptor matrix
  C,_ = train_Conceptor(subspace, alpha)
  
  # Calculate the negation of the conceptor matrix
  negC = NOT(C)
  
  return negC

def apply_conceptor(x, C):
  """Returns the conceptored embeddings
  Arguments
           x : n x d matrix of all words to be conceptored
           C : d x d conceptor matrix
  """
  # Post-process the vocab matrix
  newX = (C @ x).T
  
  return newX

def load_all_vectors(embd, wikiWordsPath):
  """Loads all word vectors for all words in the list of words as a matrix
  Arguments
           embd : Dictonary of word-to-embedding for all words
           wikiWordsPath : URL to the path where all embeddings are stored
  Returns
          all_words_index : Dictonary of words to the row-number of the corresponding word in the matrix
          all_words_mat : Matrix of word vectors stored row-wise
  """
  all_words_index = {}
  all_words_mat = []
  with open(wikiWordsPath, "r+") as f_in:
    ind = 0
    for line in f_in:
      word = line.split(' ')[0]
      if word in embd:
        all_words_index[word] = ind
        all_words_mat.append(embd[word])
        ind = ind+1
        
  return all_words_index, all_words_mat

def load_subspace_vectors(embd, subspace_words):
  """Loads all word vectors for the particular subspace in the list of words as a matrix
  Arguments
           embd : Dictonary of word-to-embedding for all words
           subspace_words : List of words representing a particular subspace
  Returns
          subspace_embd_mat : Matrix of word vectors stored row-wise
  """
  subspace_embd_mat = []
  ind = 0
  for word in subspace_words:
    if word in embd:
      subspace_embd_mat.append(embd[word])
      ind = ind+1
      
  return subspace_embd_mat

# General word list
!wget https://raw.githubusercontent.com/IlyaSemenov/wikipedia-word-frequency/master/results/enwiki-20190320-words-frequency.txt
!git clone https://github.com/PrincetonML/SIF
    
# Gender word lists
!git clone https://github.com/uclanlp/gn_glove
!git clone https://github.com/uclanlp/corefBias
!wget https://www.cs.cmu.edu/Groups/AI/areas/nlp/corpora/names/female.txt
!wget https://www.cs.cmu.edu/Groups/AI/areas/nlp/corpora/names/male.txt

from lists.load_word_lists import *

"""Load list of pronouns representing the 'Pronoun' subspace for gender debiasing"""
gender_list_pronouns = WEATLists.W_7_Male_terms + WEATLists.W_7_Female_terms + WEATLists.W_8_Male_terms + WEATLists.W_8_Female_terms
gender_list_pronouns = list(set(gender_list_pronouns))

"""Load an extended list of words representing the gender subspace for gender debiasing"""
gender_list_extended = male_vino_extra + female_vino_extra + male_gnGlove + female_gnGlove
gender_list_extended = list(set(gender_list_extended))

"""Load list of proper nouns representing the 'Proper Noun' subspace for gender debiasing"""
gender_list_propernouns = male_cmu + female_cmu
gender_list_propernouns = list(set(gender_list_propernouns))

"""Load list of all representing the gender subspace for gender debiasing"""
gender_list_all = gender_list_pronouns + gender_list_extended + gender_list_propernouns
gender_list_all = list(set(gender_list_all))

"""Load list of common black and white names for racial debiasing"""
race_list = WEATLists.W_3_Unused_full_list_European_American_names + WEATLists.W_3_European_American_names + WEATLists.W_3_Unused_full_list_African_American_names + WEATLists.W_3_African_American_names + WEATLists.W_4_Unused_full_list_European_American_names + WEATLists.W_4_European_American_names + WEATLists.W_4_Unused_full_list_African_American_names + WEATLists.W_4_African_American_names + WEATLists.W_5_Unused_full_list_European_American_names + WEATLists.W_5_European_American_names + WEATLists.W_5_Unused_full_list_African_American_names + WEATLists.W_5_African_American_names 
race_list = list(set(race_list))



rm: cannot remove 'ConceptorDebias': No such file or directory
Cloning into 'ConceptorDebias'...
remote: Enumerating objects: 80, done.[K
remote: Counting objects: 100% (80/80), done.[K
remote: Compressing objects: 100% (77/77), done.[K
remote: Total 388 (delta 39), reused 5 (delta 2), pack-reused 308[K
Receiving objects: 100% (388/388), 3.68 MiB | 9.38 MiB/s, done.
Resolving deltas: 100% (205/205), done.
Branch 'ACL-cleanup' set up to track remote branch 'ACL-cleanup' from 'origin'.
Switched to a new branch 'ACL-cleanup'
--2019-04-17 04:05:03--  https://raw.githubusercontent.com/IlyaSemenov/wikipedia-word-frequency/master/results/enwiki-20190320-words-frequency.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 27465330 (26M) [text/plain]
Saving to: ‘enwiki

In [7]:
"""Download the 'Glove' embeddings if not downloaded"""
!if [ ! -f /content/gensim_glove.840B.300d.txt.bin ]; then gdown https://drive.google.com/uc?id=1Ty2exMyi-XOufY-v81RJfiPvnintHuy2; fi

"""Load the embeddings to a gensim object"""
resourceFile = ''
if 'glove' not in dir():
  glove = KeyedVectors.load_word2vec_format(resourceFile + 'gensim_glove.840B.300d.txt.bin', binary=True)
  print('The glove embedding has been loaded!')

Downloading...
From: https://drive.google.com/uc?id=1Ty2exMyi-XOufY-v81RJfiPvnintHuy2
To: /content/gensim_glove.840B.300d.txt.bin
2.65GB [00:47, 55.8MB/s]
The glove embedding has been loaded!


In [3]:
"""Download the 'Word2Vec' embeddings if not downloaded"""
!if test -e /content/GoogleNews-vectors-negative300.bin.gz || test -e /content/GoogleNews-vectors-negative300.bin; then echo 'file already downloaded'; else echo 'starting download'; gdown https://drive.google.com/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM; fi
!if [ ! -f /content/GoogleNews-vectors-negative300.bin ]; then gunzip GoogleNews-vectors-negative300.bin.gz; fi

"""Load the embeddings to a gensim object"""
resourceFile = ''
if 'word2vec' not in dir():
  word2vec = KeyedVectors.load_word2vec_format(resourceFile + 'GoogleNews-vectors-negative300.bin', binary=True)
  print('The word2vec embedding has been loaded!')

starting download
Downloading...
From: https://drive.google.com/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM
To: /content/GoogleNews-vectors-negative300.bin.gz
1.65GB [00:11, 149MB/s]
The word2vec embedding has been loaded!


In [4]:
"""Download the 'Fasttext' embeddings if not downloaded"""
!if [ ! -f /content/fasttext.bin ]; then gdown https://drive.google.com/uc?id=1Zl6a75Ybf8do9uupmrJWKQMnvqqme4fh; fi

"""Load the embeddings to a gensim object"""
resourceFile = ''
if 'fasttext' not in dir():
  fasttext = KeyedVectors.load_word2vec_format(resourceFile + 'fasttext.bin', binary=True)
  print('The fasttext embedding has been loaded!')

Downloading...
From: https://drive.google.com/uc?id=1Zl6a75Ybf8do9uupmrJWKQMnvqqme4fh
To: /content/fasttext.bin
2.42GB [00:41, 58.4MB/s]
The fasttext embedding has been loaded!


In [24]:
resourceFile = ''
wikiWordsPath = resourceFile + 'SIF/auxiliary_data/enwiki_vocab_min200.txt' # https://github.com/PrincetonML/SIF/blob/master/auxiliary_data/enwiki_vocab_min200.txt

"""Set the embedding to be used"""
embd = 'word2vec'

"""Set the subspace to be tested on"""
subspace = 'gender_list_all' 

curr_embd = eval(embd)
  
"""Load all embeddings in a matrix of all words in the wordlist"""
if embd == 'elmo':
  all_words_mat, all_words_index, _ = pick_embeddings(brown_corpus, curr_embd)
if embd == 'bert':
  all_words_index, all_words_mat = load_bert(all_dict, subspace)
else:
  all_words_index, all_words_mat = load_all_vectors(curr_embd, wikiWordsPath)
  
"""Load the vectors for the words representing the subspace as a matrix and compute the respetive conceptor matrix"""
if subspace != 'without_conceptor':
  subspace_words_list = eval(subspace)
  if subspace == 'gender_list_and':
    if embd == 'elmo':
      subspace_words_mat1 = load_subspace_vectors_contextual(all_words_mat, all_words_index, gender_list_pronouns)
      cn1 = process_cn_matrix(np.array(subspace_words_mat1).T, alpha = 8)

      subspace_words_mat2 = load_subspace_vectors_contextual(all_words_mat, all_words_index, gender_list_extended)
      cn2 = process_cn_matrix(np.array(subspace_words_mat2).T, alpha = 3)

      subspace_words_mat3 = load_subspace_vectors_contextual(all_words_mat, all_words_index, gender_list_propernouns)
      cn3 = process_cn_matrix(np.array(subspace_words_mat3).T, alpha = 10)

      cn = AND(cn1, AND(cn2, cn3))
    elif embd == 'bert':
      cn1 = load_bert_conceptor(all_dict, gender_list_pronouns)
      
      cn2 = load_bert_conceptor(all_dict, gender_list_extended)
      
      cn3 = load_bert_conceptor(all_dict, gender_list_propernouns)
      
      cn = AND(cn1, AND(cn2, cn3))
    else:
      subspace_words_mat1 = load_subspace_vectors(curr_embd, gender_list_pronouns)
      cn1 = process_cn_matrix(np.array(subspace_words_mat1).T)

      subspace_words_mat2 = load_subspace_vectors(curr_embd, gender_list_extended)
      cn2 = process_cn_matrix(np.array(subspace_words_mat2).T)

      subspace_words_mat3 = load_subspace_vectors(curr_embd, gender_list_propernouns)
      cn3 = process_cn_matrix(np.array(subspace_words_mat3).T)

      cn = AND(cn1, AND(cn2, cn3))
  else: 
    if embd == 'elmo':
      subspace_words_mat = load_subspace_vectors_contextual(all_words_mat, all_words_index, subspace_words_list)
      cn = process_cn_matrix(np.array(subspace_words_mat).T, alpha = 6)
    elif embd == 'bert':
      cn = load_bert_conceptor(all_dict, subspace)
    else:
      subspace_words_mat = load_subspace_vectors(curr_embd, subspace_words_list)
      cn = process_cn_matrix(np.array(subspace_words_mat).T)
      
"""Conceptor all embeddings"""
all_words_cn = apply_conceptor(np.array(all_words_mat).T, np.array(cn))

"""Store all conceptored words in a dictonary"""
all_words = {}
for word, index in all_words_index.items():
  if embd == 'elmo':
    all_words[word] = np.mean([all_words_cn[i,:] for i in index], axis = 0)
  else:
    all_words[word] = all_words_cn[index,:]

starting...
(300, 7117)
R calculated
C calculated


In [7]:
def loadWordVecs(model_str):
    word_dictionary = {}
    
    input_file_destination = 'small_' + model_str + '.txt'

    f = codecs.open(input_file_destination, 'r', 'utf-8') 

    for line in f:

        line = line.split(" ", 1)   
        transformed_key = line[0].lower()

        try:
            transformed_key = str(transformed_key)

        except:
            print("Can't convert the key to unicode:", transformed_key)

        word_dictionary[transformed_key] = np.fromstring(line[1], dtype="float32", sep=" ")

        if word_dictionary[transformed_key].shape[0] != 300:
            print(transformed_key, word_dictionary[transformed_key].shape)

    return  word_dictionary     



orig_word2vec = loadWordVecs('word2vec')
print("loaded Word2vec!")

orig_glove = loadWordVecs('glove')
print("loaded GloVe Common Crawl!")


orig_model = {}
orig_model['word2vec'] = orig_word2vec
orig_model['glove'] = orig_glove


FileNotFoundError: ignored

## Post-process Word2Vec and GloVe with Conceptor Negation (CN)


In [0]:
def ensemble_cn_dict(wordVecModel_str, alpha = 2, orig_model = orig_model):
    
    
    # put the word vectors in columns
    x_collector = np.array(list(orig_model[wordVecModel_str].values())).T       
        
    
    nrWords = x_collector.shape[1] # number of total words
    
    
    R = x_collector.dot(x_collector.T) / nrWords # calculate the un-centered correlation matrix
    
    C = R @ np.linalg.inv(R + alpha ** (-2) * np.eye(300))# calculate the conceptor matrix
    
    vecMatrix = ((np.eye(300) - C) @ x_collector).T 

    cn_dict = {}
        
    for word_index in np.arange(0, len(orig_model[wordVecModel_str].keys())):
        
        word = list(orig_model[wordVecModel_str].keys())[word_index]
        cn_dict[word] = vecMatrix[word_index,:]
    
    return cn_dict

print("Post-processing Word2vec with CN")
cn_word2vec = ensemble_cn_dict('word2vec', orig_model = orig_model)

print("Post-processing GloVe with CN")
cn_glove = ensemble_cn_dict('glove', orig_model = orig_model)


Post-processing Word2vec with CN
Post-processing GloVe with CN


## Experiment 1: Word similarity evaluation
We evaluate the CN post-processed word vectors with 7 standard word similarity datasets: the RG65 (Rubenstein and Goodenough, 1965), the WordSim-353 (WS) (Finkelstein et al., 2002), the rare- words (RW) (Luong, Socher, and Manning, 2013), the MEN dataset (Bruni, Tran, and Baroni, 2014), the MTurk (Radinsky et al., 2011), the SimLex-999 (SimLex) (Hill, Reichart, and Korhonen, 2015), and the SimVerb-3500 (Gerz et al., 2016). 

To evaluate the word similarity, we calculate the cosine distance between vectors of two words. We report the Spearman’s rank correlation coefficient (Myers and Well, 1995) of the estimated rankings against the rankings given by human annotators.


In [12]:
!git clone https://github.com/mfaruqui/eval-word-vectors

!mkdir wordSimData
!mv eval-word-vectors/data/word-sim/ wordSimData/

Cloning into 'eval-word-vectors'...
remote: Enumerating objects: 54, done.[K
Unpacking objects:   1% (1/54)   Unpacking objects:   3% (2/54)   Unpacking objects:   5% (3/54)   Unpacking objects:   7% (4/54)   Unpacking objects:   9% (5/54)   Unpacking objects:  11% (6/54)   Unpacking objects:  12% (7/54)   Unpacking objects:  14% (8/54)   Unpacking objects:  16% (9/54)   Unpacking objects:  18% (10/54)   Unpacking objects:  20% (11/54)   Unpacking objects:  22% (12/54)   Unpacking objects:  24% (13/54)   Unpacking objects:  25% (14/54)   Unpacking objects:  27% (15/54)   Unpacking objects:  29% (16/54)   Unpacking objects:  31% (17/54)   Unpacking objects:  33% (18/54)   Unpacking objects:  35% (19/54)   Unpacking objects:  37% (20/54)   Unpacking objects:  38% (21/54)   Unpacking objects:  40% (22/54)   Unpacking objects:  42% (23/54)   Unpacking objects:  44% (24/54)   Unpacking objects:  46% (25/54)   remote: Total 54 (delta 0), reused 0 (delta 0), pack-reu

In [0]:
dataSets = ['EN-RG-65.txt', 'EN-WS-353-ALL.txt', 'EN-RW-STANFORD.txt', 'EN-MEN-TR-3k.txt', 'EN-MTurk-287.txt', 'EN-SIMLEX-999.txt', 'EN-SimVerb-3500.txt']



def similarity_eval(dataSetAddress, wordVecModel_str, all_word):
    wordVecModel = eval(wordVecModel_str)
    all_word = eval(all_word)
    vocab = set(list(all_word.keys()))
    
    fread_simlex = open(dataSetAddress, "r")
    
    pair_list = []

    line_number = 0
    for line in fread_simlex:
#         if line_number > 0:
        tokens = line.split()
        word_i = tokens[0]
        word_j = tokens[1]
        score = float(tokens[2])
        if word_i in vocab and word_j in vocab:
            pair_list.append( ((word_i, word_j), score) )
#         line_number += 1

    pair_list.sort(key=lambda x: - x[1]) # order the pairs from highest score (most similar) to lowest score (least similar)


    extracted_scores = {}

    extracted_list = []
    
               
    for (x,y) in pair_list:
        (word_i, word_j) = x
        
        current_distance = 1- cosine_similarity( wordVecModel[word_i].reshape(1,-1)  , wordVecModel[word_j].reshape(1,-1) )        

        extracted_scores[(word_i, word_j)] = current_distance
        extracted_list.append(((word_i, word_j), current_distance))

    extracted_list.sort(key=lambda x: x[1])

    spearman_original_list = []
    spearman_target_list = []

    for position_1, (word_pair, score_1) in enumerate(pair_list):
        score_2 = extracted_scores[word_pair]
        position_2 = extracted_list.index((word_pair, score_2))
        spearman_original_list.append(position_1)
        spearman_target_list.append(position_2)

    spearman_rho = spearmanr(spearman_original_list, spearman_target_list)
    
    return spearman_rho[0]


In [16]:
vocab = set(list(glove.wv.vocab))
print(vocab)

  """Entry point for launching an IPython kernel.
IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [15]:
wordSimResult = {}


for dataset in dataSets:
    dataSetAddress = resourceFile + 'wordSimData/word-sim/' +  dataset
    print('evaluating the data set', dataset)
    
    print('Glove : %.4f' %  similarity_eval(dataSetAddress, 'fasttext', 'all_words_index'))
    print('Glove + CN : %.4f' %  similarity_eval(dataSetAddress, 'all_words', 'all_words'))
        
    print('\n')
    

evaluating the data set EN-RG-65.txt
Glove : 0.8587
Glove + CN : 0.8594


evaluating the data set EN-WS-353-ALL.txt
Glove : 0.7882
Glove + CN : 0.7744


evaluating the data set EN-RW-STANFORD.txt
Glove : 0.6217
Glove + CN : 0.6248


evaluating the data set EN-MEN-TR-3k.txt
Glove : 0.8364
Glove + CN : 0.8264


evaluating the data set EN-MTurk-287.txt
Glove : 0.7245
Glove + CN : 0.7134


evaluating the data set EN-SIMLEX-999.txt
Glove : 0.5055
Glove + CN : 0.5078


evaluating the data set EN-SimVerb-3500.txt
Glove : 0.4275
Glove + CN : 0.4272




## Experiment 2:  Semantic Textual Similarity (STS) tasks

We use standard semantic textual similarity (STS) benchmarks to evaluate the post-processed word vectors: we use 2012-2015 SemEval STS tasks (Agirre et al., 2012, 2013, 2014, 2015) and 2012 SemEval Semantic Related task (SICK) (Marelli et al., 2014). 

We reuse the codes provided in  https://github.com/nlptown/nlp-notebooks/blob/master/Simple%20Sentence%20Similarity.ipynb



### Load datasets

In [16]:
!wget https://raw.githubusercontent.com/liutianlin0121/Conceptor-Negation-WV/master/data/stsbenchmark/sts-dev.csv
!wget https://raw.githubusercontent.com/liutianlin0121/Conceptor-Negation-WV/master/data/stsbenchmark/sts-mt.csv
!wget https://raw.githubusercontent.com/liutianlin0121/Conceptor-Negation-WV/master/data/stsbenchmark/sts-other.csv
!wget https://raw.githubusercontent.com/liutianlin0121/Conceptor-Negation-WV/master/data/stsbenchmark/sts-test.csv
!wget https://raw.githubusercontent.com/liutianlin0121/Conceptor-Negation-WV/master/data/stsbenchmark/sts-train.csv

--2019-04-17 04:43:51--  https://raw.githubusercontent.com/liutianlin0121/Conceptor-Negation-WV/master/data/stsbenchmark/sts-dev.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 255680 (250K) [text/plain]
Saving to: ‘sts-dev.csv’


2019-04-17 04:43:51 (9.67 MB/s) - ‘sts-dev.csv’ saved [255680/255680]

--2019-04-17 04:43:52--  https://raw.githubusercontent.com/liutianlin0121/Conceptor-Negation-WV/master/data/stsbenchmark/sts-mt.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 513141 (501K) [text/plain]
Saving to: ‘sts-mt.csv’


2019-04

In [17]:
def load_sts_dataset(filename):
    # For a STS dataset, loads the relevant information: the sentences and their human rated similarity score.
    sent_pairs = []
    with tf.gfile.GFile(filename, "r") as f:
        for line in f:
            ts = line.strip().split("\t")
            if len(ts) == 7 or len(ts) == 9:
                sent_pairs.append((re.sub("[^0-9]", "", ts[2]) + '-' + ts[1] , ts[5], ts[6], float(ts[4])))
            elif len(ts) == 6 or len(ts) == 8:
                sent_pairs.append((re.sub("[^0-9]", "", ts[1]) + '-' + ts[0] , ts[4], ts[5], float(ts[3])))
            else:
                print('data format is wrong!!!')
    return pd.DataFrame(sent_pairs, columns=["year-task", "sent_1", "sent_2", "sim"])


def load_all_sts_dataset():
    # Loads all of the STS datasets 
    stsbenchmarkDir = ''
    stscompanionDir = ''
    sts_train = load_sts_dataset(os.path.join(stsbenchmarkDir, "sts-train.csv"))    
    sts_dev = load_sts_dataset(os.path.join(stsbenchmarkDir, "sts-dev.csv"))
    sts_test = load_sts_dataset(os.path.join(stsbenchmarkDir, "sts-test.csv"))
    sts_other = load_sts_dataset(os.path.join(stscompanionDir, "sts-other.csv"))
    sts_mt = load_sts_dataset(os.path.join(stscompanionDir, "sts-mt.csv"))
    
    sts_all = pd.concat([sts_train, sts_dev, sts_test, sts_other, sts_mt ])
    
    return sts_all

sts_all = load_all_sts_dataset()





def load_sts_by_year_task():
    # Divide STS datasets based on their year and tasks
    sts_by_year_task = {}
    
    for year_task in sts_all['year-task'].unique():
        indices = [i for i, x in enumerate(list(sts_all['year-task'])) if x == year_task]
        
        pairs = sts_all.iloc[indices]
        
        sts_by_year_task[year_task] = pairs
        
    return sts_by_year_task

sts_by_year_task = load_sts_by_year_task()




def load_sts_by_year():
    # Divide STS datasets ONLY based on their year (different tasks in that year are merged).

    sts_by_year = {}
    
    for year in ['2012', '2013', '2014', '2015', '2016', '2017']:
        indices = [i for i, x in enumerate(list(sts_all['year-task'])) if x.startswith(year)]
        
        pairs = sts_all.iloc[indices]
        pairs = pairs.copy()
        pairs['year-task'] = year
        sts_by_year[year] = pairs
        
    return sts_by_year

sts_by_year_task = load_sts_by_year_task()

sts_by_year = load_sts_by_year()


# filename = resourceFile + '2015-answers-students.test.tsv'
# sent_pairs = []
# with tf.gfile.GFile(filename, "r") as f:
#     for line in f:
#         ts = line.strip().split("\t")
#         if len(ts) == 3:
#             sent_pairs.append((ts[1], ts[2], float(ts[0])))
# answers_students_2015 =  pd.DataFrame(sent_pairs, columns=["sent_1", "sent_2", "sim"])


# show some sample sts data    
sts_all[:5] 


Unnamed: 0,year-task,sent_1,sent_2,sim
0,2012-MSRvid,A plane is taking off.,An air plane is taking off.,5.0
1,2012-MSRvid,A man is playing a large flute.,A man is playing a flute.,3.8
2,2012-MSRvid,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...,3.8
3,2012-MSRvid,Three men are playing chess.,Two men are playing chess.,2.6
4,2012-MSRvid,A man is playing the cello.,A man seated is playing the cello.,4.25


In [18]:


def download_sick(f): 

    response = requests.get(f).text

    lines = response.split("\n")[1:]
    lines = [l.split("\t") for l in lines if len(l) > 0]
    lines = [l for l in lines if len(l) == 5]

    df = pd.DataFrame(lines, columns=["idx", "sent_1", "sent_2", "sim", "label"])
    df['sim'] = pd.to_numeric(df['sim'])
    return df
    
sick_all = download_sick("https://raw.githubusercontent.com/alvations/stasis/master/SICK-data/SICK_test_annotated.txt")

sick_all[:5]

Unnamed: 0,idx,sent_1,sent_2,sim,label
0,6,There is no boy playing outdoors and there is ...,A group of kids is playing in a yard and an ol...,3.3,NEUTRAL\r
1,7,A group of boys in a yard is playing and a man...,The young boys are playing outdoors and the ma...,3.7,NEUTRAL\r
2,8,A group of children is playing in the house an...,The young boys are playing outdoors and the ma...,3.0,NEUTRAL\r
3,10,A brown dog is attacking another animal in fro...,A brown dog is attacking another animal in fro...,4.9,ENTAILMENT\r
4,11,A brown dog is attacking another animal in fro...,A brown dog is helping another animal in front...,3.665,NEUTRAL\r


### Some preparation for STS evaluation

In [0]:

class Sentence:
    
    def __init__(self, sentence):
        self.raw = sentence
        normalized_sentence = sentence.replace("‘", "'").replace("’", "'")
        self.tokens = [t.lower() for t in nltk.word_tokenize(normalized_sentence)]
        
def run_conceptor_benchmark(sentences1, sentences2, model_str): 
    
    model = eval(model_str)
    embeddings = []


    for (sent1, sent2) in zip(sentences1, sentences2): 

        tokens1 =  sent1.tokens
        tokens2 =  sent2.tokens

        tokens1 = [token for token in tokens1 if token in model and token.islower()]
        tokens2 = [token for token in tokens2 if token in model and token.islower()]

        embedding1 = np.average([model[token] for token in tokens1], axis=0)
        embedding2 = np.average([model[token] for token in tokens2], axis=0)



        if isinstance(embedding1, float) or isinstance(embedding2, float):
            embeddings.append(np.zeros(300))
            embeddings.append(np.zeros(300))
        else:
            embeddings.append(embedding1)
            embeddings.append(embedding2)



    sims = [cosine_similarity(embeddings[idx*2].reshape(1, -1), embeddings[idx*2+1].reshape(1, -1))[0][0] for idx in range(int(len(embeddings)/2))]
    return sims

def run_experiment(df, benchmarks): 
    
    sentences1 = [Sentence(s) for s in df['sent_1']]
    sentences2 = [Sentence(s) for s in df['sent_2']]
    
    pearson_cors, spearman_cors = [], []
    for label, method in benchmarks:
        sims = method(sentences1, sentences2)
        pearson_correlation = round(scipy.stats.pearsonr(sims, df['sim'])[0] * 100,2)
        #print(label, pearson_correlation)
        pearson_cors.append(pearson_correlation)
        
    return pearson_cors

### Do STS evaluation

Note that results below are a bit different from what has been reported in the appendix of our paper because we are using a small word2vec for demonstration purpose here -- there are more out-of-vocabulary words. 

In [21]:
import nltk
nltk.download()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> punkt
    Downloading package punkt to /root/nltk_data...
      Unzipping tokenizers/punkt.zip.

---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


True

In [25]:

benchmarks = [("Word2vec", ft.partial(run_conceptor_benchmark, model_str= 'word2vec')),    
             ("CN-Word2vec", ft.partial(run_conceptor_benchmark, model_str= 'all_words'))]

pearson_results_year_task = {}

for year_task in sts_all['year-task'].unique():
    print('STS-' + year_task)
    pearson_results_year_task['STS-' + year_task] = run_experiment(sts_by_year_task[year_task], benchmarks)  
    
pearson_results_year_task['SICK'] = run_experiment(sick_all, benchmarks) 
# pearson_results_year_task['TWITTER'] = run_experiment(twitter_all, benchmarks) 

# pearson_results_year_task['2015-answers_students'] = run_experiment(answers_students_2015, benchmarks) 

STS-2012-MSRvid
STS-2014-images
STS-2015-images
STS-2014-deft-forum
STS-2012-MSRpar
STS-2014-deft-news
STS-2013-headlines
STS-2014-headlines
STS-2015-headlines
STS-2016-headlines
STS-2017-track5.en-en
STS-2015-answers-forums
STS-2016-answer-answer
STS-2012-surprise.OnWN
STS-2013-FNWN
STS-2013-OnWN
STS-2014-OnWN
STS-2014-tweet-news
STS-2015-belief
STS-2016-plagiarism
STS-2016-question-question
STS-2012-SMTeuroparl


  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


STS-2012-surprise.SMTnews
STS-2016-postediting


In [26]:

# plt.rcParams['figure.figsize'] = (10,5)

pearson_results_year_task_df = pd.DataFrame(pearson_results_year_task)
pearson_results_year_task_df = pearson_results_year_task_df.transpose()
pearson_results_year_task_df = pearson_results_year_task_df.rename(columns={i:b[0] for i, b in enumerate(benchmarks)})

pearson_results_year_task_df.reindex(['STS-2012-MSRpar', 'STS-2012-MSRvid', 'STS-2012-surprise.OnWN', 'STS-2012-SMTeuroparl', 'STS-2012-surprise.SMTnews','STS-2013-FNWN', 'STS-2013-OnWN', 'STS-2013-headlines',  'STS-2014-OnWN', 'STS-2014-deft-forum','STS-2014-deft-news', 'STS-2014-headlines', 'STS-2014-tweet-news',  'STS-2014-images', 'STS-2015-answers-forums', '2015-answers_students', 'STS-2015-belief',  'STS-2015-headlines', 'STS-2015-images', 'SICK'])





Unnamed: 0,Word2vec,CN-Word2vec
STS-2012-MSRpar,41.61,41.0
STS-2012-MSRvid,76.44,75.61
STS-2012-surprise.OnWN,70.85,70.85
STS-2012-SMTeuroparl,31.48,32.61
STS-2012-surprise.SMTnews,53.25,53.2
STS-2013-FNWN,40.92,40.68
STS-2013-OnWN,68.17,68.74
STS-2013-headlines,64.71,64.34
STS-2014-OnWN,75.09,75.52
STS-2014-deft-forum,40.12,41.88


## Experiment 3: Concept Categorization


In the concept categorization task, we used k-means to cluster words into concept cate- gories based on their vector representations (for example, “bear” and “cat” belong to the concept category of animals). We use three standard datasets: (i) a rather small dataset ESSLLI 2008 (Baroni, Evert, and Lenci, 2008) that contains 44 concepts in 9 categories; (ii) the Almuhareb-Poesio (AP) (Poesio and Almuhareb, 2005), which contains 402 concepts divided into 21 categories; and (iii) the BM dataset (Bat- tig and Montague, 1969) that 5321 concepts divided into 56 categories. Note that the datasets of ESSLLI, AP, and BM are increasingly challenging for clustering algorithms, due to the increasing numbers of words and categories.


In [0]:
def calculate_purity(y_true, y_pred):
    """
    Calculate purity for given true and predicted cluster labels.
    Parameters
    ----------
    y_true: array, shape: (n_samples, 1)
      True cluster labels
    y_pred: array, shape: (n_samples, 1)
      Cluster assingment.
    Returns
    -------
    purity: float
      Calculated purity.
    """
    assert len(y_true) == len(y_pred)
    true_clusters = np.zeros(shape=(len(set(y_true)), len(y_true)))
    pred_clusters = np.zeros_like(true_clusters)
    for id, cl in enumerate(set(y_true)):
        true_clusters[id] = (y_true == cl).astype("int")
    for id, cl in enumerate(set(y_pred)):
        pred_clusters[id] = (y_pred == cl).astype("int")

    M = pred_clusters.dot(true_clusters.T)
    return 1. / len(y_true) * np.sum(np.max(M, axis=1))

def evaluateCategorization(thisDict_str, testDataset_csv, method = 'fixed'):
    
    categorizationFile = resourceFile + 'word-categorization/monolingual/en/' + testDataset_csv

    
    thisDict = eval(thisDict_str)
    modelVocab = list(thisDict.keys())

    categorty_list = []
    word_list = []

    with open(categorizationFile, newline='') as csvfile:
        next(csvfile)
        reader = csv.reader(csvfile, quotechar='|')
        for row in reader:
            if len(row[2]) != 0 and row[2] in modelVocab:
                categorty_list.append(row[1])
                word_list.append(row[2])


    wordVectorsMat = np.array([thisDict[word] for word in word_list])

    initCentroids = []
    for category in set(categorty_list):
        indicesCategory = [i for i in range(len(categorty_list)) if categorty_list[i]== category]
        initCentroid = np.mean(wordVectorsMat[indicesCategory, :], axis = 0)
        initCentroids.append(initCentroid)

    initCentroids = np.array(initCentroids)

    if method == 'fixed':
    
        predClusters = KMeans(init = initCentroids, n_clusters=len(set(categorty_list))).fit_predict(wordVectorsMat)
        purity = calculate_purity(np.array(categorty_list), predClusters)

    else:
        
        predClusters = KMeans(n_init=10000, n_clusters=len(set(categorty_list))).fit_predict(wordVectorsMat)
        purity= calculate_purity(np.array(categorty_list), predClusters)
        
        
    return purity

In [0]:
wordVecBrands_methods = ['cn_word2vec', 'cn_glove'] 
csvFile = ['battig.csv', 'ap.csv', 'essli-2008.csv']


c = list(itertools.product(csvFile, wordVecBrands_methods))


In [0]:
all_purity = []
for (csvFile, wordVecBrand_method) in c:
    print(wordVecBrand_method + '-' + csvFile)
    print(round(evaluateCategorization(wordVecBrand_method, csvFile) * 100,2) )

cn_word2vec-battig.csv


  return_n_iter=True)


60.19
cn_glove-battig.csv
67.63
cn_word2vec-ap.csv
89.31
cn_glove-ap.csv
90.95
cn_word2vec-essli-2008.csv
100.0
cn_glove-essli-2008.csv
100.0
