# Unsupervised Post-processing of Word Vectors via Conceptor Negation

In this notebook, we presents the experiment results reported in [1]/

[1] Unsupervised Post-processing of Word Vectors via Conceptor Negation. Tianlin Liu, Lyle Ungar, and João Sedoc, Unsupervised Post-processing of Word Vectors via Conceptor Negation, AAAI 2019.





In [1]:

import numpy as np
import scipy, requests, codecs, os, re, nltk, itertools, csv
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering, KMeans
import tensorflow as tf
from scipy.stats import spearmanr
import pandas as pd
import functools as ft

# resourceFile = '/Users/liutianlin/Desktop/Academics/NLP/data/' 
resourceFile = '/data/' # the address of the datasets


In [None]:
!pip install -q gdown
!gdown https://drive.google.com/uc?id=1U_UGB2vyTuTIcbV_oeDtJCtAtlFMvXOM # download a small subset of glove
!gdown https://drive.google.com/uc?id=1j_b4TRpL3f0HQ8mV17_CtOXp862YjxxB   # download a small subset of word2vec

## Load Word2Vec and GloVe word embeddings
 
We provide a small word2vec and small glove word embedding in this repository -- their words appear at least 200 times in wikipedia (see the list provided by Arora et al https://github.com/PrincetonML/SIF/blob/master/auxiliary_data/enwiki_vocab_min200.txt

In [2]:
def loadWordVecs(model_str):
    word_dictionary = {}
    
    input_file_destination = 'small_' + model_str + '.txt'

    f = codecs.open(input_file_destination, 'r', 'utf-8') 

    for line in f:

        line = line.split(" ", 1)   
        transformed_key = line[0].lower()

        try:
            transformed_key = str(transformed_key)

        except:
            print("Can't convert the key to unicode:", transformed_key)

        word_dictionary[transformed_key] = np.fromstring(line[1], dtype="float32", sep=" ")

        if word_dictionary[transformed_key].shape[0] != 300:
            print(transformed_key, word_dictionary[transformed_key].shape)

    return  word_dictionary     



orig_word2vec = loadWordVecs('word2vec')
print("loaded Word2vec!")

orig_glove = loadWordVecs('glove')
print("loaded GloVe Common Crawl!")


orig_model = {}
orig_model['word2vec'] = orig_word2vec
orig_model['glove'] = orig_glove


loaded Word2vec!
loaded GloVe Common Crawl!


## Post-process Word2Vec and GloVe with Conceptor Negation (CN)


In [3]:
def ensemble_cn_dict(wordVecModel_str, alpha = 2, orig_model = orig_model):
    
    
    # put the word vectors in columns
    x_collector = np.array(list(orig_model[wordVecModel_str].values())).T       
        
    
    nrWords = x_collector.shape[1] # number of total words
    
    
    R = x_collector.dot(x_collector.T) / nrWords # calculate the un-centered correlation matrix
    
    C = R @ np.linalg.inv(R + alpha ** (-2) * np.eye(300))# calculate the conceptor matrix
    
    vecMatrix = ((np.eye(300) - C) @ x_collector).T 

    cn_dict = {}
        
    for word_index in np.arange(0, len(orig_model[wordVecModel_str].keys())):
        
        word = list(orig_model[wordVecModel_str].keys())[word_index]
        cn_dict[word] = vecMatrix[word_index,:]
    
    return cn_dict

print("Post-processing Word2vec with CN")
cn_word2vec = ensemble_cn_dict('word2vec', orig_model = orig_model)

print("Post-processing GloVe with CN")
cn_glove = ensemble_cn_dict('glove', orig_model = orig_model)


Post-processing Word2vec with CN
Post-processing GloVe with CN


## Experiment 1: Word similarity evaluation
We evaluate the CN post-processed word vectors with 7 standard word similarity datasets: the RG65 (Rubenstein and Goodenough, 1965), the WordSim-353 (WS) (Finkelstein et al., 2002), the rare- words (RW) (Luong, Socher, and Manning, 2013), the MEN dataset (Bruni, Tran, and Baroni, 2014), the MTurk (Radinsky et al., 2011), the SimLex-999 (SimLex) (Hill, Reichart, and Korhonen, 2015), and the SimVerb-3500 (Gerz et al., 2016). 

To evaluate the word similarity, we calculate the cosine distance between vectors of two words. We report the Spearman’s rank correlation coefficient (Myers and Well, 1995) of the estimated rankings against the rankings given by human annotators.


In [4]:
dataSets = ['EN-RG-65.txt', 'EN-WS-353-ALL.txt', 'EN-RW-STANFORD.txt', 'EN-MEN-TR-3k.txt', 'EN-MTurk-287.txt', 'EN-SIMLEX-999.txt', 'EN-SimVerb-3500.txt']



def similarity_eval(dataSetAddress, wordVecModel_str):
    wordVecModel = eval(wordVecModel_str)
    vocab = set(list(wordVecModel.keys()))
    
    fread_simlex = open(dataSetAddress, "r")
    
    pair_list = []

    line_number = 0
    for line in fread_simlex:
#         if line_number > 0:
        tokens = line.split()
        word_i = tokens[0]
        word_j = tokens[1]
        score = float(tokens[2])
        if word_i in vocab and word_j in vocab:
            pair_list.append( ((word_i, word_j), score) )
#         line_number += 1

    pair_list.sort(key=lambda x: - x[1]) # order the pairs from highest score (most similar) to lowest score (least similar)


    extracted_scores = {}

    extracted_list = []
    
               
    for (x,y) in pair_list:
        (word_i, word_j) = x
        
        current_distance = 1- cosine_similarity( wordVecModel[word_i].reshape(1,-1)  , wordVecModel[word_j].reshape(1,-1) )        

        extracted_scores[(word_i, word_j)] = current_distance
        extracted_list.append(((word_i, word_j), current_distance))

    extracted_list.sort(key=lambda x: x[1])

    spearman_original_list = []
    spearman_target_list = []

    for position_1, (word_pair, score_1) in enumerate(pair_list):
        score_2 = extracted_scores[word_pair]
        position_2 = extracted_list.index((word_pair, score_2))
        spearman_original_list.append(position_1)
        spearman_target_list.append(position_2)

    spearman_rho = spearmanr(spearman_original_list, spearman_target_list)
    
    return spearman_rho[0]


In [5]:
wordSimResult = {}


for dataset in dataSets:
    dataSetAddress = resourceFile + 'wordSimData/' +  dataset
    print('evaluating the data set', dataset)
    
    print('Word2Vec + CN : %.4f' %  similarity_eval(dataSetAddress, 'cn_word2vec'))
    print('Glove + CN : %.4f' %  similarity_eval(dataSetAddress, 'cn_glove'))
        
    print('\n')
    

evaluating the data set EN-RG-65.txt
Word2Vec + CN : 0.7972
Glove + CN : 0.7913


evaluating the data set EN-WS-353-ALL.txt
Word2Vec + CN : 0.6926
Glove + CN : 0.7886


evaluating the data set EN-RW-STANFORD.txt
Word2Vec + CN : 0.5804
Glove + CN : 0.5898


evaluating the data set EN-MEN-TR-3k.txt
Word2Vec + CN : 0.7869
Glove + CN : 0.8339


evaluating the data set EN-MTurk-287.txt
Word2Vec + CN : 0.6662
Glove + CN : 0.7116


evaluating the data set EN-SIMLEX-999.txt
Word2Vec + CN : 0.4684
Glove + CN : 0.4858


evaluating the data set EN-SimVerb-3500.txt
Word2Vec + CN : 0.3830
Glove + CN : 0.3632




## Experiment 2:  Semantic Textual Similarity (STS) tasks

We use standard semantic textual similarity (STS) benchmarks to evaluate the post-processed word vectors: we use 2012-2015 SemEval STS tasks (Agirre et al., 2012, 2013, 2014, 2015) and 2012 SemEval Semantic Related task (SICK) (Marelli et al., 2014). 

We reuse the codes provided in  https://github.com/nlptown/nlp-notebooks/blob/master/Simple%20Sentence%20Similarity.ipynb



### Load datasets

In [10]:
def load_sts_dataset(filename):
    # For a STS dataset, loads the relevant information: the sentences and their human rated similarity score.
    sent_pairs = []
    with tf.gfile.GFile(filename, "r") as f:
        for line in f:
            ts = line.strip().split("\t")
            if len(ts) == 7 or len(ts) == 9:
                sent_pairs.append((re.sub("[^0-9]", "", ts[2]) + '-' + ts[1] , ts[5], ts[6], float(ts[4])))
            elif len(ts) == 6 or len(ts) == 8:
                sent_pairs.append((re.sub("[^0-9]", "", ts[1]) + '-' + ts[0] , ts[4], ts[5], float(ts[3])))
            else:
                print('data format is wrong!!!')
    return pd.DataFrame(sent_pairs, columns=["year-task", "sent_1", "sent_2", "sim"])


def load_all_sts_dataset():
    # Loads all of the STS datasets 
    stsbenchmarkDir = resourceFile + 'stsbenchmark/'
    stscompanionDir = resourceFile + 'stsbenchmark/'
    sts_train = load_sts_dataset(os.path.join(stsbenchmarkDir, "sts-train.csv"))    
    sts_dev = load_sts_dataset(os.path.join(stsbenchmarkDir, "sts-dev.csv"))
    sts_test = load_sts_dataset(os.path.join(stsbenchmarkDir, "sts-test.csv"))
    sts_other = load_sts_dataset(os.path.join(stscompanionDir, "sts-other.csv"))
    sts_mt = load_sts_dataset(os.path.join(stscompanionDir, "sts-mt.csv"))
    
    sts_all = pd.concat([sts_train, sts_dev, sts_test, sts_other, sts_mt ])
    
    return sts_all

sts_all = load_all_sts_dataset()





def load_sts_by_year_task():
    # Divide STS datasets based on their year and tasks
    sts_by_year_task = {}
    
    for year_task in sts_all['year-task'].unique():
        indices = [i for i, x in enumerate(list(sts_all['year-task'])) if x == year_task]
        
        pairs = sts_all.iloc[indices]
        
        sts_by_year_task[year_task] = pairs
        
    return sts_by_year_task

sts_by_year_task = load_sts_by_year_task()




def load_sts_by_year():
    # Divide STS datasets ONLY based on their year (different tasks in that year are merged).

    sts_by_year = {}
    
    for year in ['2012', '2013', '2014', '2015', '2016', '2017']:
        indices = [i for i, x in enumerate(list(sts_all['year-task'])) if x.startswith(year)]
        
        pairs = sts_all.iloc[indices]
        pairs = pairs.copy()
        pairs['year-task'] = year
        sts_by_year[year] = pairs
        
    return sts_by_year

sts_by_year_task = load_sts_by_year_task()

sts_by_year = load_sts_by_year()


filename = resourceFile + '2015-answers-students.test.tsv'
sent_pairs = []
with tf.gfile.GFile(filename, "r") as f:
    for line in f:
        ts = line.strip().split("\t")
        if len(ts) == 3:
            sent_pairs.append((ts[1], ts[2], float(ts[0])))
answers_students_2015 =  pd.DataFrame(sent_pairs, columns=["sent_1", "sent_2", "sim"])


# show some sample sts data    
sts_all[:5] 


Unnamed: 0,year-task,sent_1,sent_2,sim
0,2012-MSRvid,A plane is taking off.,An air plane is taking off.,5.0
1,2012-MSRvid,A man is playing a large flute.,A man is playing a flute.,3.8
2,2012-MSRvid,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...,3.8
3,2012-MSRvid,Three men are playing chess.,Two men are playing chess.,2.6
4,2012-MSRvid,A man is playing the cello.,A man seated is playing the cello.,4.25


In [11]:


def download_sick(f): 

    response = requests.get(f).text

    lines = response.split("\n")[1:]
    lines = [l.split("\t") for l in lines if len(l) > 0]
    lines = [l for l in lines if len(l) == 5]

    df = pd.DataFrame(lines, columns=["idx", "sent_1", "sent_2", "sim", "label"])
    df['sim'] = pd.to_numeric(df['sim'])
    return df
    
sick_all = download_sick("https://raw.githubusercontent.com/alvations/stasis/master/SICK-data/SICK_test_annotated.txt")

sick_all[:5]

Unnamed: 0,idx,sent_1,sent_2,sim,label
0,6,There is no boy playing outdoors and there is ...,A group of kids is playing in a yard and an ol...,3.3,NEUTRAL\r
1,7,A group of boys in a yard is playing and a man...,The young boys are playing outdoors and the ma...,3.7,NEUTRAL\r
2,8,A group of children is playing in the house an...,The young boys are playing outdoors and the ma...,3.0,NEUTRAL\r
3,10,A brown dog is attacking another animal in fro...,A brown dog is attacking another animal in fro...,4.9,ENTAILMENT\r
4,11,A brown dog is attacking another animal in fro...,A brown dog is helping another animal in front...,3.665,NEUTRAL\r


### Some preparation for STS evaluation

In [12]:

class Sentence:
    
    def __init__(self, sentence):
        self.raw = sentence
        normalized_sentence = sentence.replace("‘", "'").replace("’", "'")
        self.tokens = [t.lower() for t in nltk.word_tokenize(normalized_sentence)]
        
def run_conceptor_benchmark(sentences1, sentences2, model_str): 
    
    model = eval(model_str)
    embeddings = []


    for (sent1, sent2) in zip(sentences1, sentences2): 

        tokens1 =  sent1.tokens
        tokens2 =  sent2.tokens

        tokens1 = [token for token in tokens1 if token in model and token.islower()]
        tokens2 = [token for token in tokens2 if token in model and token.islower()]

        embedding1 = np.average([model[token] for token in tokens1], axis=0)
        embedding2 = np.average([model[token] for token in tokens2], axis=0)



        if isinstance(embedding1, float) or isinstance(embedding2, float):
            embeddings.append(np.zeros(300))
            embeddings.append(np.zeros(300))
        else:
            embeddings.append(embedding1)
            embeddings.append(embedding2)



    sims = [cosine_similarity(embeddings[idx*2].reshape(1, -1), embeddings[idx*2+1].reshape(1, -1))[0][0] for idx in range(int(len(embeddings)/2))]
    return sims

def run_experiment(df, benchmarks): 
    
    sentences1 = [Sentence(s) for s in df['sent_1']]
    sentences2 = [Sentence(s) for s in df['sent_2']]
    
    pearson_cors, spearman_cors = [], []
    for label, method in benchmarks:
        sims = method(sentences1, sentences2)
        pearson_correlation = round(scipy.stats.pearsonr(sims, df['sim'])[0] * 100,2)
        #print(label, pearson_correlation)
        pearson_cors.append(pearson_correlation)
        
    return pearson_cors

### Do STS evaluation

Note that results below are a bit different from what has been reported in the appendix of our paper because we are using a small word2vec for demonstration purpose here -- there are more out-of-vocabulary words. 

In [13]:

benchmarks = [("CN-glove", ft.partial(run_conceptor_benchmark, model_str= 'cn_glove')),    
             ("CN-word2vec", ft.partial(run_conceptor_benchmark, model_str= 'cn_word2vec'))]

pearson_results_year_task = {}

for year_task in sts_all['year-task'].unique():
    print('STS-' + year_task)
    pearson_results_year_task['STS-' + year_task] = run_experiment(sts_by_year_task[year_task], benchmarks)  
    
pearson_results_year_task['SICK'] = run_experiment(sick_all, benchmarks) 
# pearson_results_year_task['TWITTER'] = run_experiment(twitter_all, benchmarks) 

pearson_results_year_task['2015-answers_students'] = run_experiment(answers_students_2015, benchmarks) 

STS-2012-MSRvid
STS-2014-images
STS-2015-images
STS-2014-deft-forum
STS-2012-MSRpar
STS-2014-deft-news
STS-2013-headlines
STS-2014-headlines
STS-2015-headlines
STS-2016-headlines
STS-2017-track5.en-en
STS-2015-answers-forums
STS-2016-answer-answer
STS-2012-surprise.OnWN
STS-2013-FNWN
STS-2013-OnWN
STS-2014-OnWN
STS-2014-tweet-news
STS-2015-belief
STS-2016-plagiarism
STS-2016-question-question
STS-2012-SMTeuroparl


  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


STS-2012-surprise.SMTnews
STS-2016-postediting


In [14]:

# plt.rcParams['figure.figsize'] = (10,5)

pearson_results_year_task_df = pd.DataFrame(pearson_results_year_task)
pearson_results_year_task_df = pearson_results_year_task_df.transpose()
pearson_results_year_task_df = pearson_results_year_task_df.rename(columns={i:b[0] for i, b in enumerate(benchmarks)})

pearson_results_year_task_df.reindex(['STS-2012-MSRpar', 'STS-2012-MSRvid', 'STS-2012-surprise.OnWN', 'STS-2012-SMTeuroparl', 'STS-2012-surprise.SMTnews','STS-2013-FNWN', 'STS-2013-OnWN', 'STS-2013-headlines',  'STS-2014-OnWN', 'STS-2014-deft-forum','STS-2014-deft-news', 'STS-2014-headlines', 'STS-2014-tweet-news',  'STS-2014-images', 'STS-2015-answers-forums', '2015-answers_students', 'STS-2015-belief',  'STS-2015-headlines', 'STS-2015-images', 'SICK'])





Unnamed: 0,CN-glove,CN-word2vec
STS-2012-MSRpar,41.27,40.37
STS-2012-MSRvid,62.5,75.22
STS-2012-surprise.OnWN,67.87,70.82
STS-2012-SMTeuroparl,52.58,35.14
STS-2012-surprise.SMTnews,47.69,50.08
STS-2013-FNWN,42.03,43.99
STS-2013-OnWN,57.45,68.76
STS-2013-headlines,67.0,64.78
STS-2014-OnWN,66.43,75.08
STS-2014-deft-forum,37.57,42.8


## Experiment 3: Concept Categorization


In the concept categorization task, we used k-means to cluster words into concept cate- gories based on their vector representations (for example, “bear” and “cat” belong to the concept category of animals). We use three standard datasets: (i) a rather small dataset ESSLLI 2008 (Baroni, Evert, and Lenci, 2008) that contains 44 concepts in 9 categories; (ii) the Almuhareb-Poesio (AP) (Poesio and Almuhareb, 2005), which contains 402 concepts divided into 21 categories; and (iii) the BM dataset (Bat- tig and Montague, 1969) that 5321 concepts divided into 56 categories. Note that the datasets of ESSLLI, AP, and BM are increasingly challenging for clustering algorithms, due to the increasing numbers of words and categories.


In [15]:
def calculate_purity(y_true, y_pred):
    """
    Calculate purity for given true and predicted cluster labels.
    Parameters
    ----------
    y_true: array, shape: (n_samples, 1)
      True cluster labels
    y_pred: array, shape: (n_samples, 1)
      Cluster assingment.
    Returns
    -------
    purity: float
      Calculated purity.
    """
    assert len(y_true) == len(y_pred)
    true_clusters = np.zeros(shape=(len(set(y_true)), len(y_true)))
    pred_clusters = np.zeros_like(true_clusters)
    for id, cl in enumerate(set(y_true)):
        true_clusters[id] = (y_true == cl).astype("int")
    for id, cl in enumerate(set(y_pred)):
        pred_clusters[id] = (y_pred == cl).astype("int")

    M = pred_clusters.dot(true_clusters.T)
    return 1. / len(y_true) * np.sum(np.max(M, axis=1))

def evaluateCategorization(thisDict_str, testDataset_csv, method = 'fixed'):
    
    categorizationFile = resourceFile + 'word-categorization/monolingual/en/' + testDataset_csv

    
    thisDict = eval(thisDict_str)
    modelVocab = list(thisDict.keys())

    categorty_list = []
    word_list = []

    with open(categorizationFile, newline='') as csvfile:
        next(csvfile)
        reader = csv.reader(csvfile, quotechar='|')
        for row in reader:
            if len(row[2]) != 0 and row[2] in modelVocab:
                categorty_list.append(row[1])
                word_list.append(row[2])


    wordVectorsMat = np.array([thisDict[word] for word in word_list])

    initCentroids = []
    for category in set(categorty_list):
        indicesCategory = [i for i in range(len(categorty_list)) if categorty_list[i]== category]
        initCentroid = np.mean(wordVectorsMat[indicesCategory, :], axis = 0)
        initCentroids.append(initCentroid)

    initCentroids = np.array(initCentroids)

    if method == 'fixed':
    
        predClusters = KMeans(init = initCentroids, n_clusters=len(set(categorty_list))).fit_predict(wordVectorsMat)
        purity = calculate_purity(np.array(categorty_list), predClusters)

    else:
        
        predClusters = KMeans(n_init=10000, n_clusters=len(set(categorty_list))).fit_predict(wordVectorsMat)
        purity= calculate_purity(np.array(categorty_list), predClusters)
        
        
    return purity

In [16]:
wordVecBrands_methods = ['cn_word2vec', 'cn_glove'] 
csvFile = ['battig.csv', 'ap.csv', 'essli-2008.csv']


c = list(itertools.product(csvFile, wordVecBrands_methods))


In [17]:
all_purity = []
for (csvFile, wordVecBrand_method) in c:
    print(wordVecBrand_method + '-' + csvFile)
    print(round(evaluateCategorization(wordVecBrand_method, csvFile) * 100,2) )

cn_word2vec-battig.csv


  return_n_iter=True)


60.19
cn_glove-battig.csv
67.63
cn_word2vec-ap.csv
89.31
cn_glove-ap.csv
90.95
cn_word2vec-essli-2008.csv
100.0
cn_glove-essli-2008.csv
100.0
