In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle
import gensim
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
import csv
from csv import reader
from scipy import spatial

import functools
from collections import Counter
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root_dir, name) for name in file_names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
def intersection_align_gensim(m1, m2, words=None):
    """
    Intersect the vocabularies of two gensim word2vec models, in place.

    Only the vocabulary shared by m1 and m2 is kept. If 'words' is set (as list or set),
    the shared vocabulary is additionally intersected with it.
    Rows are re-organized from 0..N in order of descending frequency (= sum of the "count"
    attribute from both m1 and m2; ties are broken alphabetically so the order is
    reproducible). Afterwards, for both models:
        -- row i of m1.wv.vectors is for the same word as row i of m2.wv.vectors
        -- m.wv.index_to_key / m.wv.key_to_index describe the new row order
        -- per-key attribute arrays in m.wv.expandos (e.g. "count") follow the new row order

    Parameters:
        m1, m2: models exposing a `.wv` keyed-vector object.
        words:  optional iterable of words to restrict the shared vocabulary to.

    Returns:
        The tuple (m1, m2) -- the same objects that were passed in, modified in place.
    """
    # Get the vocab for each model
    vocab_m1 = set(m1.wv.index_to_key)
    vocab_m2 = set(m2.wv.index_to_key)

    # Find the common vocabulary
    common_vocab = vocab_m1 & vocab_m2
    if words:
        common_vocab &= set(words)

    # No work is needed only when the vocabularies are identical AND already in the same
    # row order; identical sets in a different order would leave the rows misaligned.
    same_vocab = not vocab_m1 - common_vocab and not vocab_m2 - common_vocab
    if same_vocab and list(m1.wv.index_to_key) == list(m2.wv.index_to_key):
        return (m1, m2)

    # Otherwise sort by frequency (summed for both), most frequent first.
    # This must happen before any re-indexing below, because get_vecattr() resolves
    # counts through the current key_to_index mapping.
    common_vocab = sorted(
        common_vocab,
        key=lambda w: (-(m1.wv.get_vecattr(w, "count") + m2.wv.get_vecattr(w, "count")), w),
    )

    # Then for each model...
    for m in (m1, m2):
        old_size = len(m.wv.index_to_key)
        # Old row index of every kept word, in the new order. The explicit integer dtype
        # keeps the result 2-D (0 x dim) even when nothing is shared.
        row_order = np.array([m.wv.key_to_index[w] for w in common_vocab], dtype=np.intp)

        # Replace the vectors array with the rows of the common vocab (fancy indexing copies)
        m.wv.vectors = m.wv.vectors[row_order]

        # Per-key attributes are stored as arrays indexed by row; re-order them as well,
        # otherwise get_vecattr(word, "count") would return another word's count.
        expandos = getattr(m.wv, "expandos", None)
        if expandos:
            for attr, values in list(expandos.items()):
                if values is not None and len(values) == old_size:
                    expandos[attr] = np.asarray(values)[row_order]

        # Any cached vector norms belong to the old row layout; drop them so they are
        # recomputed on demand.
        if getattr(m.wv, "norms", None) is not None:
            m.wv.norms = None

        # Replace the old key<->index mappings with ones for the common vocab
        m.wv.key_to_index = {key: new_index for new_index, key in enumerate(common_vocab)}
        m.wv.index_to_key = list(common_vocab)

        print(len(m.wv.key_to_index), len(m.wv.vectors))
        # The previous test chained the comparisons with the bitwise `|` operator, which
        # binds tighter than `==` and therefore never matched.
        if len(m.wv.key_to_index) in (7, 3, 1):
            print('Common vocab', common_vocab)
    return (m1, m2)

In [3]:
# Function to align two spaces with orthogonal Procrustes
def smart_procrustes_align_gensim(base_embed, other_embed, words=None):
    """
    Original script: https://gist.github.com/quadrismegistus/09a93e219a6ffc4f216fb85235535faf
    Procrustes align two gensim word2vec models (to allow for comparison between same word across models).
    Code ported from HistWords <https://github.com/williamleif/histwords> by William Hamilton <wleif@stanford.edu>.

    First, intersect the vocabularies (see `intersection_align_gensim` documentation);
    this modifies BOTH models in place.
    Then compute the orthogonal matrix that best maps other_embed's unit-normalized vectors
    onto base_embed's, and replace other_embed.wv.vectors with the rotated vectors.
    If `words` is set, intersect the two models' vocabulary with the vocabulary in words
    (see `intersection_align_gensim` documentation).

    Returns other_embed. If the models share no vocabulary there is nothing to estimate a
    rotation from, so other_embed is returned without being rotated.
    """

    # make sure vocabulary and indices are aligned
    in_base_embed, in_other_embed = intersection_align_gensim(base_embed, other_embed, words=words)

    # With an empty shared vocabulary the matrices below would be empty; skip the algebra
    # instead of only skipping the norm refresh and then operating on empty arrays.
    if len(in_base_embed.wv.key_to_index) == 0:
        return other_embed

    # re-filling the normed vectors, since the rows were just replaced
    in_base_embed.wv.fill_norms(force=True)
    in_other_embed.wv.fill_norms(force=True)

    # get the (normalized) embedding matrices
    base_vecs = in_base_embed.wv.get_normed_vectors()
    other_vecs = in_other_embed.wv.get_normed_vectors()

    # just a matrix dot product with numpy
    m = other_vecs.T.dot(base_vecs)

    # SVD method from numpy; the third return value is already the transposed factor,
    # so u.dot(v) below is the orthogonal Procrustes solution
    u, _, v = np.linalg.svd(m)

    # another matrix operation
    ortho = u.dot(v)

    # Replace original array with modified one, i.e. multiplying the embedding matrix by "ortho"
    other_embed.wv.vectors = (other_embed.wv.vectors).dot(ortho)
    return other_embed

In [4]:
%%time

# Load 680 Word2Vec models of MPs in T1 & T2

# Maps each model's file name to the loaded Word2Vec model.
MpTimeDict = {}
folderPaths = ['/kaggle/input/modelsbympandtime-680/1_ModelsByMpTime/kaggle/working/models-by-mp-and-time/', '/kaggle/input/modelsbympandtime-680/2_ModelsByMpTime/kaggle/working/models-by-mp-and-time/']
for folderPath in folderPaths:
    # os.listdir() yields entries in arbitrary order. Later cells pick models by position
    # (list(MpTimeDict.values())[0], [1], ...), so sort to make those picks reproducible.
    for file in sorted(os.listdir(folderPath)):
        filePath = os.path.join(folderPath, file)
        # NOTE(review): Word2Vec.load unpickles the file -- only load models from trusted sources.
        model = gensim.models.Word2Vec.load(filePath)
        # NOTE(review): a file name present in both folders would overwrite the earlier entry.
        MpTimeDict[file] = model

In [5]:
# Number of models that were loaded.
len(MpTimeDict)

In [7]:
%%time
# Align every model into one shared space.
# The earlier version repeated the whole pairwise chain len(models) - 1 times (about 5 hours):
# a single chained pass only spreads the shrinking shared vocabulary one model backwards,
# so that many passes were needed before all models agreed. Computing the vocabulary shared
# by ALL models first and passing it as `words` lets one pass do the same job.
modelsToAlign = list(MpTimeDict.values())
if len(modelsToAlign) > 1:
    commonVocab = set(modelsToAlign[0].wv.index_to_key)
    for candidate in modelsToAlign[1:]:
        commonVocab.intersection_update(candidate.wv.index_to_key)
    if not commonVocab:
        # An empty `words` argument would be ignored by the aligner, so fail loudly here.
        raise ValueError('The models share no common vocabulary; nothing to align.')
    functools.reduce(
        lambda base, other: smart_procrustes_align_gensim(base, other, words=commonVocab),
        modelsToAlign,
    )

In [14]:
# Code to check if models are aligned 
%%time
count = 0 
for j, i in enumerate(modelsToAlign):
    if(count%100==0):
        print('count', count)
    if(len(i.wv.index_to_key)!=135):
        print(count, len(i.wv.index_to_key))
    if(list(i.wv.index_to_key).sort()!=list(modelsToAlign[j-1].wv.index_to_key).sort()):
        print('Not equal',count)
    count +=1

In [17]:
# exist_ok=True keeps this cell re-runnable: without it a second run raises FileExistsError.
os.makedirs('./aligned-models-by-mp-and-time', exist_ok=True)

In [18]:
%%time
# Next step is to save aligned models 

# Write every aligned model to disk under a sequentially numbered name (1-based).
aligned_models_folder = './aligned-models-by-mp-and-time'

for c, mod in enumerate(modelsToAlign):
    model_number = c + 1
    # progress message on every 50th model
    if not c % 50:
        print('Saving model number', model_number)
    modelName = 'aligned_model_' + str(model_number)
    target_path = os.path.join(aligned_models_folder, modelName)
    mod.save(target_path)

In [20]:
%%time
#  ------- Final list of models saved after alignment ---------

# ------------ new cell --------
# Shell escape: recursively (-r) pack the aligned-models folder into file.zip.
!zip -r file.zip /kaggle/working/aligned-models-by-mp-and-time


In [6]:
%%time
#Align models

# Fold the pairwise aligner over all loaded models (each model is aligned to its predecessor);
# the cell's value is the last aligned model.
functools.reduce(smart_procrustes_align_gensim, MpTimeDict.values())

In [7]:
# Show the nearest neighbours of 'brexit' for the first two models only.
for count, model in enumerate(MpTimeDict.values()):
    if count >= 2:
        break
    print(model.wv.most_similar('brexit'))
    print(' -x -x -x -x -x -x -x -x -x -x -x -x -x')
        

In [8]:
def cosine_similarity(word, base_model=None, other_model=None):
    """
    Cosine similarity between the vectors of `word` in two models.

    Parameters:
        word:        key to look up in both models' `.wv`.
        base_model:  first model; defaults to the notebook-level `model1`.
        other_model: second model; defaults to the notebook-level `model2`.

    Returns:
        1 minus scipy's cosine distance between the two vectors.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if base_model is None:
        base_model = model1
    if other_model is None:
        other_model = model2
    return 1 - spatial.distance.cosine(base_model.wv[word], other_model.wv[word])

In [9]:
# Take the first two loaded models as the pair to compare.
loaded_models = list(MpTimeDict.values())
model1 = loaded_models[0]
model2 = loaded_models[1]

In [10]:
# Align model2 onto model1 (no extra vocabulary restriction); the cell shows the aligned model2.
smart_procrustes_align_gensim(model1, model2)

In [14]:
# One row per word of model1's vocabulary: its cross-model cosine similarity and its "count"
# attribute in each of the two models.
similarity_rows = [
    [
        w,
        cosine_similarity(w),
        model1.wv.get_vecattr(w, "count"),
        model2.wv.get_vecattr(w, "count"),
    ]
    for w in model1.wv.index_to_key
]
cosine_similarity_df = pd.DataFrame(
    similarity_rows,
    columns=('Word', 'Cosine_similarity', "Frequency_t1", "Frequency_t2"),
)
cosine_similarity_df

In [102]:
# Pick the first five loaded models as a small test set for the alignment experiments.
l1 = [list(MpTimeDict.values())[position] for position in range(5)]
m1, m2, m3, m4, m5 = l1

#Before alignment
for m in l1:
    vocab_size = len(m.wv.index_to_key)
    print(vocab_size)

In [112]:
%%time
# One chained pass over the five test models; the cell's value is the last aligned model.
last_aligned = functools.reduce(smart_procrustes_align_gensim, l1)
last_aligned

In [107]:
# Vocabulary size of each of the five test models.
print('Lengths of model vocabs now ', *(len(mdl.wv.index_to_key) for mdl in l1))
# list.sort() returns None, so the old chained `.sort() == .sort()` comparison always printed
# True; compare sorted copies of the vocabularies instead.
sorted_vocabs = [sorted(mdl.wv.index_to_key) for mdl in l1]
print(all(vocab == sorted_vocabs[0] for vocab in sorted_vocabs))


In [109]:
# Vocabulary size of each of the five test models.
print('Lengths of model vocabs now ', *(len(mdl.wv.index_to_key) for mdl in l1))
# list.sort() returns None, so the old chained `.sort() == .sort()` comparison always printed
# True; compare sorted copies of the vocabularies instead.
sorted_vocabs = [sorted(mdl.wv.index_to_key) for mdl in l1]
print(all(vocab == sorted_vocabs[0] for vocab in sorted_vocabs))


In [111]:
# Vocabulary size of each of the five test models.
print('Lengths of model vocabs now ', *(len(mdl.wv.index_to_key) for mdl in l1))
# list.sort() returns None, so the old chained `.sort() == .sort()` comparison always printed
# True; compare sorted copies of the vocabularies instead.
sorted_vocabs = [sorted(mdl.wv.index_to_key) for mdl in l1]
print(all(vocab == sorted_vocabs[0] for vocab in sorted_vocabs))


In [113]:
# Vocabulary size of each of the five test models.
print('Lengths of model vocabs now ', *(len(mdl.wv.index_to_key) for mdl in l1))
# list.sort() returns None, so the old chained `.sort() == .sort()` comparison always printed
# True; compare sorted copies of the vocabularies instead.
sorted_vocabs = [sorted(mdl.wv.index_to_key) for mdl in l1]
print(all(vocab == sorted_vocabs[0] for vocab in sorted_vocabs))


In [97]:
%%time

# alignment 1
# Repeats the pairwise chain so that every model ends up with the same vocabulary.
# A single chained pass only carries the shrinking shared vocabulary one model backwards,
# hence len(l1) - 1 passes. The loop used to iterate over the 2-tuple (0, len(l1)-1), which
# always gave exactly two passes no matter how many models there are.
for i in range(len(l1) - 1):
    for count, m in enumerate(l1):
        if count < len(l1) - 1:
            # align the next model onto the current one (both get trimmed to their shared vocab)
            smart_procrustes_align_gensim(m, l1[count + 1])
            print('Lengths of model vocabs now ' + 'm' + str(count+1) + ' ' + str(len(m.wv.index_to_key)) + ' m' + str(count+2) + ' ' + str(len(l1[count+1].wv.index_to_key)))
        else:
            print('On last so doing nothing')


In [100]:
# True when all five test models have the same vocabulary (ignoring row order).
# list.sort() returns None, so the former chained `.sort() == .sort()` expression was always
# True; compare sorted copies instead.
sorted_vocabs = [sorted(mdl.wv.index_to_key) for mdl in l1]
all(vocab == sorted_vocabs[0] for vocab in sorted_vocabs)

In [85]:
# alignment 1
# Single chained pass: align each model's successor onto it and report both vocab sizes.
last_position = len(l1) - 1
for count, m in enumerate(l1):
    if count >= last_position:
        print('On last so doing nothing')
        continue
    successor = l1[count + 1]
    smart_procrustes_align_gensim(m, successor)
    print(''.join([
        'Lengths of model vocabs now ', 'm', str(count + 1), ' ', str(len(m.wv.index_to_key)),
        ' m', str(count + 2), ' ', str(len(successor.wv.index_to_key)),
    ]))


In [86]:
#After alignment
# Vocabulary size of each test model, one per line.
for m in l1:
    vocab_size = len(m.wv.index_to_key)
    print(vocab_size)

In [25]:
def align_years(years):
    """
    Chain-align a sequence of models: every model after the first is aligned onto the
    (already aligned) model before it via `smart_procrustes_align_gensim`.

    After handling each model, the vocabulary size of the current base model is printed
    twice on one line (base and aligned model are the same object at that point).

    Parameters:
        years: iterable of models exposing `.wv.index_to_key`.

    Returns:
        None; the models are modified in place.
    """
    base_embed = None
    for position, year_embed in enumerate(years):
        if position == 0:
            # the first model is the reference and is left untouched
            base_embed = year_embed
        else:
            base_embed = smart_procrustes_align_gensim(base_embed, year_embed)
        vocab_size = len(base_embed.wv.index_to_key)
        print(vocab_size, vocab_size)

In [36]:
# Chain-align the five test models in place.
align_years(years=l1)

In [38]:
# Compare the key order (index_to_key) of each neighbouring pair among m1..m5.
neighbour_pairs = ((m1, m2), (m2, m3), (m3, m4), (m4, m5))
print(
    'Are the index to keys the same',
    *(left.wv.index_to_key == right.wv.index_to_key for left, right in neighbour_pairs),
)

In [15]:
# Vocabulary sizes of m1..m4.
print(
    'Lengths of models before any alignment',
    *(len(mdl.wv.index_to_key) for mdl in (m1, m2, m3, m4)),
)

In [43]:
#Experiment 1
# Align m2 onto m1; `rand` is the returned (aligned) m2.
rand = smart_procrustes_align_gensim(m1, m2)

In [47]:
#Experiment 1
# Vocabulary sizes and key-order equality for m1, m2 and the returned model.
print(
    'Lengths of m1,m2 and rand after alignment',
    *(len(mdl.wv.index_to_key) for mdl in (m1, m2, rand)),
)
print(
    'Are the index to keys the same',
    *(left.wv.index_to_key == right.wv.index_to_key for left, right in ((m1, m2), (m2, rand))),
)

In [48]:
#Experiment 1
# Align m3 onto m2; `dand` is the returned (aligned) m3.
dand = smart_procrustes_align_gensim(m2, m3)

In [49]:
#Experiment 1
# Vocabulary sizes and key-order equality for m2, m3 and the returned model.
print(
    'Lengths of m2,m3 and dand after alignment',
    *(len(mdl.wv.index_to_key) for mdl in (m2, m3, dand)),
)
print(
    'Are the index to keys the same',
    *(left.wv.index_to_key == right.wv.index_to_key for left, right in ((m2, m3), (m3, dand))),
)

In [50]:
#Experiment 1
# Align m3 onto m1; `nand` is the returned (aligned) m3.
nand = smart_procrustes_align_gensim(m1, m3)

In [52]:
#Experiment 1
# Vocabulary sizes of m1, m2 and m3. The label used to read "m2,m3 and dand" (copied from an
# earlier cell) although the values printed are those of m1, m2 and m3.
print('Lengths of m1,m2 and m3 after alignment', len(m1.wv.index_to_key),len(m2.wv.index_to_key),len(m3.wv.index_to_key))
# Pairwise key-order equality: (m1, m2), (m2, m3), (m1, m3)
print('Are the index to keys the same', (m1.wv.index_to_key==m2.wv.index_to_key),(m2.wv.index_to_key==m3.wv.index_to_key),(m1.wv.index_to_key==m3.wv.index_to_key))

In [12]:
# Check if models have been aligned! 
# They should have the same vocab and the same vocab length

# Do the first two loaded models list their keys in the same order?
first_vocab = list(MpTimeDict.values())[0].wv.index_to_key
second_vocab = list(MpTimeDict.values())[1].wv.index_to_key
first_vocab == second_vocab
#model1.wv.index_to_key == model2.wv.index_to_key


In [11]:
# Display the first loaded model.
first_loaded_model = list(MpTimeDict.values())[0]
first_loaded_model

In [16]:
# Bind the first five loaded models to m1..m5.
m1, m2, m3, m4, m5 = (list(MpTimeDict.values())[position] for position in range(5))

In [14]:
# Align m2 onto m1; the cell shows the aligned m2.
smart_procrustes_align_gensim(m1, m2)

In [35]:
# Does m1 list its keys in the same order as the first model in MpTimeDict?
m1_vocab = m1.wv.index_to_key
first_loaded_vocab = list(MpTimeDict.values())[0].wv.index_to_key
m1_vocab == first_loaded_vocab


In [33]:
# Do m1 and m4 list their keys in the same order?
m1_vocab, m4_vocab = m1.wv.index_to_key, m4.wv.index_to_key
m1_vocab == m4_vocab


In [30]:
# Align m4 onto m1; the cell shows the aligned m4.
smart_procrustes_align_gensim(m1, m4)