In [18]:
!pip install researchpy

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle
import gensim
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
import csv
from csv import reader
from scipy import spatial
import functools

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split

import imblearn
from imblearn.under_sampling import RandomUnderSampler

import researchpy as rp
import scipy.stats as stats
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [8]:
%%time 
# Load aligned 24 Word2Vec models of MPs in T1 & T2

# Maps each file name found in folderPath to the Word2Vec model loaded from it.
dictOfModels = {}
folderPath = '/kaggle/input/aligned24mptimemodels/kaggle/working/24-aligned-models-by-mp-and-time'

# Load every entry of the folder as a gensim Word2Vec model.
# The listing is sorted so the dict (and anything that iterates it later)
# has the same order on every run; os.listdir makes no ordering guarantee.
# NOTE(review): every directory entry is passed to Word2Vec.load -- confirm the
# folder contains only model files.
for file in sorted(os.listdir(folderPath)):
    filePath = os.path.join(folderPath, file)
    model = gensim.models.Word2Vec.load(filePath)
    dictOfModels[file] = model

In [9]:
# Words that receive the 'change' label in the semanticDifference column built
# further down in this notebook.
# presumably a hand-picked, Brexit-related vocabulary -- TODO confirm provenance
change = ['exiting', 'seaborne', 'eurotunnel', 'withdrawal', 'departures', 'unicorn', 'remainers', 'exit', 'surrender',
          'departure', 'triggering', 'stockpiling', 'expulsion', 'blindfold', 'cliff', 'lighter', 'exits', 'triggered',
          'brexiteer', 'soft', 'plus', 'trigger', 'backroom', 'invoked', 'protesting', 'brexit', 'edge', 'canary', 
          'unicorns', 'withdrawing', 'invoking', 'withdrawn', 'manor', 'brexiteers', 'fanatics', 'postponement', 
          'currencies', 'currency', 'operability', 'operable', 'leavers', 'invoke', 'article', 'eurozone', 'clueless',
          'surrendered', 'cake', 'red', 'euroscepticism', 'prorogation', 'lining', 'gove', 'norway', 'deflationary',
          'moribund', 'eurosceptic', 'deutschmark', 'courting', 'deal', 'withdraw', 'dab', 'withdrawals', 'eurosceptics',
          'surrendering', 'aldous', 'lanarkshire', 'leaving', 'signifying', 'roofs', 'ceded', 'absentia', 'treachery',
          'dollar', 'canada', 'pragmatist', 'oven', 'ready', 'brexiters', 'control', 'capitulation', 'leave', 'referendum',
          'agreement', 'prorogue', 'smoothest', 'depreciate', 'managed', 'mutiny', 'overvalued', 'ideologues', 'foreign',
          'eec', 'war', 'prorogued', 'hannan', 'appease', 'pendolino', 'southbound', 'left', 'line', 'hard', 'bill']
 
# Words that receive the 'no_change' label in the semanticDifference column
# built further down in this notebook.
# presumably common, semantically stable parliamentary vocabulary -- TODO confirm
no_change = ['prime', 'even', 'parliament', 'care', 'well', 'constituency', 'tax', 'children',
             'business', 'report', 'case', 'sure', 'like', 'see', 'state', 'order', 'back', 'new', 'hope', 'local',
             'secretary', 'public', 'right', 'much', 'say', 'first', 'minister', 'look', 'system', 'whether', 
             'members', 'million', 'good', 'today', 'services', 'clear', 'help', 'time', 'place', 'put', 'last', 'must', 'money', 'one', 
             'way', 'work', 'would', 'think', 'two', 'great', 'could', 'lady', 'us', 'come', 'however', 'may', 'going', 'go',
             'given', 'year', 'might', 'part', 'get', 'make', 'point', 'committee', 'years', 'also', 'know',
             'government', 'take', 'house', 'agree', 'member', 'number', 'across', 'made', 'give', 'gentleman', 'important', 'said',
             'people', 'issue', 'support', 'ensure']

# Every word to score: the 'change' list followed by the 'no_change' list.
# The same assignment is repeated in a later cell.
words_of_interest= change+no_change

In [10]:
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors.

    The value is one minus SciPy's cosine distance, so the vectors are
    passed in directly rather than looked up in a model.

    Parameters
    ----------
    vec1, vec2 : array-like
        The two vectors to compare.

    Returns
    -------
    float
        ``1 - scipy.spatial.distance.cosine(vec1, vec2)``.
    """
    distance = spatial.distance.cosine(vec1, vec2)
    return 1 - distance


# Empty results table: one row per word with its T1-vs-T2 cosine similarity.
cosine_similarity_df = pd.DataFrame(columns=['Word', 'Cosine_similarity'])

In [11]:
def computeAvgVec(mKeys, w):
    """Average the embedding of word ``w`` over a set of loaded models.

    Parameters
    ----------
    mKeys : sequence of str
        Keys into the module-level ``dictOfModels`` selecting the models to
        average over. Must contain at least one key.
    w : str
        The word whose vectors are averaged.

    Returns
    -------
    numpy.ndarray or list
        The element-wise mean (float64) of the word's vector across all
        selected models, or an empty list ``[]`` when the word is missing
        from the vocabulary of any selected model.

    Raises
    ------
    ValueError
        If ``mKeys`` is empty (there is nothing to average).
    """
    if len(mKeys) == 0:
        raise ValueError('computeAvgVec needs at least one model key')

    # key_to_index is a dict, so membership is O(1); every model is checked so
    # a word absent from a later model is reported instead of raising KeyError.
    if any(w not in dictOfModels[k].wv.key_to_index for k in mKeys):
        print('Word '+str(w) + ' not found in models vocab')
        return []

    vectorsPerModel = [dictOfModels[k].wv[w] for k in mKeys]
    # Accumulate in float64, matching the original np.zeros-based running sum.
    avgEmbedding = np.mean(vectorsPerModel, axis=0, dtype=np.float64)
    return avgEmbedding

In [12]:
# Full list of words to score: the 'change' words followed by the 'no_change' words.
words_of_interest = [*change, *no_change]

# Split the loaded model names by the time-period tag embedded in the name.
t1Keys = [name for name in dictOfModels if 'df_t1' in name]
t2Keys = [name for name in dictOfModels if 'df_t2' in name]

In [13]:
# Compute each word's average vector over the T1 models and over the T2 models,
# then record the cosine similarity between the two averages.
# Rows are collected in a plain list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0 and was quadratic inside a loop.
# Building the frame from scratch also makes this cell safe to re-run.
similarity_records = []

# Iterate the source lists directly: a later cell rebinds `words_of_interest`
# to a DataFrame, which would make this loop walk column names on a re-run.
for word in change + no_change:
    print(word)

    # Each call averages the word's vector across the given group of models.
    avgVecT1 = computeAvgVec(t1Keys, word)
    avgVecT2 = computeAvgVec(t2Keys, word)

    # computeAvgVec signals a missing word with an empty list. A length check
    # works for both the list and the ndarray case; `array == []` does not.
    if len(avgVecT1) == 0 or len(avgVecT2) == 0:
        print(str(word) + ' Word not found')
        continue

    # Cosine similarity between the two period averages.
    cosSimilarity = cosine_similarity(avgVecT1, avgVecT2)
    similarity_records.append({'Word': word, 'Cosine_similarity': cosSimilarity})

cosine_similarity_df = pd.DataFrame(similarity_records, columns=['Word', 'Cosine_similarity'])
           


In [14]:
# Keep only the scored words that belong to one of the two labelled lists.
# .copy() gives an independent frame, so the label assignments below write to
# it directly instead of to a slice of cosine_similarity_df
# (which triggers SettingWithCopyWarning).
words_of_interest = cosine_similarity_df[cosine_similarity_df['Word'].isin(change+no_change)].copy()

# Label every row by the list its word came from.
words_of_interest.loc[words_of_interest['Word'].isin(change), 'semanticDifference'] = 'change' 
words_of_interest.loc[words_of_interest['Word'].isin(no_change), 'semanticDifference'] = 'no_change' 

# Display the labelled words ordered by similarity (the result is not stored).
words_of_interest.sort_values(by='Cosine_similarity')

# **LOGISTIC REGRESSION**

In [15]:
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score


In [19]:
# Single feature: each word's T1-vs-T2 cosine similarity, as an (n, 1) column.
X = words_of_interest['Cosine_similarity'].values.reshape(-1,1)
# Target: the 'change' / 'no_change' label.
y = words_of_interest['semanticDifference']

# Balance the two classes 1:1 by randomly under-sampling.
# random_state is fixed so the sampled subset, and every score derived from it,
# is reproducible across runs.
undersample = RandomUnderSampler(sampling_strategy=1.0, random_state=22)

# NOTE: the *_over names are kept for compatibility; the data is under-sampled.
X_over, y_over = undersample.fit_resample(X, y)
X=X_over
y=y_over

# Hold out 30% of the balanced data for the confusion matrix.
# (The earlier split on the unbalanced data was dead code: its results were
# overwritten here before being used, so it has been removed.)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2)

logreg = LogisticRegression()
# `kf` is the fitted estimator (fit returns the estimator itself); it is reused
# by the cross-validation cell.
kf = logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

print('Y value counts',y.value_counts(),'\n')
print('Y train value counts', y_train.value_counts())

In [20]:
# Metrics to collect; the label-dependent ones treat 'change' as the positive class.
scoring = dict(
    accuracy=make_scorer(accuracy_score),
    precision=make_scorer(precision_score, pos_label='change'),
    recall=make_scorer(recall_score, pos_label='change'),
    f1_score=make_scorer(f1_score, pos_label='change'),
)

# 10-fold cross-validation of the logistic-regression estimator on the balanced
# data; any failure inside a fold is raised rather than scored.
scores = cross_validate(kf, X, y, cv=10, scoring=scoring, error_score='raise')

# Report the mean of each metric across the folds.
for metric_label, metric_key in (
    ('Accuracy', 'test_accuracy'),
    ('Precision', 'test_precision'),
    ('Recall', 'test_recall'),
    ('F1 Score', 'test_f1_score'),
):
    print(metric_label, scores[metric_key].mean())


In [21]:
# Confusion matrix of the held-out predictions, drawn as an annotated heatmap.
cf_matrix = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(cf_matrix, annot=True, cmap='Blues')

ax.set_title('Confusion Matrix for vectors retrofitted on the basis of same party \n\n')
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ')

# Same class names on both axes.
# assumes the matrix rows/columns are ordered 'change', 'no_change' -- confirm
class_labels = ['change', 'no_change']
for axis in (ax.xaxis, ax.yaxis):
    axis.set_ticklabels(class_labels)

plt.show()

In [24]:
# Disabled t-test: the code below is wrapped in a string literal, so it is not
# executed; the cell only evaluates to that string.
# NOTE(review): if re-enabled, the masks index cosine_similarity_df['semanticDifference'],
# but the visible code only assigns that column on words_of_interest -- confirm
# which frame is intended before running.
'''# T Test 

summary, results = rp.ttest(group1= words_of_interest['Cosine_similarity'][cosine_similarity_df['semanticDifference'] == 'change'], group1_name= "change",
                            group2= words_of_interest['Cosine_similarity'][cosine_similarity_df['semanticDifference'] == 'no_change'], group2_name= "no_change")
print(summary)'''

In [26]:
# One-element columns for the summary table: the description of this run and
# the mean cross-validated value of each metric.
Basis = ['MP-Time - Split by same MP and same Time interval']

accuracy = [scores['test_accuracy'].mean()]
precision = [scores['test_precision'].mean()]
recall = [scores['test_recall'].mean()]
# NOTE(review): this rebinds `f1_score`, hiding the sklearn.metrics function of
# the same name imported at the top; re-running the scoring cell afterwards
# would pass this list to make_scorer. The name is kept because the summary
# cell reads it.
f1_score = [scores['test_f1_score'].mean()]

In [29]:
# Assemble the summary table: one row per evaluated model.
# (A comma was missing after the 'Model' entry, which made this cell a SyntaxError.)
scoresDict = {
    'Model': ['Baseline Model'],
    'Basis': Basis,
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1Score': f1_score,
}

scoresDf = pd.DataFrame(scoresDict)
scoresDf