In [27]:
!pip install researchpy

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle
import gensim
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
import csv
from csv import reader
from scipy import spatial
import functools

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split

import imblearn
from imblearn.under_sampling import RandomUnderSampler

import researchpy as rp
import scipy.stats as stats
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found anywhere below the Kaggle input directory.
inputFilePaths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for inputFilePath in inputFilePaths:
    print(inputFilePath)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Importing & Processing Retrofitted vectors**

**Reading retrofitted out-vector file**

In [28]:
%%time
# Location of the retrofitted out-vector file and the factor it was retrofitted on.
# Switch to the commented-out pair below to process the party-time vectors instead.
retrofittedVectorPath = '/kaggle/input/retrofittedvectors/out_vec_file.txt'
retrofittingFactor = 'party'

#retrofittedVectorPath = '/kaggle/input/retrofittedpartytimevectors/retrofittedPartyTimeVectors.txt'
#retrofittingFactor = 'party-time'

# A record starts on a line whose first character is a letter; every following line
# that does not start with a letter is treated as a continuation of that record.
vecs = []
vec = ''
with open(retrofittedVectorPath) as f:
    for line in f:
        if line[0].isalpha():
            # A new record begins: store the one accumulated so far (if any).
            vec = vec.strip()
            if vec != '':
                vecs.append(vec)
            vec = line
        else:
            vec += line

# Flush the record that was still being accumulated when the file ended;
# without this the last vector in the file is silently dropped.
vec = vec.strip()
if vec != '':
    vecs.append(vec)

# NOTE(review): newlines are removed rather than replaced by a space. If a continuation
# line does not begin with whitespace, the numbers on either side of the break are fused
# into one token - confirm against the file format.
vecs = [vec.replace('\n', '') for vec in vecs]
print(str(len(vecs))+' Retrofitted vectors obtained')

In [29]:
# Words that are later given the label 'change' (the label initially expected for them).
change = ['exiting', 'seaborne', 'eurotunnel', 'withdrawal', 'departures', 'unicorn', 'remainers', 'exit', 'surrender',
          'departure', 'triggering', 'stockpiling', 'expulsion', 'blindfold', 'cliff', 'lighter', 'exits', 'triggered',
          'brexiteer', 'soft', 'plus', 'trigger', 'backroom', 'invoked', 'protesting', 'brexit', 'edge', 'canary', 
          'unicorns', 'withdrawing', 'invoking', 'withdrawn', 'manor', 'brexiteers', 'fanatics', 'postponement', 
          'currencies', 'currency', 'operability', 'operable', 'leavers', 'invoke', 'article', 'eurozone', 'clueless',
          'surrendered', 'cake', 'red', 'euroscepticism', 'prorogation', 'lining', 'gove', 'norway', 'deflationary',
          'moribund', 'eurosceptic', 'deutschmark', 'courting', 'deal', 'withdraw', 'dab', 'withdrawals', 'eurosceptics',
          'surrendering', 'aldous', 'lanarkshire', 'leaving', 'signifying', 'roofs', 'ceded', 'absentia', 'treachery',
          'dollar', 'canada', 'pragmatist', 'oven', 'ready', 'brexiters', 'control', 'capitulation', 'leave', 'referendum',
          'agreement', 'prorogue', 'smoothest', 'depreciate', 'managed', 'mutiny', 'overvalued', 'ideologues', 'foreign',
          'eec', 'war', 'prorogued', 'hannan', 'appease', 'pendolino', 'southbound', 'left', 'line', 'hard', 'bill']
 
# Words that are later given the label 'no_change'.
no_change = ['prime', 'even', 'parliament', 'care', 'well', 'constituency', 'tax', 'children',
             'business', 'report', 'case', 'sure', 'like', 'see', 'state', 'order', 'back', 'new', 'hope', 'local',
             'secretary', 'public', 'right', 'much', 'say', 'first', 'minister', 'look', 'system', 'whether', 
             'members', 'million', 'good', 'today', 'services', 'clear', 'help', 'time', 'place', 'put', 'last', 'must', 'money', 'one', 
             'way', 'work', 'would', 'think', 'two', 'great', 'could', 'lady', 'us', 'come', 'however', 'may', 'going', 'go',
             'given', 'year', 'might', 'part', 'get', 'make', 'point', 'committee', 'years', 'also', 'know',
             'government', 'take', 'house', 'agree', 'member', 'number', 'across', 'made', 'give', 'gentleman', 'important', 'said',
             'people', 'issue', 'support', 'ensure']

# Both lists combined. Note that words_of_interest is reassigned further down,
# where it is rebuilt from the keys of the vectors that were actually loaded.
words_of_interest= change+no_change

**Extracting vectors & mapping to synonym key, checking for dimensions**

In [30]:
# Parse every raw record ("<synonymKey> <v1> <v2> ...") into a key -> vector mapping.
# Only records with exactly `expectedDim` components are kept; the rest are reported and counted.
expectedDim = 300
dictKeyVector = {}
count = 0
for rawVec in vecs:
    # split() without an argument splits on any run of whitespace, so repeated
    # spaces or tabs never produce empty tokens that would need filtering.
    tokens = rawVec.split()
    if not tokens:
        continue
    # The first token is the synonym key, the remainder are the vector components.
    synKey, components = tokens[0], tokens[1:]

    if len(components) != expectedDim:
        print('Vector with dimension<300', synKey, len(components))
        count += 1
    else:
        dictKeyVector[synKey] = [float(v) for v in components]

print('Count of vectors with fewer dimensions that we will not consider',count)
dfRetrofitted = pd.DataFrame({'vectorKey':list(dictKeyVector.keys()), 'vectors':list(dictKeyVector.values())})
dfRetrofitted.head()


In [31]:
# (rows, columns) of the table of retained vectors: one row per key kept in dictKeyVector.
dfRetrofitted.shape

**For party-based retrofitting:**
2071 retrofitted vectors were expected and 2070 were created.
55 vectors with fewer than 300 dimensions were discarded,
leaving 2015 vectors.


**For party-time retrofitting:** from 1962 input vectors, 1634 retrofitted vectors were received (328 were lost during retrofitting; no reason has been found yet).
A further 35 vectors were discarded because their dimensions were lost (fewer than 300),
leaving 1599 vectors.

# **Calculating Cosine similarity**

In [32]:
# Restrict the words of interest to those that actually have a retrofitted vector.
# The averaging helper used here works on vectors rather than on models, so the
# presence check has to happen at this point as well.

vectorKeys = list(dfRetrofitted['vectorKey'])
# The word is taken to be the part of a vector key before the first '-'.
# sorted() gives the same order on every run: iterating a set of strings directly
# depends on per-process hash randomisation, which would change the row order of the
# results table and therefore the outcome of the seeded train/test split.
words_of_interest = sorted({vk.split('-')[0] for vk in vectorKeys})
print(words_of_interest, len(words_of_interest))

# From here on words_of_interest only contains words that are present in the vectors.

**Functions for cosine similarity computation and to compute the average vector amongst many vectors for a given word**

In [33]:
# Unlike the averaging helper in the other scripts, this one works directly on vectors, not on models.
def computeAvgVec(mKeys, dicto, w, layerSize=300):
    """Return the element-wise mean of the vectors stored in `dicto` under the keys `mKeys`.

    mKeys     -- keys of the vectors to average
    dicto     -- mapping from key to a vector with `layerSize` components
    w         -- the word the keys belong to (accepted but not used in the computation)
    layerSize -- number of components per vector

    With an empty `mKeys` a zero vector is divided by zero, so every component is NaN.
    """
    total = functools.reduce(
        lambda runningSum, key: np.add(runningSum, dicto[key]),
        mKeys,
        np.zeros(layerSize),
    )
    return np.divide(total, len(mKeys))

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors, i.e. one minus SciPy's cosine distance."""
    return 1 - spatial.distance.cosine(vec1, vec2)

# Results table, created empty here, with one column for the word and one for its cosine similarity.
cosine_similarity_df = pd.DataFrame(columns = ('Word', 'Cosine_similarity'))

**Compute cosine similarity between avg T1 and T2 vectors**

In [34]:
%%time

# Split the vector keys by period. A key counts as T1 / T2 when it contains the
# substring 't1' / 't2'.
# NOTE(review): this is a substring match on the whole key, so a word that itself
# contains 't1' or 't2' would be misfiled - confirm against the key format.
t1Keys = [t for t in dictKeyVector if 't1' in t]
t2Keys = [t for t in dictKeyVector if 't2' in t]

# Group the keys by word once, instead of rescanning every key for every word.
# The word is the part of the key before the first '-'.
t1KeysByWord, t2KeysByWord = {}, {}
for key in t1Keys:
    t1KeysByWord.setdefault(key.split('-')[0], []).append(key)
for key in t2Keys:
    t2KeysByWord.setdefault(key.split('-')[0], []).append(key)

# For every word, average its T1 vectors and its T2 vectors and compare the two averages.
foundWords = []
sims = []
for word in words_of_interest:
    wordT1Keys = t1KeysByWord.get(word, [])
    wordT2Keys = t2KeysByWord.get(word, [])

    # A word without vectors in one of the periods has no defined average there
    # (it would be an all-NaN vector), so it is reported and left out of the table.
    if not wordT1Keys or not wordT2Keys:
        print('Word not found', word)
        continue

    avgVecT1 = computeAvgVec(wordT1Keys, dictKeyVector, word)
    avgVecT2 = computeAvgVec(wordT2Keys, dictKeyVector, word)

    # Words and similarities are appended together so the two columns always line up.
    foundWords.append(word)
    sims.append(cosine_similarity(avgVecT1, avgVecT2))

# Rebuild the table from scratch so that re-running the cell never hits a length mismatch.
cosine_similarity_df = pd.DataFrame({'Word': foundWords, 'Cosine_similarity': sims})

# Assign the change / no_change labels as initially expected; words in neither list keep 'default'.
cosine_similarity_df['semanticDifference'] = 'default'
cosine_similarity_df.loc[cosine_similarity_df['Word'].isin(change), 'semanticDifference'] = 'change'
cosine_similarity_df.loc[cosine_similarity_df['Word'].isin(no_change), 'semanticDifference'] = 'no_change'
cosine_similarity_df.sort_values(by='Cosine_similarity').head(10)

# **LOGISTIC REGRESSION**

**Evaluate the performance of retrofitted vectors**

In [35]:
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score


In [36]:
# Single feature: the T1/T2 cosine similarity of each word. Target: its expected label.
X = cosine_similarity_df['Cosine_similarity'].values.reshape(-1,1)
y = cosine_similarity_df['semanticDifference']

# Balance the classes by random undersampling. The sampler is seeded so that the same
# rows are kept on every run; without a random_state the scores change between runs.
undersample = RandomUnderSampler(sampling_strategy=1.0, random_state=2)

# The *_over names are historical: the data is undersampled, not oversampled.
X_over, y_over = undersample.fit_resample(X, y)
X = X_over
y = y_over

# Hold out 30% of the balanced data for the confusion matrix.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2)

logreg = LogisticRegression()
# fit() returns the estimator itself, so kf and logreg are the same fitted model.
kf = logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

print('Y value counts',y.value_counts(),'\n')
print('Y train value counts', y_train.value_counts())

In [37]:
# Scorers for 10-fold cross-validation, treating 'change' as the positive class.
# The metric functions are taken from the `metrics` module instead of the bare imported
# names: a later cell rebinds the name `f1_score` to a list, which would break this cell
# when it is run again in the same kernel.
scoring = {'accuracy' : make_scorer(metrics.accuracy_score),
           'precision' : make_scorer(metrics.precision_score, pos_label='change'),
           'recall' : make_scorer(metrics.recall_score, pos_label='change'),
           'f1_score' : make_scorer(metrics.f1_score, pos_label='change')}

# error_score='raise' surfaces a failing fold immediately instead of recording NaN for it.
scores = cross_validate(kf, X, y, cv=10, scoring=scoring, error_score='raise')

# Mean of each metric over the folds.
print('Accuracy', scores['test_accuracy'].mean())
print('Precision', scores['test_precision'].mean())
print('Recall', scores['test_recall'].mean())
print('F1 Score', scores['test_f1_score'].mean())


In [38]:
# Fix the class order explicitly so the matrix rows/columns always match the tick labels below.
classLabels = ['change', 'no_change']
cf_matrix = confusion_matrix(y_test, y_pred, labels=classLabels)

# fmt='d' prints the cell counts as plain integers.
ax = sns.heatmap(cf_matrix, annot=True, fmt='d', cmap='Blues')

# The title follows the retrofitting factor of the current run ('party' gives the original title).
ax.set_title(f'Confusion Matrix for vectors retrofitted on the basis of same {retrofittingFactor} \n\n')
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ')

ax.xaxis.set_ticklabels(classLabels)
ax.yaxis.set_ticklabels(classLabels)

plt.show()

In [39]:
# T Test: compare the cosine similarities of the 'change' words with those of the 'no_change' words.

similarities = cosine_similarity_df['Cosine_similarity']
labels = cosine_similarity_df['semanticDifference']

summary, results = rp.ttest(group1=similarities[labels == 'change'], group1_name="change",
                            group2=similarities[labels == 'no_change'], group2_name="no_change")
print(summary)
# Also show the second table returned by the test; previously it was computed but never displayed.
print(results)

In [40]:
# One accumulator list per metric.
# NOTE: this rebinds the name `f1_score`, which until this point referred to the function
# imported from sklearn.metrics. Any cell that uses the bare `f1_score` function after this
# line needs the import to be executed again first.
accuracy, precision, recall, f1_score = [], [], [], []
# Label of the current run, wrapped in a list so it can serve as a DataFrame column.
retrofittingBasis = [retrofittingFactor]

In [41]:
#scoresDf = pd.DataFrame(columns= ['retrofittingBasis','Accuracy','Precision','Recall','F1Score'])
#scoresDf.append(['party', scores['test_accuracy'].mean(), scores['test_precision'].mean(),scores['test_recall'].mean(), scores['test_f1_score'].mean()],axis=1)
# Store the mean cross-validated value of each metric in its accumulator list.
for scoreList, scoreKey in ((accuracy, 'test_accuracy'),
                            (precision, 'test_precision'),
                            (recall, 'test_recall'),
                            (f1_score, 'test_f1_score')):
    scoreList.append(scores[scoreKey].mean())

In [124]:
# Disabled experiment kept as a bare string literal: none of the code inside it is executed.
'''scoresDict = {'retrofittingBasis': 'party','Accuracy':[scores['test_accuracy'].mean()],'Precision':scores['test_precision'].mean(),'Recall':scores['test_recall'].mean(),'F1Score':scores['test_f1_score'].mean()}
scoresDf = pd.DataFrame(scoresDict)

pd.concat([scoresDf,np.series(['party', scores['test_accuracy'].mean(), scores['test_precision'].mean(),scores['test_recall'].mean(), scores['test_f1_score'].mean()]) ])'''

In [42]:
# Scores of the current run as a one-row table.
scoresDict = {'retrofittingBasis': retrofittingBasis,'Accuracy':accuracy,'Precision':precision,'Recall':recall,'F1Score':f1_score}
currentScores = pd.DataFrame(scoresDict)

if retrofittingFactor == 'party' or 'scoresDf' not in globals():
    # The 'party' run starts the comparison table. The same happens when no table exists
    # yet (fresh kernel), where appending to a previous table would raise a NameError.
    scoresDf = currentScores
else:
    # Append to the table from the earlier run, first dropping any row already recorded
    # for this factor so that re-running the cell does not create duplicates.
    previousScores = scoresDf[scoresDf['retrofittingBasis'] != retrofittingFactor]
    scoresDf = pd.concat([previousScores, currentScores], ignore_index=True)
scoresDf

In [43]:
#Just in case
#scoresDf.to_pickle('./partyScore.pkl')