In [25]:
import numpy as np
import pandas as pd 
import pickle
import gensim
import csv
#from csv import reader

import os
# Show every file available under the Kaggle input directory
inputFilePaths = (
    os.path.join(currentDir, entry)
    for currentDir, _, entries in os.walk('/kaggle/input')
    for entry in entries
)
for inputFilePath in inputFilePaths:
    print(inputFilePath)

# **Loading Synonyms and 24 aligned models**

In [26]:
# Read the pickled synonym pairs produced for retrofitting.
# NOTE: pickle.load executes arbitrary code from the file; only load trusted datasets.
synonymsPath = '/kaggle/input/retrofitting-synonyms/synonymsPartyTime.pkl'
with open(synonymsPath, mode='rb') as synonymsFile:
    synonyms = pickle.load(synonymsFile)
print('The length of synonym-pairs is', len(synonyms))

In [27]:
%%time
# Load 24 Word2Vec models of MPs in T1 & T2

dictOfModels = {}
folderPath = '/kaggle/input/aligned24mptimemodels/kaggle/working/24-aligned-models-by-mp-and-time'

# os.listdir returns entries in arbitrary order; sorting makes the model order
# (and therefore the row order of brexitDf) reproducible between runs.
for file in sorted(os.listdir(folderPath)):
    # The folder also holds the companion numpy files of the gensim models.
    # Only extension-less names are models, so skip anything with a dot in its name
    # (checked on the file name, not the full path, so a dotted folder cannot hide models).
    if '.' in file:
        continue
    dictOfModels[file] = gensim.models.Word2Vec.load(os.path.join(folderPath, file))


# Extract time, party, MP ID information from name and store in Dataframes.
# The part of the model name after 'df_' is read as '<time>_<mpId>_<party>...'.
brexitEmbeddings = {}

for k, model in dictOfModels.items():
    nameParts = k.split('df_')[1].split('_')
    time, mpId, party = nameParts[0], nameParts[1], nameParts[2]

    # One single-row frame per model, keyed by the model name
    brexitEmbeddings[k] = pd.DataFrame({
        'model': [model],
        'modelKey': [k],
        'time': [time],
        'mpId': [mpId],
        'party': [party],
    })


# DataFrame.append was removed in pandas 2.0 (and was quadratic in a loop),
# so the single-row frames are concatenated in one call instead.
if brexitEmbeddings:
    brexitDf = pd.concat(list(brexitEmbeddings.values()))
else:
    brexitDf = pd.DataFrame()

# Every single-row frame carries index 0; reset_index() moves those into an
# 'index' column (as the original code did) and gives the rows a unique RangeIndex.
brexitDf = brexitDf.reset_index()
brexitDf

In [28]:
# Hand-picked vocabulary whose vectors are exported for retrofitting.
# NOTE(review): judging by the names, `change` presumably lists words expected to have
# shifted in usage and `no_change` lists stable control words - confirm with the author.
change = ['exiting', 'seaborne', 'eurotunnel', 'withdrawal', 'departures', 'unicorn', 'remainers', 'exit', 'surrender',
          'departure', 'triggering', 'stockpiling', 'expulsion', 'blindfold', 'cliff', 'lighter', 'exits', 'triggered',
          'brexiteer', 'soft', 'plus', 'trigger', 'backroom', 'invoked', 'protesting', 'brexit', 'edge', 'canary', 
          'unicorns', 'withdrawing', 'invoking', 'withdrawn', 'manor', 'brexiteers', 'fanatics', 'postponement', 
          'currencies', 'currency', 'operability', 'operable', 'leavers', 'invoke', 'article', 'eurozone', 'clueless',
          'surrendered', 'cake', 'red', 'euroscepticism', 'prorogation', 'lining', 'gove', 'norway', 'deflationary',
          'moribund', 'eurosceptic', 'deutschmark', 'courting', 'deal', 'withdraw', 'dab', 'withdrawals', 'eurosceptics',
          'surrendering', 'aldous', 'lanarkshire', 'leaving', 'signifying', 'roofs', 'ceded', 'absentia', 'treachery',
          'dollar', 'canada', 'pragmatist', 'oven', 'ready', 'brexiters', 'control', 'capitulation', 'leave', 'referendum',
          'agreement', 'prorogue', 'smoothest', 'depreciate', 'managed', 'mutiny', 'overvalued', 'ideologues', 'foreign',
          'eec', 'war', 'prorogued', 'hannan', 'appease', 'pendolino', 'southbound', 'left', 'line', 'hard', 'bill']
 
# Second word list; see the review note on `change` for its presumed role.
no_change = ['prime', 'even', 'parliament', 'care', 'well', 'constituency', 'tax', 'children',
             'business', 'report', 'case', 'sure', 'like', 'see', 'state', 'order', 'back', 'new', 'hope', 'local',
             'secretary', 'public', 'right', 'much', 'say', 'first', 'minister', 'look', 'system', 'whether', 
             'members', 'million', 'good', 'today', 'services', 'clear', 'help', 'time', 'place', 'put', 'last', 'must', 'money', 'one', 
             'way', 'work', 'would', 'think', 'two', 'great', 'could', 'lady', 'us', 'come', 'however', 'may', 'going', 'go',
             'given', 'year', 'might', 'part', 'get', 'make', 'point', 'committee', 'years', 'also', 'know',
             'government', 'take', 'house', 'agree', 'member', 'number', 'across', 'made', 'give', 'gentleman', 'important', 'said',
             'people', 'issue', 'support', 'ensure']

# Combined vocabulary: all `change` words first, then all `no_change` words
words_of_interest= change+no_change

# **Combining MP-time DF with additional MP info from synonyms keys**

In [29]:
# Gather both members of every synonym pair so keys can be looked up on either side
firstSyns, secondSyns = [], []
for synPair in synonyms:
    firstSyns.append(synPair[0])
    secondSyns.append(synPair[1])

# All first members followed by all second members, then de-duplicated
synonymsList = firstSyns + secondSyns
uniqueSynonymsList = set(synonymsList)

In [30]:
# Look up each MP id in the synonym keys and attach the text that follows it
# (the MP name info) to the MP-time DF as a new column
mpNames = []
for rowMpId in brexitDf['mpId']:
    # Wrap the id in dashes so that e.g. MP id 16 cannot match inside MP id 216
    mpToSearch = '-' + rowMpId + '-'

    # First synonym key containing the wrapped id, or None when there is none
    matchingSyn = next((syn for syn in uniqueSynonymsList if mpToSearch in syn), None)

    if matchingSyn is None:
        # 'dummy' marks MPs that do not occur in any synonym key
        mpName = 'dummy'
    else:
        mpName = matchingSyn.split(mpToSearch)[1]

    # An empty remainder after the id is recorded as 'default'
    mpNames.append(mpName if mpName else 'default')

brexitDf['mpNamePartyInfo'] = mpNames
brexitDf

# **Creating Input Vectors' file for retrofitting**

In [32]:
%%time
# Creating the input vector file for Faruqui's code
# Each line starts with the vector's synonym key, followed by the word vector's
# numpy array representation. Refer to sample_vec.txt in Faruqui's code.

def create_input_vectors(vectorFileName='vectorsPartyTime.txt', words=None, modelsDf=None):
    """Write the input vector file for Faruqui's retrofitting code.

    For every word and every MP-time model that contains it, one line is written:
    the synonym key ``<word>-<time>-<mpId>-<mpNamePartyInfo>`` followed by the vector
    components, all separated by single spaces. Lines are ordered by word first,
    then by row order of the models DataFrame. The file has no trailing line break.

    Parameters
    ----------
    vectorFileName : str
        Path of the file to write (overwritten if it exists).
    words : iterable of str, optional
        Words to export. Defaults to the notebook-level ``words_of_interest``.
    modelsDf : pandas.DataFrame, optional
        Frame with the columns 'model', 'time', 'mpId' and 'mpNamePartyInfo'.
        Defaults to the notebook-level ``brexitDf``.

    Returns
    -------
    bool
        Always True once the file has been written.
    """
    if words is None:
        words = words_of_interest
    if modelsDf is None:
        modelsDf = brexitDf

    # Rows marked 'dummy' belong to MPs without any synonym key; they are never exported,
    # so filter them out once instead of re-checking for every word.
    exportableRows = [
        (model, time, mpId, nameInfo)
        for model, time, mpId, nameInfo in zip(
            modelsDf['model'], modelsDf['time'], modelsDf['mpId'], modelsDf['mpNamePartyInfo']
        )
        if nameInfo != 'dummy'
    ]

    lines = []
    for word in words:
        for model, time, mpId, nameInfo in exportableRows:
            keyedVectors = model.wv
            # Membership test on the keyed vectors is a hash lookup; scanning
            # index_to_key would be a linear search through the whole vocabulary.
            if word not in keyedVectors:
                continue

            synonymString = word + '-' + time + '-' + mpId + '-' + nameInfo

            # Format each component explicitly. str(ndarray) depends on numpy's print
            # options: it wraps long vectors over several lines, abbreviates arrays of
            # more than 1000 elements with '...', and pads by sign, which is why the key
            # and the first component used to run together for negative values.
            components = np.asarray(keyedVectors[word]).ravel()
            stringifiedVector = ' '.join('%.8f' % component for component in components)

            lines.append(synonymString + ' ' + stringifiedVector)

    # Joining guarantees exactly one line break between records and none at the end,
    # even when the last (word, model) combination is skipped.
    with open(vectorFileName, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines))
    return True


    
# Generate the retrofitting input file; it is written as 'vectorsPartyTime.txt'
# relative to the current working directory.
create_input_vectors()


# For Reference, the counts of parties' MPs in models - 
'''Conservative                        13
Scottish National Party              3
Labour                               3
Plaid Cymru                          1
Green Party                          1
Democratic Unionist Party            1
Social Democratic & Labour Party     1
Labour (Co-op)                       1
'''
# Hence, for party synonyms, only MPs from the Conservative, Labour and Scottish National parties have synonym records,
# as the others have no potential synonym-pairs (so the records of 5 MPs are not found in the synonym-pairs)
