In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle
import csv
from csv import reader
import os
# Print the full path of every file found under the Kaggle input directory,
# so the dataset paths used by later cells can be checked by eye.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

**Load the MP-time DataFrame and define the words of interest**

In [9]:
# Load the pickled MP-time DataFrame. The synonym helpers further down read
# its 'party', 'mnis_id', 'speakerName' and 'df_name' columns.
# NOTE(review): unpickling can execute arbitrary code - only load this file
# from a source you trust.
picklePath ='/kaggle/input/mptimedf/mpTimeDf.pkl'
mpTimeDf = pd.read_pickle(picklePath)
# Last expression of the cell: shows (rows, columns) as a quick sanity check.
mpTimeDf.shape

In [4]:
# Lower-case target words, first group.
# NOTE(review): the name suggests these are words expected to show a change
# (the list is dominated by Brexit-related vocabulary) - confirm against the
# analysis that produced the list.
change = ['exiting', 'seaborne', 'eurotunnel', 'withdrawal', 'departures', 'unicorn', 'remainers', 'exit', 'surrender',
          'departure', 'triggering', 'stockpiling', 'expulsion', 'blindfold', 'cliff', 'lighter', 'exits', 'triggered',
          'brexiteer', 'soft', 'plus', 'trigger', 'backroom', 'invoked', 'protesting', 'brexit', 'edge', 'canary', 
          'unicorns', 'withdrawing', 'invoking', 'withdrawn', 'manor', 'brexiteers', 'fanatics', 'postponement', 
          'currencies', 'currency', 'operability', 'operable', 'leavers', 'invoke', 'article', 'eurozone', 'clueless',
          'surrendered', 'cake', 'red', 'euroscepticism', 'prorogation', 'lining', 'gove', 'norway', 'deflationary',
          'moribund', 'eurosceptic', 'deutschmark', 'courting', 'deal', 'withdraw', 'dab', 'withdrawals', 'eurosceptics',
          'surrendering', 'aldous', 'lanarkshire', 'leaving', 'signifying', 'roofs', 'ceded', 'absentia', 'treachery',
          'dollar', 'canada', 'pragmatist', 'oven', 'ready', 'brexiters', 'control', 'capitulation', 'leave', 'referendum',
          'agreement', 'prorogue', 'smoothest', 'depreciate', 'managed', 'mutiny', 'overvalued', 'ideologues', 'foreign',
          'eec', 'war', 'prorogued', 'hannan', 'appease', 'pendolino', 'southbound', 'left', 'line', 'hard', 'bill']
 
# Lower-case target words, second group.
# NOTE(review): presumably the control set of common words not expected to
# change - confirm.
no_change = ['prime', 'even', 'parliament', 'care', 'well', 'constituency', 'tax', 'children',
             'business', 'report', 'case', 'sure', 'like', 'see', 'state', 'order', 'back', 'new', 'hope', 'local',
             'secretary', 'public', 'right', 'much', 'say', 'first', 'minister', 'look', 'system', 'whether', 
             'members', 'million', 'good', 'today', 'services', 'clear', 'help', 'time', 'place', 'put', 'last', 'must', 'money', 'one', 
             'way', 'work', 'would', 'think', 'two', 'great', 'could', 'lady', 'us', 'come', 'however', 'may', 'going', 'go',
             'given', 'year', 'might', 'part', 'get', 'make', 'point', 'committee', 'years', 'also', 'know',
             'government', 'take', 'house', 'agree', 'member', 'number', 'across', 'made', 'give', 'gentleman', 'important', 'said',
             'people', 'issue', 'support', 'ensure']

# Every word synonym pairs are generated for: the `change` words followed by
# the `no_change` words, in list order.
words_of_interest= change+no_change

**Create synonym pairs**

In [5]:
%%time
# Synonym Creation Functions
def create_synonyms_party(mpTimeDf, word, factor):
    """Build synonym pairs for ``word`` between records that share a party.

    Each row of ``mpTimeDf`` is turned into a key of the form
    ``<word>-<time>-<mnis_id>-<speaker-name>-<party>``, where spaces in the
    speaker and party names are replaced by hyphens and ``<time>`` is the
    second ``'_'``-separated field of the row's ``df_name``. Keys belonging
    to the same party are then combined into unordered pairs.

    Parameters
    ----------
    mpTimeDf : pandas.DataFrame
        Must provide the columns ``party``, ``mnis_id``, ``speakerName`` and
        ``df_name``.
    word : str
        The word the keys are generated for.
    factor : str
        ``'party-time'`` keeps only pairs whose two records carry the same
        time label; any other value keeps every same-party pair.

    Returns
    -------
    list of tuple of (str, str)
        Pairs ``(key_i, key_j)`` with ``i < j`` in row order, grouped by
        party in the order given by ``value_counts()`` (most records first).
        Parties with a single record contribute no pairs.

    Raises
    ------
    IndexError
        If a ``df_name`` value contains no ``'_'``.
    """
    # value_counts() lists each non-null party once, most frequent first.
    parties = list(mpTimeDf['party'].value_counts().index)

    # Hyphenated party label -> list of (time, key) records in row order.
    # The time is kept next to the key so the 'party-time' filter never has
    # to parse it back out of the key; re-splitting the key on '-' picks the
    # wrong field as soon as `word` or the time label contains a hyphen.
    records_by_party = {}
    for party in parties:
        party_df = mpTimeDf[mpTimeDf['party'] == party]

        # e.g. 'Scottish National Party' -> 'Scottish-National-Party'
        party_label = party.replace(' ', '-')

        records = []
        rows = zip(party_df['mnis_id'], party_df['speakerName'], party_df['df_name'])
        for mnis_id, speaker_name, df_name in rows:
            time = df_name.split('_')[1]
            # str() so numeric ids do not raise TypeError when concatenated.
            key = '-'.join([word, time, str(mnis_id),
                            speaker_name.replace(' ', '-'), party_label])
            records.append((time, key))

        records_by_party[party_label] = records

    same_time_only = (factor == 'party-time')

    # Every unordered pair (i < j) of records within each party.
    synonyms = []
    for records in records_by_party.values():
        for i, (time_a, key_a) in enumerate(records):
            for j in range(i + 1, len(records)):
                time_b, key_b = records[j]
                if same_time_only and time_a != time_b:
                    continue
                synonyms.append((key_a, key_b))

    return synonyms


# - - - - -MAIN CREATE SYNONYMS FUNCTION, CALLS OTHER SYNONYM-CREATING FUNCTIONS
def create_synonyms(mpTimeDf, word, factor):
    """Dispatch synonym-pair creation for ``word`` according to ``factor``.

    Parameters
    ----------
    mpTimeDf : pandas.DataFrame
        Passed through unchanged to the factor-specific builder.
    word : str
        The word the synonym keys are generated for.
    factor : str
        ``'party'`` or ``'party-time'``; both are handled by
        ``create_synonyms_party``.

    Returns
    -------
    list of tuple
        The pairs returned by ``create_synonyms_party``.

    Raises
    ------
    ValueError
        If ``factor`` is not a supported value. Without this check the
        function would reach its return statement with no result bound and
        fail with an opaque ``UnboundLocalError``.
    """
    supported_factors = ('party', 'party-time')
    if factor not in supported_factors:
        raise ValueError(
            "Unsupported synonym factor %r; expected one of %r"
            % (factor, supported_factors)
        )
    # Party and party-time pairing share one implementation.
    return create_synonyms_party(mpTimeDf, word, factor)


In [8]:
# main
def _main_(synonymFactor='party', synPicklePath='synonymsParty.pkl', synTextPath='synonymsParty.txt'):
    """Generate synonym pairs for every word of interest and write them to disk.

    Reads the notebook globals ``words_of_interest`` and ``mpTimeDf`` and
    calls ``create_synonyms`` once per word. Pairs are produced for all
    words of interest, whether or not a given model's vocabulary contains
    them, so the same synonym files can be reused across models.

    Parameters
    ----------
    synonymFactor : str, default 'party'
        Pairing factor forwarded to ``create_synonyms``
        (``'party'`` or ``'party-time'``).
    synPicklePath : str, default 'synonymsParty.pkl'
        Destination of the pickled flat list of pairs. Use a name that
        reflects the factor, e.g. one mentioning party-time.
    synTextPath : str, default 'synonymsParty.txt'
        Destination of the text file: one pair per line, each key followed
        by a single space.

    Returns
    -------
    None
    """
    # One flat list of (key, key) tuples across all words.
    #
    # Sanity figures noted by the original author for their dataset
    # (three parties with 13, 3 and 3 records -> 78 + 3 + 3 = 84 pairs per word):
    #   'party'      factor: 187 * 84 = 15708 pairs in total
    #   'party-time' factor: 187 * 42 = 7854 pairs in total
    # i.e. len(words_of_interest) * number of qualifying record pairs.
    allSynonyms = []
    for word in words_of_interest:
        allSynonyms.extend(create_synonyms(mpTimeDf, word, synonymFactor))

    with open(synPicklePath, 'wb') as f:
        pickle.dump(allSynonyms, f)

    # Explicit UTF-8 so speaker names with non-ASCII characters are written
    # the same way on every platform.
    with open(synTextPath, 'w', encoding='utf-8') as f:
        for pair in allSynonyms:
            # Each key is followed by one space (including the last one),
            # then a newline - the same layout the file has always had.
            f.write(' '.join(pair) + ' \n')

# Run the pipeline with the 'party' factor; writes synonymsParty.pkl and
# synonymsParty.txt to the current working directory.
_main_()