In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle
import gensim
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
import csv
from csv import reader
from scipy import spatial
import functools

from nltk.stem.snowball import SnowballStemmer
import spacy

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# ****Getting lemmas and metadata together****

In [2]:
%%time
# open file in read mode
# Read the tokenised speeches: every CSV row becomes one list of strings.
# The encoding is pinned to UTF-8 (as the file name indicates) instead of
# relying on the platform default, and newline='' is what the csv module asks
# for so that line breaks inside quoted fields are parsed correctly.
#with open('TokenizedSpeeches_utf-8.csv', 'r') as read_obj:
with open('/kaggle/input/utf8tokenizedspeeches/TokenizedSpeeches_utf-8.csv', 'r',
          encoding='utf-8', newline='') as read_obj:
    # pass the file object to reader() and materialise all rows at once
    lemmasList = list(reader(read_obj))

print(len(lemmasList), 'Rows read')

In [3]:
%%time
#create dataframe from the lemmas extracted from csv
# Wrap the rows read from the CSV in a one-column frame named 'Lemmas';
# row i of the frame is row i of the CSV.
dictOfLemmas = dict(Lemmas=lemmasList)
lemmasDf = pd.DataFrame(data=dictOfLemmas)
lemmasDf

In [4]:
# Load the pickled speeches object from the Kaggle input directory into `df`.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# this is only safe as long as the dataset file comes from a trusted source.
with open('/kaggle/input/hansard-speeches-lemmatized/hansard-speeches-post2010.pkl', 'rb') as f:
    df = pickle.load(f)

In [5]:
#since index was missing values and didn't match with the lemmasDf index
# After the reset, df has a fresh 0..N-1 index and is joined to lemmasDf on
# that index, so row i of df receives row i of lemmasDf.
# NOTE(review): this assumes the CSV rows are in the same order (and number)
# as the rows of df - TODO confirm, e.g. by comparing len(df) and len(lemmasDf).
# NOTE(review): re-running this cell once 'Lemmas' has been added presumably
# makes df.join fail on the overlapping column name - verify before re-running.
df = df.reset_index(drop=True)
df = df.join(lemmasDf)
df['Lemmas']

# Dividing and training corpus before and after the Brexit referendum

In [6]:
# Split data based on the Brexit referendum event before and after period
# Rows dated up to and including eventDate go to df_t1, later rows to df_t2.
# NOTE(review): assumes df['date'] is datetime-like (or an ISO-formatted
# string) so that comparing it with this string is chronological - TODO confirm.
eventDate = '2016-06-23 23:59:59'
df_t1 = df[df['date']<= eventDate]
df_t2 = df[df['date']> eventDate]

# Choosing intersecting vocabulary and Aligning models 

In [7]:
def intersection_align_gensim(m1, m2, words=None):
    """
    Restrict two gensim (4.x API) word2vec models to their shared vocabulary.

    Both models are modified in place so that afterwards
        -- m1.wv.index_to_key == m2.wv.index_to_key (same words, same order),
        -- row i of m1.wv.vectors and row i of m2.wv.vectors belong to the same word,
        -- m.wv.key_to_index maps every kept word to its new row,
        -- per-word attributes stored in m.wv.expandos (e.g. "count") are moved
           along with their word, so get_vecattr keeps returning the right value.
    The shared words are ordered by descending frequency, i.e. by the sum of the
    "count" attribute of the word in m1 and in m2.

    Parameters
    ----------
    m1, m2 : models exposing a gensim-4 style ``.wv`` (index_to_key,
        key_to_index, vectors, get_vecattr).
    words : optional list or set. If given (and non-empty), the shared
        vocabulary is additionally intersected with it.

    Returns
    -------
    tuple
        ``(m1, m2)`` - the same objects that were passed in.
    """
    # Get the vocab for each model
    vocab_m1 = set(m1.wv.index_to_key)
    vocab_m2 = set(m2.wv.index_to_key)

    # Find the common vocabulary
    common_vocab = vocab_m1 & vocab_m2
    if words:
        common_vocab &= set(words)

    # Nothing to do only if the vocabularies are identical AND already in the
    # same row order. Identical sets in a different order still need re-indexing,
    # otherwise row i of the two matrices would belong to different words.
    same_vocab = not vocab_m1 - common_vocab and not vocab_m2 - common_vocab
    if same_vocab and list(m1.wv.index_to_key) == list(m2.wv.index_to_key):
        return (m1, m2)

    # Otherwise sort by frequency (summed for both)
    common_vocab = sorted(
        common_vocab,
        key=lambda w: m1.wv.get_vecattr(w, "count") + m2.wv.get_vecattr(w, "count"),
        reverse=True,
    )

    # Then for each model...
    for m in (m1, m2):
        # Old row of every kept word, listed in the new order
        indices = [m.wv.key_to_index[w] for w in common_vocab]

        # Replace the vector matrix with the selected rows in the new order
        m.wv.vectors = m.wv.vectors[indices]

        # Move per-word attributes (counts etc.) along with their words;
        # leaving them in the old order would attach them to the wrong keys.
        expandos = getattr(m.wv, "expandos", None)
        if expandos:
            for attr in list(expandos):
                expandos[attr] = np.asarray(expandos[attr])[indices]

        # Any cached vector norms refer to the old row order; drop them so
        # they are recomputed on demand.
        if getattr(m.wv, "norms", None) is not None:
            m.wv.norms = None

        # Replace the key<->index mappings with ones for the common vocab
        m.wv.key_to_index = {key: new_index for new_index, key in enumerate(common_vocab)}
        m.wv.index_to_key = list(common_vocab)

    return (m1, m2)

In [8]:
# Function to align two embedding spaces with orthogonal Procrustes
def smart_procrustes_align_gensim(base_embed, other_embed, words=None):
    """
    Procrustes-align `other_embed` onto `base_embed` (gensim 4.x API) so that
    the same word can be compared across the two models.

    Original script: https://gist.github.com/quadrismegistus/09a93e219a6ffc4f216fb85235535faf
    Code ported from HistWords <https://github.com/williamleif/histwords> by William Hamilton <wleif@stanford.edu>.

    Steps:
      1. Intersect the vocabularies with `intersection_align_gensim`; this
         trims and re-indexes BOTH models in place.
      2. Compute the rotation from the unit-normalised vectors of both models
         (SVD of other^T . base).
      3. Replace `other_embed.wv.vectors` with its vectors multiplied by that
         rotation.

    Parameters
    ----------
    base_embed : model whose space is the alignment target; apart from the
        vocabulary intersection its vectors are left unchanged.
    other_embed : model whose vectors are rotated.
    words : optional list or set forwarded to `intersection_align_gensim` to
        further restrict the shared vocabulary.

    Returns
    -------
    other_embed, with its `wv.vectors` replaced by the aligned matrix.
    """
    # make sure vocabulary and indices are aligned
    in_base_embed, in_other_embed = intersection_align_gensim(base_embed, other_embed, words=words)

    # the intersection may have replaced the vector matrices, so recompute norms
    in_base_embed.wv.fill_norms(force=True)
    in_other_embed.wv.fill_norms(force=True)

    # get the (normalized) embedding matrices
    base_vecs = in_base_embed.wv.get_normed_vectors()
    other_vecs = in_other_embed.wv.get_normed_vectors()

    # just a matrix dot product with numpy
    m = other_vecs.T.dot(base_vecs)
    # SVD method from numpy
    u, _, v = np.linalg.svd(m)
    # the rotation is the product of the two orthogonal factors
    ortho = u.dot(v)
    # Replace original array with modified one, i.e. multiplying the embedding matrix by "ortho"
    other_embed.wv.vectors = (other_embed.wv.vectors).dot(ortho)

    return other_embed

# Splitting speeches by Parties and time

In [9]:
# Distinct party labels seen in each period, pooled into one set.
parties_t1 = list(df_t1['party'].unique())
parties_t2 = list(df_t2['party'].unique())
parties = set(parties_t1).union(parties_t2)
len(parties)

In [10]:
%%time
# New code for dividing corpus by time and MPs 

 #   -x  -x  -x  Integrates all lemmas to map per MP - one vocab per MP per time period (eventually two vocabs per MP across T1 & T2)  -x  -x  -x  -x  -x

# Build one single-row frame per (period, party) holding every lemma that
# party used in that period, keyed by '<period>_<party>' (e.g. 'df_t1_<party>').
dictSpeechesByParty = {}

# Look the period frames up directly instead of dispatching on their names.
framesByPeriod = {'df_t1': df_t1, 'df_t2': df_t2}

for p in parties:

    for dfTime, periodDf in framesByPeriod.items():

        partyRows = periodDf[periodDf['party'] == p]

        # Skip parties without speeches in this period. Doing this before the
        # key is built also means a missing (NaN) party label is skipped
        # instead of failing on the string concatenation below.
        if partyRows.empty:
            continue

        dfName = dfTime + '_' + p

        #Flatten the list so it's not a list of lists
        tempList = [item for sublist in partyRows['Lemmas'] for item in sublist]

        tempDf = pd.DataFrame([[p, tempList]], columns=['party', 'Lemmas'])
        tempDf['df_name'] = dfName
        dictSpeechesByParty[dfName] = tempDf


In [11]:
len(dictSpeechesByParty.keys())


In [12]:
dictSpeechesByParty.keys()

In [13]:
import spacy

In [13]:
%%time


# Add a 'sentence' column (all tokens joined by spaces) and a 'stemmed' column
# (Snowball stem of every token) to each frame in dictSpeechesByParty.
# Use English stemmer.
stemmer = SnowballStemmer("english")
# NOTE(review): the spaCy pipeline is loaded and its length limit raised, but
# the only calls that used it are commented out in the loop below - presumably
# it can be dropped while lemmatization stays disabled.
spacyMod = spacy.load('en_core_web_sm')
spacyMod.max_length = 1.39*10000000 


for pDf in dictSpeechesByParty.values():
    
    # Element 0 is the token list of the frame's first row.
    tokens =[token for token in pDf['Lemmas']][0]
    sent =' '.join(tokens)

    pDf['sentence'] =sent
    
    #Disabling RAM-hungry parts not needed for lemmatization
    #lemmatizedSent = spacyMod(sent, disable = ['ner', 'parser'])
    #lemmatizedSent = [word.lemma_ for word in lemmatizedSent]
    # progress output: number of tokens handled for this frame
    print(len(tokens))

    pDf['stemmed'] = pDf['Lemmas'].apply(lambda x: [stemmer.stem(y) for y in x])
    #pDf['lemmatized'] = [lemmatizedSent]

#ValueError: [E088] Text of length 9266826 exceeds maximum of 1000000. 
#The parser and NER models require roughly 1GB of temporary memory per 100,000 characters in the input. 
#This means long texts may cause memory allocation errors. If you're not using the parser or NER, it's probably safe to increase the `nlp.max_length` limit. 
#The limit is in number of characters, so you can check whether your inputs are too long by checking `len(text)`.
'''Done till here when both stemming and lemmatization done- 65156
116180
5340
156935
80876
10129
749124'''
# All stemming done and added, 9.8 GBs used

In [None]:
'''%%time


# Use English stemmer.
stemmer = SnowballStemmer("english")
spacyMod = spacy.load('en_core_web_sm')


for pDf in dictSpeechesByParty.values()
    sent =' '.join([token for token in pDf['Lemmas']][0])
    dictSpeechesByParty['df_t1_Respect']['sentence'] =sent

    lemmatizedSent = spacyMod(sent)
    lemmatizedSent = [word.lemma_ for word in lemmatizedSent]
    print(len(lemmatizedSent))

    dictSpeechesByParty['df_t1_Respect']['stemmed'] = dictSpeechesByParty['df_t1_Respect']['Lemmas'].apply(lambda x: [stemmer.stem(y) for y in x])
    dictSpeechesByParty['df_t1_Respect']['lemmatized'] = [lemmatizedSent]
'''

In [48]:
'''for i in dictSpeechesByParty['df_t1_Respect']['stemmed']:
    print (i)
    break'''

In [14]:
dictSpeechesByParty['df_t1_Respect']

In [20]:
%%time 
# Stack the per-party/period frames into one frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and calling
# it in a loop re-copies the accumulated frame on every iteration; a single
# pd.concat gives the same rows (each keeps its original index label).
# The former column template listed 'stems', which no frame provides (the
# frames carry 'stemmed'), so it only produced an empty extra column.
partyFrames = list(dictSpeechesByParty.values())
if partyFrames:
    partyTimeDf = pd.concat(partyFrames)
else:
    # Nothing to stack: keep an empty frame with the expected columns.
    partyTimeDf = pd.DataFrame(columns=['party', 'Lemmas', 'df_name', 'sentence', 'stemmed'])

# Number of lemmas per party/period, plus a quick max/min/mean summary.
partyTimeDf['LengthLemmas'] = partyTimeDf.Lemmas.map(len)
partyTimeDf.agg(Max=('LengthLemmas', 'max'), Min=('LengthLemmas', 'min'), Mean=('LengthLemmas', 'mean'))

In [21]:
partyTimeDf

In [24]:
change = ['exiting', 'seaborne', 'eurotunnel', 'withdrawal', 'departures', 'unicorn', 'remainers', 'exit', 'surrender',
          'departure', 'triggering', 'stockpiling', 'expulsion', 'blindfold', 'cliff', 'lighter', 'exits', 'triggered',
          'brexiteer', 'soft', 'plus', 'trigger', 'backroom', 'invoked', 'protesting', 'brexit', 'edge', 'canary', 
          'unicorns', 'withdrawing', 'invoking', 'withdrawn', 'manor', 'brexiteers', 'fanatics', 'postponement', 
          'currencies', 'currency', 'operability', 'operable', 'leavers', 'invoke', 'article', 'eurozone', 'clueless',
          'surrendered', 'cake', 'red', 'euroscepticism', 'prorogation', 'lining', 'gove', 'norway', 'deflationary',
          'moribund', 'eurosceptic', 'deutschmark', 'courting', 'deal', 'withdraw', 'dab', 'withdrawals', 'eurosceptics',
          'surrendering', 'aldous', 'lanarkshire', 'leaving', 'signifying', 'roofs', 'ceded', 'absentia', 'treachery',
          'dollar', 'canada', 'pragmatist', 'oven', 'ready', 'brexiters', 'control', 'capitulation', 'leave', 'referendum',
          'agreement', 'prorogue', 'smoothest', 'depreciate', 'managed', 'mutiny', 'overvalued', 'ideologues', 'foreign',
          'eec', 'war', 'prorogued', 'hannan', 'appease', 'pendolino', 'southbound', 'left', 'line', 'hard', 'bill']
 
    
#from nltk.stem import WordNetLemmatizer
 #assumes as nouns
 #pos tagging, mapping needed 

# Seed words for the "no change" group.
# NOTE(review): 'bill' is listed here and also in the `change` list above, so
# its stem ends up in both changeStem and noChangeStem - confirm which group it
# is meant to belong to.
no_change = ['prime', 'even', 'parliament', 'mr', 'eu', 'bill', 'future', 'care', 'well', 'constituency', 'tax', 'children', 'uk',
             'business', 'european', 'report', 'case', 'sure', 'like', 'see', 'state', 'health', 'order', 'back', 'new', 'hope', 'local',
             'national', 'country', 'secretary', 'public', 'right', 'much', 'say', 'first', 'minister', 'labour', 'look', 'system', 'whether', 
             'members', 'million', 'good', 'today', 'services', 'clear', 'help', 'time', 'place', 'put', 'last', 'must', 'money', 'one', 
             'way', 'friend', 'work', 'would', 'think', 'two', 'great', 'could', 'lady', 'us', 'come', 'however', 'may', 'going', 'go',
             'given', 'need', 'year', 'debate', 'might', 'part', 'get', 'want', 'make', 'point', 'committee', 'years', 'also', 'know',
             'government', 'take', 'house', 'agree', 'member', 'number', 'across', 'made', 'give', 'gentleman', 'important', 'said',
             'people', 'issue', 'support', 'ensure']

# Stem both seed lists with the Snowball English stemmer; the results keep the
# order (and any duplicates) of the input lists.
stemmer = SnowballStemmer("english")
changeStem= [stemmer.stem(word) for word in change]
noChangeStem= [stemmer.stem(word) for word in no_change]


In [22]:
partyTimeDf = partyTimeDf.reset_index()

In [23]:
partyTimeDf.shape

# **STEMMING & LEMMATIZATION**

In [17]:
'''%%time

# Use English stemmer.
stemmer = SnowballStemmer("english")
#lemmaz = [token.lemma_.lower() for token in doc if token.is_alpha and token.text.lower() not in stop_words]
partyTimeDf['stemmed'] = partyTimeDf['Lemmas'].apply(lambda x: [stemmer.stem(y) for y in x]) # Stem every word.
for '''

In [16]:
%%time
# mpTimeDf['overlap2_delist'] = str(mpTimeDf['overlapping_words2'])
# *** **** **** Forceful filtering to keep brexit and leave words **** **** **** 
# Comma-joined text version of every lemma list (one string per row).
partyTimeDf['lemmas_delist'] = [
    ','.join(str(lemma) for lemma in lemmaList)
    for lemmaList in partyTimeDf['Lemmas']
]
# Optional whole-word filters on that string, currently disabled:
#partyTimeDf = partyTimeDf[partyTimeDf['lemmas_delist'].str.contains(r'\bbrexit\b')]
#partyTimeDf = partyTimeDf[partyTimeDf['lemmas_delist'].str.contains(r'\bleave\b')]
partyTimeDf.shape

In [26]:
%%time
# For every row, collect which stemmed "change" and "no change" seed words
# occur among its stems, then store the matches and their counts as columns.
overlappingWords, overlappingWordsN = [], []

for ind in partyTimeDf.index:
    stemSet = set(partyTimeDf.at[ind, 'stemmed'])
    overlappingWords.append(list(stemSet.intersection(changeStem)))
    overlappingWordsN.append(list(stemSet.intersection(noChangeStem)))

partyTimeDf['overlapping_words'] = overlappingWords
partyTimeDf['overlapCount'] = partyTimeDf['overlapping_words'].map(len)

partyTimeDf['overlapping_words_n'] = overlappingWordsN
partyTimeDf['overlapCountN'] = partyTimeDf['overlapping_words_n'].map(len)


In [27]:
partyTimeDf['overlapCountN'] = partyTimeDf.overlapping_words_n.map(len)


In [32]:
partyTimeDf[partyTimeDf.overlapCount>45]

In [35]:
partyTimeDf['overlap2_delist'] = [','.join(map(str, l)) for l in partyTimeDf['overlapping_words']]


In [36]:
# Keep only the rows whose joined overlap string contains every required term.
# str.contains does substring matching on the comma-joined stems, so a term
# also matches inside a longer stem.
partyTimeDfOverridden = partyTimeDf
for requiredTerm in ('leave', 'brexit', 'remain'):
    hasTerm = partyTimeDfOverridden['overlap2_delist'].str.contains(requiredTerm)
    partyTimeDfOverridden = partyTimeDfOverridden[hasTerm]
'''partyTimeDfOverridden = partyTimeDfOverridden[partyTimeDfOverridden['overlap2_delist'].str.contains('cliff')]
partyTimeDfOverridden = partyTimeDfOverridden[partyTimeDfOverridden['overlap2_delist'].str.contains('exit')]
partyTimeDfOverridden = partyTimeDfOverridden[partyTimeDfOverridden['overlap2_delist'].str.contains('trigger')]
partyTimeDfOverridden = partyTimeDfOverridden[partyTimeDfOverridden['overlap2_delist'].str.contains('triggered')]
partyTimeDfOverridden = partyTimeDfOverridden[partyTimeDfOverridden['overlap2_delist'].str.contains('triggering')]
partyTimeDfOverridden = partyTimeDfOverridden[partyTimeDfOverridden['overlap2_delist'].str.contains('withdraw')]
partyTimeDfOverridden = partyTimeDfOverridden[partyTimeDfOverridden['overlap2_delist'].str.contains('remainers')]
partyTimeDfOverridden = partyTimeDfOverridden[partyTimeDfOverridden['overlap2_delist'].str.contains('bill')]'''

'''partyTimeDfOverridden = partyTimeDfOverridden[partyTimeDfOverridden['overlap2_delist'].str.contains('control')]
partyTimeDfOverridden = partyTimeDfOverridden[partyTimeDfOverridden['overlap2_delist'].str.contains('hard')]
partyTimeDfOverridden = partyTimeDfOverridden[partyTimeDfOverridden['overlap2_delist'].str.contains('soft')]
partyTimeDfOverridden = partyTimeDfOverridden[partyTimeDfOverridden['overlap2_delist'].str.contains('surrender')]
partyTimeDfOverridden = partyTimeDfOverridden[partyTimeDfOverridden['overlap2_delist'].str.contains('prorogation')]
partyTimeDfOverridden = partyTimeDfOverridden[partyTimeDfOverridden['overlap2_delist'].str.contains('departure')]
partyTimeDfOverridden = partyTimeDfOverridden[partyTimeDfOverridden['overlap2_delist'].str.contains('backroom')]
partyTimeDfOverridden = partyTimeDfOverridden[partyTimeDfOverridden['overlap2_delist'].str.contains('invoked')]
partyTimeDfOverridden = partyTimeDfOverridden[partyTimeDfOverridden['overlap2_delist'].str.contains('edge')]'''
#partyTimeDfOverridden = partyTimeDfOverridden[partyTimeDfOverridden['overlap2_delist'].str.contains('unicorn')]


'''
partyTimeDfOverridden = partyTimeDfOverridden[partyTimeDfOverridden['overlap2_delist'].str.contains('gentleman')]
partyTimeDfOverridden = partyTimeDfOverridden[partyTimeDfOverridden['overlap2_delist'].str.contains('parliament')]
partyTimeDfOverridden = partyTimeDfOverridden[partyTimeDfOverridden['overlap2_delist'].str.contains('us')]
partyTimeDfOverridden = partyTimeDfOverridden[partyTimeDfOverridden['overlap2_delist'].str.contains('come')]
partyTimeDfOverridden = partyTimeDfOverridden[partyTimeDfOverridden['overlap2_delist'].str.contains('however')]
partyTimeDfOverridden = partyTimeDfOverridden[partyTimeDfOverridden['overlap2_delist'].str.contains('may')]
partyTimeDfOverridden = partyTimeDfOverridden[partyTimeDfOverridden['overlap2_delist'].str.contains('soft')]
partyTimeDfOverridden = partyTimeDfOverridden[partyTimeDfOverridden['overlap2_delist'].str.contains('soft')]
partyTimeDfOverridden = partyTimeDfOverridden[partyTimeDfOverridden['overlap2_delist'].str.contains('soft')]
partyTimeDfOverridden = partyTimeDfOverridden[partyTimeDfOverridden['overlap2_delist'].str.contains('soft')]'''
partyTimeDfOverridden.shape

In [37]:
partyTimeDfOverridden

In [22]:
#partyTimeDfOverridden['stemmed']=['s' for i in range(0,9)]
# Placeholder value 's' in a new 'lemmatized' column for every row.
# A scalar is broadcast to however many rows survived the filtering, instead
# of a list hard-coded to 9 entries that fails for any other row count.
# The copy detaches the filtered frame from the frame it was sliced from, so
# the column is added to an independent object.
partyTimeDfOverridden = partyTimeDfOverridden.copy()
partyTimeDfOverridden['lemmatized'] = 's'

In [23]:
partyTimeDfOverridden=partyTimeDfOverridden.reset_index()
partyTimeDfOverridden.index

In [26]:
#Commenting when using lemmatization instead of stemming
# Disabled cell, kept for reference. It used to be wrapped in a triple-quoted
# string, but the inner ''' around the trailing else-branch closed that string
# early and left a dangling `else:` behind (a SyntaxError); '#' comments
# cannot be terminated by accident.
#
# %%time
# stemmer = SnowballStemmer("english")
# colIndex = partyTimeDfOverridden.columns.get_loc('stemmed')
# for index, row in partyTimeDfOverridden.iterrows():
#     print(index,row['party'])
#     if(len(partyTimeDfOverridden.iat[index,colIndex])>1):
#         print(partyTimeDfOverridden.iat[index,colIndex][0])
#         print('Already set')
#         continue
#     else:
#        #=='s' or partyTimeDfOverridden.iat[index,colIndex].isnull()):
#         print('Stemming')
#         stemmed=  [stemmer.stem(y) for y in partyTimeDfOverridden.at[index,'Lemmas']] # Stem every word.
#         print(len(stemmed))
#         #partyTimeDfOverridden.loc[index, 'stemmed'] = stemmed
#         partyTimeDfOverridden.iat[index,colIndex] = stemmed
#     #else:
#     #    print('Already set')
#     #    continue

In [26]:
from nltk.stem import WordNetLemmatizer
 
lemmatizer = WordNetLemmatizer()
#lemma_change = [lemmatizer.lemmatize(x) for x in change]
#lemma_no_change = [lemmatizer.lemmatize(x) for x in no_change]
#print(len(change),len(set(lemma_change)))
#print(len(no_change),len(set(lemma_no_change)))

In [28]:
%%time

# Fill the 'lemmatized' column row by row with the WordNet lemma of every
# token in 'Lemmas'. Rows whose cell already holds more than one element are
# treated as done and skipped, so the cell can be re-run after an interruption.
# Rows are addressed purely by position: .iat is positional, so feeding it the
# index labels from iterrows() was only correct for a fresh 0..N-1 index.
colIndex = partyTimeDfOverridden.columns.get_loc('lemmatized')
lemmasColIndex = partyTimeDfOverridden.columns.get_loc('Lemmas')
partyColIndex = partyTimeDfOverridden.columns.get_loc('party')

for index in range(len(partyTimeDfOverridden)):
    print(index, partyTimeDfOverridden.iat[index, partyColIndex])
    currentValue = partyTimeDfOverridden.iat[index, colIndex]
    if len(currentValue) > 1:
        # more than one element: not the one-character placeholder any more
        print(currentValue[0])
        print('Already set')
        continue

    print('Lemmatizing')
    lemmed = [lemmatizer.lemmatize(y) for y in partyTimeDfOverridden.iat[index, lemmasColIndex]]
    print(len(lemmed))
    # store the whole list in this single cell
    partyTimeDfOverridden.iat[index, colIndex] = lemmed

In [38]:
len(dictSpeechesByParty.keys())

In [39]:
# Names of the party/period frames that survived the filtering.
listDfsKeep = partyTimeDfOverridden['df_name'].to_list()

# Remove (in place) every dictionary entry whose key is not among them.
# The keys to drop are collected first so the dict is not resized while
# it is being iterated.
staleKeys = [name for name in dictSpeechesByParty if name not in listDfsKeep]
for name in staleKeys:
    dictSpeechesByParty.pop(name)

len(dictSpeechesByParty.values())

In [31]:
partyTimeDfOverridden['lemmatized']

In [173]:
#list(partyTimeDfOverridden[partyTimeDfOverridden['df_name']=='df_t2_Labour']['stemmed'])[0]

In [174]:
#Getting rid of this sardardi for the time being as order in dict and df seems to match

'''for k in dictSpeechesByParty.keys():
    #print(k, partyTimeDfOverridden[partyTimeDfOverridden['df_name']==k][['df_name']])  
    #print(type(partyTimeDfOverridden[partyTimeDfOverridden['df_name']==k]['stemmed']))
    #This creates list with faltu 0 
    #dictSpeechesByParty[k]['stemmed'] = [partyTimeDfOverridden[partyTimeDfOverridden['df_name']==k]['stemmed']]
    print('Setting for',k)
    dictSpeechesByParty[k].at[0, 'stemmed'] = [12,2,3]
    #list(partyTimeDfOverridden[partyTimeDfOverridden['df_name']==k]['stemmed'])[0]

    #print(len(partyTimeDfOverridden[partyTimeDfOverridden['df_name']==k]['stemmed']))
    print('---------------------------------------------------------------------------------')

#Check
dictSpeechesByParty['df_t2_Labour (Co-op)']'''

In [32]:
dictSpeechesByParty['df_t2_Labour (Co-op)'].dtypes
#dictSpeechesByParty['df_t2_Labour (Co-op)'].describe

In [33]:
'''%%time
#for lemList in list(partyTimeDfOverridden['stemmed']):
    #print(lemList)
lemList = list(partyTimeDfOverridden['stemmed'])
#print(lemList[0][0])

for ind,k in enumerate(dictSpeechesByParty.keys()):
    print(ind,k)
    dictSpeechesByParty[k].at[0,'stemmed']='s'
    print(dictSpeechesByParty['df_t2_Labour (Co-op)'].dtypes)
    stInd = dictSpeechesByParty[k].columns.get_loc('stemmed')
    dictSpeechesByParty[k].iat[0,stInd]=lemList[ind]'''

In [34]:
%%time
#for lemList in list(partyTimeDfOverridden['stemmed']):
    #print(lemList)
# Copy each row's lemmatized token list from partyTimeDfOverridden into the
# frame of dictSpeechesByParty that has the same name.
lemList = list(partyTimeDfOverridden['lemmatized'])
#print(lemList[0][0])

# Pair the lists with the frames by 'df_name' rather than by position, so the
# result does not depend on the dictionary order matching the row order.
lemmasByDfName = dict(zip(partyTimeDfOverridden['df_name'], lemList))

for ind,k in enumerate(dictSpeechesByParty.keys()):
    print(ind,k)
    partyFrame = dictSpeechesByParty[k]
    # Write a scalar placeholder first (presumably so that the column exists
    # before a whole list is stored in its single cell), then put the list in.
    partyFrame.at[0,'lemmatized']='s'
    stInd = partyFrame.columns.get_loc('lemmatized')
    partyFrame.iat[0,stInd]=lemmasByDfName[k]

In [35]:
dictSpeechesByParty['df_t2_Scottish National Party']

In [106]:
'''for k in dictSpeechesByParty.keys():
    #print(k, partyTimeDfOverridden[partyTimeDfOverridden['df_name']==k][['df_name']])  
    print([partyTimeDfOverridden[partyTimeDfOverridden['df_name']==k]['stemmed']])
    dictSpeechesByParty[k]['stemmed'] = [partyTimeDfOverridden[partyTimeDfOverridden['df_name']==k]['stemmed']]
    print(len(partyTimeDfOverridden[partyTimeDfOverridden['df_name']==k]['stemmed']))
    print('---------------------------------------------------------------------------------')'''

In [None]:
'''%%time
stemmer = SnowballStemmer("english")

# Use English stemmer.
stemmer = SnowballStemmer("english")
#lemmaz = [token.lemma_.lower() for token in doc if token.is_alpha and token.text.lower() not in stop_words]
partyTimeDfOverridden['stemmed'] = partyTimeDf['Lemmas'].apply(lambda x: [stemmer.stem(y) for y in x]) # Stem every word.
'''

In [117]:
partyTimeDfOverridden = partyTimeDfOverridden[partyTimeDfOverridden['overlap2_delist'].str.contains('unicorn')]
partyTimeDfOverridden.shape
partyTimeDfOverridden.shape

In [85]:
partyTimeDf2 = partyTimeDf[partyTimeDf['overlap2_delist'].str.contains('unicorn')]
partyTimeDf2.shape

In [None]:
'''# We should only keep speeches by those MPs which have occurences in both T1 & T2 since comparison in their vocab is to be made here

#list(vocabDf['mnis_id'].value_counts().value_counts())
#340 MPs have speeches in both T1 and T, 241 have speech(es) in one of the time periods

mpCountDict = vocabDf['mnis_id'].value_counts().to_dict()

vocabDf['mpCountInTimes'] = vocabDf['mnis_id'].map(mpCountDict)

#Drop the MPs with selected vocab in only one time interval

vocabDf=vocabDf[vocabDf['mpCountInTimes']==2]

# For manual verification'''
'''vocabDf[vocabDf['mnis_id']=='3942']
3992, 3942 - only one instance of each
4263 - 2 instances'''

In [None]:
'''# Now backtrack and update dictionary to only keep DFs of these 921 rows

# Creating dict key names to keep from our filtered DF mnis IDs

mnis_id_list = list(vocabDf['mnis_id'].value_counts().to_dict().keys())

list1 = ['df_t1_'+item for item in mnis_id_list]
list2 = ['df_t2_'+item for item in mnis_id_list]
mnis_id_list = list1 + list2'''

In [90]:
dictSpeechesByParty.keys()

In [36]:
dictSpeechesByParty['df_t2_Labour (Co-op)']

In [40]:
%%time

#  - x - x - x - CREATING & SAVING WORD2VEC MODELS FOR THE 680 MODELS - x - x - x 

# Train one skip-gram Word2Vec model per party/period frame and keep the
# models in dictOfModels under the frame's name.
dictOfModels = {}
#shutil.rmtree('./models-by-mp-and-time')
#os.makedirs('./models-by-party-and-time')
#models_folder = './models-by-party-and-time'
count = 1

# gensim's Word2Vec trains on at most 10,000 tokens of any single text and
# silently ignores the remainder. Each frame stores all of a party's tokens as
# ONE list, so that list is cut into chunks of this size before training;
# otherwise only its first 10,000 tokens would contribute to the vectors.
maxTokensPerText = 10000

for dframe in dictSpeechesByParty:

    # Doing in batches since notebook RAM crashes
    print(dframe)
    sentences = [
        tokens[start:start + maxTokensPerText]
        for tokens in dictSpeechesByParty[dframe]['stemmed']
        for start in range(0, len(tokens), maxTokensPerText)
    ]
    model = gensim.models.Word2Vec(sentences, min_count=1, vector_size=300, window = 5, sg = 1)

    # Also saving model in a dict and exporting

    modelName ='model_'+ dframe
    print('model number', count, modelName)

    dictOfModels[dframe] = model
    #model.save(os.path.join(models_folder, modelName))
    count = count +1
    


In [41]:
%%time


# Align all models into one space: model k+1 is rotated onto model k, in
# dictionary order.
modelsToAlign = list(dictOfModels.values())

if len(modelsToAlign) > 1:
    # Vocabulary shared by every model. Restricting each pairwise alignment to
    # it up front lets a single pass suffice; without it the chain had to be
    # repeated len-1 times just to propagate the intersection to all models.
    sharedVocab = set.intersection(*(set(m.wv.index_to_key) for m in modelsToAlign))
    if not sharedVocab:
        raise ValueError('The models have no vocabulary in common, nothing to align')

    functools.reduce(
        lambda base, other: smart_procrustes_align_gensim(base, other, words=sharedVocab),
        modelsToAlign,
    )

In [41]:
dictOfModels

In [42]:
# Check if aligned: neighbouring models must hold exactly the same words.
# Comparing only the vocabulary sizes would also accept two different
# vocabularies that happen to have the same number of entries.
# `ind` is initialised so the summary line below (and later cells that reuse
# `ind`) still work when there are fewer than two models.
ind = 0
for ind in range(0,len(listDfsKeep)-1):
    currentKeys = set(dictOfModels[listDfsKeep[ind]].wv.index_to_key)
    nextKeys = set(dictOfModels[listDfsKeep[ind+1]].wv.index_to_key)
    if(currentKeys != nextKeys):
        print('Vocabs not similar')
        
print('Vocab Size', len(dictOfModels[listDfsKeep[ind]].wv.index_to_key))

#Yes Aligned, 5384 Vocab size
# Vocab Size 3686 after stemming
# Vocab Size after lemmatizing = 4844

#Vocab size 3606 after stemming and only considering common words - brexit, leave, remain

In [45]:
len(set(dictOfModels[listDfsKeep[ind]].wv.index_to_key).intersection(changeStem))
   # overlap = list(set(partyVocabInTime).intersection(changeStem))


In [46]:
len(set(dictOfModels[listDfsKeep[ind]].wv.index_to_key).intersection(noChangeStem))


In [48]:
'''# To delete if need be
import shutil
shutil.rmtree('./models-by-party-and-time')

os.remove('./file.zip')'''

In [47]:
%%time
# Save every model into the output folder under its dictionary key.
models_folder = './models-by-party-and-time'
# exist_ok lets the cell be re-run without failing on the existing folder
os.makedirs(models_folder, exist_ok=True)

for k in dictOfModels.keys(): 
    dictOfModels[k].save(os.path.join(models_folder, k))


In [48]:
'''%%time
# Commenting since we're going to load and start with alignment of models with a fresh notebook
functools.reduce(smart_procrustes_align_gensim, list(dictOfModels.values()))'''

In [50]:
%%time
!zip -r file.zip /kaggle/working/models-by-party-and-time

# **OPTIONAL SIMILARITY CHECK, CAN SKIP**

In [45]:
# For each model, print it together with the 10 entries most similar to 'brexit'.
dictOfModels
for i, partyModel in dictOfModels.items():
    nearest = partyModel.wv.similar_by_word('brexit', 10)
    print(partyModel, nearest)
    print('- x - x - x - x - x - x - x - x - x')