In [75]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle
import gensim
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
import csv
from csv import reader
from scipy import spatial
import functools
from collections import Counter
import nltk
from nltk.data import load
tagdict = load('help/tagsets/upenn_tagset.pickle')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


# Getting lemmas and metadata together

In [10]:
%%time
# Read the tokenised speeches: every CSV row becomes one list of strings.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform default, and newline='' is what the csv module asks for so that
# line breaks embedded in quoted fields are parsed correctly.
with open('/kaggle/input/utf8tokenizedspeeches/TokenizedSpeeches_utf-8.csv', 'r',
          encoding='utf-8', newline='') as read_obj:
    # pass the file object to reader() and materialise all rows in one go
    csv_reader = reader(read_obj)
    lemmasList = list(csv_reader)
print(len(lemmasList), 'Rows read')

In [11]:
%%time
# Wrap the rows read from the CSV in a single-column ('Lemmas') DataFrame,
# one row of the frame per CSV row.
dictOfLemmas = dict(Lemmas=lemmasList)
lemmasDf = pd.DataFrame(data=dictOfLemmas)

In [12]:
lemmasDf

In [13]:
# Load the pickled speeches object into `df`.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# this file must come from a trusted source.
with open('/kaggle/input/hansard-speeches-lemmatized/hansard-speeches-post2010.pkl', 'rb') as speeches_pickle:
    df = pickle.load(speeches_pickle)

In [14]:
#since index was missing values and didn't match with the lemmasDf index
df = df.reset_index(drop=True)

In [15]:
df = df.join(lemmasDf)

# Dividing the corpus and training models before and after the Brexit referendum

In [16]:
# Split the speeches around the Brexit referendum: rows dated up to and
# including eventDate form period t1, rows dated after it form period t2.
eventDate = '2016-06-23 23:59:59'
is_t1 = df['date'] <= eventDate
is_t2 = df['date'] > eventDate
df_t1 = df.loc[is_t1]
df_t2 = df.loc[is_t2]

In [17]:
df_t1.shape

# **Eliminating records for MPs that switched parties (optional step for Retrofitting steps)**

In [18]:
df_t2.shape

In [19]:
df_t1[df_t1['display_as']=='Boris Johnson'].shape

In [20]:
df_t2[df_t2['display_as']=='Boris Johnson'].shape

In [21]:
df_t1['date'].describe()

In [22]:
df_t2['date'].describe()

In [23]:
# Check whether any MP changed party within the t2 corpus.
# The first number is how many distinct mnis_id values occur; the second is how
# many distinct (mnis_id, party) pairs occur. If the second is larger, at least
# one MP appears under more than one party.

print(len(df_t2['mnis_id'].value_counts()))
print(len(df_t2[['mnis_id','party']].value_counts()))

'''df_t1[df_t1['mnis_id']=='1435'] 
#In conservative and in independent parties, check after removing

Examples from df_t2 - ('105', 'Conservative'), ('105', 'Independent'), ('116', 'Conservative'), ('116', 'Independent'), ('1201', 'Conservative'), ('1201', 'Independent'), ('1511', 'Labour'), ('1511', 'Independent'), ('1523', 'Conservative'), ('1523', 'Independent'),

Could have changed more than once, e.g.('3938', 'Conservative'), ('3938', 'Independent'), ('3938', 'The Independent Group for Change'), ('3938', 'Change UK - The Independent Group'''

In [24]:
def removePartyChangingMPs(dfPartyChange):
    """
    Return the rows of `dfPartyChange` that belong to MPs who kept one party.

    An MP (identified by the 'mnis_id' column) counts as having changed party
    when more than one distinct value of the 'party' column occurs for that id.
    All rows of such MPs are dropped; every other row is kept, in the original
    order. The input frame itself is not modified.

    Two progress lines are printed: the ids that changed party, and how many
    ids are associated with exactly one party.

    Parameters
    ----------
    dfPartyChange : pandas.DataFrame
        Must contain the columns 'mnis_id' and 'party'.

    Returns
    -------
    pandas.DataFrame
        The filtered frame (a new object).
    """
    # Number of distinct parties seen for every MP (index is sorted by mnis_id).
    partiesPerMp = dfPartyChange.groupby('mnis_id')['party'].nunique()

    # More than one party for the same id means the MP switched party.
    partyChangeMPs = partiesPerMp[partiesPerMp > 1].index.tolist()
    print('MPs that changed parties in the corpus -', partyChangeMPs)

    unchangedCount = int((partiesPerMp == 1).sum())
    print(str(unchangedCount) + ' MPs did not change party in the corpus')

    # Keep everything EXCEPT the party switchers. (The previous version negated
    # membership in the list of non-switchers, which kept only the switchers.)
    return dfPartyChange[~dfPartyChange['mnis_id'].isin(partyChangeMPs)]

In [None]:
# Apply removePartyChangingMPs to each period's corpus and print the frame
# shape before and after the call.
# NOTE: df_t1 and df_t2 are rebound to the filtered result, so re-running this
# cell operates on the already-filtered frames.
print('Before removal of MPs that changed parties',df_t1.shape)
df_t1 = removePartyChangingMPs(df_t1)
print('After removal of MPs that changed parties',df_t1.shape)

print('Before removal of MPs that changed parties',df_t2.shape)
df_t2 = removePartyChangingMPs(df_t2)
print('After removal of MPs that changed parties',df_t2.shape)

In [25]:
%%time
model1 = gensim.models.Word2Vec.load('../input/models12/model1')
model2 = gensim.models.Word2Vec.load('../input/models12/model2')


In [26]:
%%time
'''model1 = gensim.models.Word2Vec(df_t1['Lemmas'], min_count=1, vector_size=300, window = 5, sg = 1)
model2 = gensim.models.Word2Vec(df_t2['Lemmas'], min_count=1, vector_size=300, window = 5, sg = 1)'''

'''
# Saving models 

os.makedirs('./original-models')
model1.save('./original-models/model1')
model2.save('./original-models/model2')
!zip -r file.zip /kaggle/working/original-models'''

# Choosing intersecting vocabulary and Aligning models 

In [27]:
def intersection_align_gensim(m1, m2, words=None):
    """
    Intersect two gensim word2vec models, m1 and m2, in place.

    Only the vocabulary shared by both models is kept. If `words` is set (as a
    list or set), the vocabulary is intersected with it as well. Keys are
    re-indexed 0..N-1 in order of descending frequency (the sum of the "count"
    attribute from m1 and m2), so that afterwards row i of `m1.wv.vectors` and
    row i of `m2.wv.vectors` belong to the same word.

    For each model the following are rewritten consistently:
        -- `wv.vectors`       (rows selected and reordered)
        -- `wv.key_to_index`  and `wv.index_to_key`
        -- every array in `wv.expandos` (e.g. "count"), so that
           `get_vecattr(word, "count")` still returns that word's own count
        -- `wv.norms` is reset to None because cached norms no longer match

    Parameters
    ----------
    m1, m2 : objects exposing a gensim-4 style `.wv` (KeyedVectors)
    words : iterable of str, optional
        Extra restriction on the vocabulary to keep.

    Returns
    -------
    tuple
        `(m1, m2)` -- the same objects that were passed in, modified in place.
    """
    print('Vocab function')
    # Get the vocab for each model
    vocab_m1 = set(m1.wv.index_to_key)
    vocab_m2 = set(m2.wv.index_to_key)

    # Find the common vocabulary
    common_vocab = vocab_m1 & vocab_m2
    if words: common_vocab &= set(words)
    # Debug sample of one key per set; next(..., None) avoids an IndexError
    # when a set is empty.
    print('vocabs', next(iter(common_vocab), None), next(iter(vocab_m1), None), next(iter(vocab_m2), None))
    print('1. Common vocab length', len(common_vocab))

    # No work needed only if the vocabularies are identical AND already stored
    # in the same row order; equal sets alone do not guarantee aligned rows.
    same_vocab = not vocab_m1 - common_vocab and not vocab_m2 - common_vocab
    if same_vocab and list(m1.wv.index_to_key) == list(m2.wv.index_to_key):
        return (m1,m2)

    # Otherwise sort by frequency (summed for both). This must happen before
    # either model is modified, because it reads the current counts.
    common_vocab = list(common_vocab)
    common_vocab.sort(key=lambda w: m1.wv.get_vecattr(w, "count") + m2.wv.get_vecattr(w, "count"), reverse=True)
    print('2. Common vocab length', len(common_vocab))

    # Then for each model...
    for m in [m1, m2]:
        # Old row index of every kept word, listed in the new order
        indices = np.asarray([m.wv.key_to_index[w] for w in common_vocab], dtype=np.intp)

        # Replace the vector matrix with the selected/reordered rows
        m.wv.vectors = np.asarray(m.wv.vectors)[indices]

        # Per-key attributes are stored in arrays indexed like the vectors, so
        # they have to be reordered the same way or they end up describing the
        # wrong words.
        expandos = getattr(m.wv, 'expandos', None) or {}
        for attr, values in list(expandos.items()):
            expandos[attr] = np.asarray(values)[indices]

        # Cached vector norms belong to the old row layout
        m.wv.norms = None

        # Replace the key <-> index mappings with the new ordering
        m.wv.key_to_index = {key: new_index for new_index, key in enumerate(common_vocab)}
        m.wv.index_to_key = list(common_vocab)

        print(len(m.wv.key_to_index), len(m.wv.vectors))
    print('Vocab function returning models with shapes', m1.wv.vectors.shape, m2.wv.vectors.shape)
    return (m1,m2)

In [28]:
# Function to align two spaces with orthogonal Procrustes
def smart_procrustes_align_gensim(base_embed, other_embed, words=None):
    """
    Procrustes-align two gensim word2vec models so that the same word can be
    compared across models.

    Original script: https://gist.github.com/quadrismegistus/09a93e219a6ffc4f216fb85235535faf
    Code ported from HistWords <https://github.com/williamleif/histwords> by William Hamilton <wleif@stanford.edu>.

    Steps:
      1. Intersect the vocabularies with `intersection_align_gensim`, which
         modifies BOTH models in place.
      2. Take the unit-normalised vector matrices of both models and compute
         the orthogonal matrix from the SVD of `other.T @ base`.
      3. Replace `other_embed.wv.vectors` with the vectors multiplied by that
         matrix. `base_embed`'s vectors are not rotated.

    Parameters
    ----------
    base_embed : model whose space is the alignment target
    other_embed : model whose vectors are rotated into the base space
    words : iterable of str, optional
        If set, the shared vocabulary is additionally intersected with it.

    Returns
    -------
    The `other_embed` object that was passed in, after modification.
    """
    print('shapes', base_embed.wv.vectors.shape, other_embed.wv.vectors.shape)
    print('Hillow hillow.. In Smart procrustes align gensim function, received',base_embed, other_embed)

    # make sure vocabulary and indices are aligned
    in_base_embed, in_other_embed = intersection_align_gensim(base_embed, other_embed, words=words)
    print('a, the shapes received are', in_base_embed.wv.vectors.shape, other_embed.wv.vectors.shape)
    print(str(len(list(in_base_embed.wv.index_to_key))))
    print(str(len(list(in_other_embed.wv.index_to_key))))

    # re-filling the normed vectors (the vector matrices may just have changed)
    in_base_embed.wv.fill_norms(force=True)
    in_other_embed.wv.fill_norms(force=True)

    # get the (normalized) embedding matrices
    base_vecs = in_base_embed.wv.get_normed_vectors()
    other_vecs = in_other_embed.wv.get_normed_vectors()

    # just a matrix dot product with numpy
    m = other_vecs.T.dot(base_vecs)

    # SVD method from numpy; the third return value is already transposed
    u, _, v = np.linalg.svd(m)

    # orthogonal matrix mapping the other space onto the base space
    ortho = u.dot(v)

    # Replace original array with modified one, i.e. multiplying the embedding matrix by "ortho"
    other_embed.wv.vectors = (other_embed.wv.vectors).dot(ortho)
    print('Procrustes function returning')
    return other_embed

In [29]:
%%time
# Applying the functions to our models

smart_procrustes_align_gensim(model1, model2, words=None)

In [None]:
%%time
'''
model_df_t1_103 = gensim.models.Word2Vec.load('../input/multiplemodelstrial/model_df_t1_103')
model_df_t1_112 = gensim.models.Word2Vec.load('../input/multiplemodelstrial/model_df_t1_112')
model_df_t1_114 = gensim.models.Word2Vec.load('../input/multiplemodelstrial/model_df_t1_114')
model_df_t1_116 = gensim.models.Word2Vec.load('../input/multiplemodelstrial/model_df_t1_116')
model_df_t1_151 = gensim.models.Word2Vec.load('../input/multiplemodelstrial/model_df_t1_151')
'''


In [None]:
'''print('Original lengths',len(model_df_t1_103.wv.index_to_key), len(model_df_t1_112.wv.index_to_key), len(model_df_t1_114.wv.index_to_key),len(model_df_t1_116.wv.index_to_key), len(model_df_t1_151.wv.index_to_key))
print('lengths',len(model_df_t1_103.wv.index_to_key), len(model_df_t1_112.wv.index_to_key), len(model_df_t1_114.wv.index_to_key),len(model_df_t1_116.wv.index_to_key), len(model_df_t1_151.wv.index_to_key))'''

In [None]:
%%time

#listOfModels = [model_df_t1_103,model_df_t1_112,model_df_t1_114,model_df_t1_116, model_df_t1_151]
#otherEmbedReturned = smart_procrustes_align_gensim(otherEmbedReturned,model_df_t1_116)
#otherEmbedReturned == model_df_t1_103
#otherEmbedReturned == model_df_t1_112

In [13]:
model1.wv.similarity("brexit", "leave")


In [14]:
# after alignment 
model1.wv.similarity("brexit", "leave")


In [15]:
model2.wv.similarity("brexit", "leave")


In [16]:
model1.wv.similarity("brexit", "remain")


In [17]:
model2.wv.similarity("brexit", "remain")


In [37]:
model1.wv.similarity("snowflake", "generation")


In [43]:
# Print the vocabulary index of 'snowflake' in model1 (prints nothing if the
# word is absent). Uses the model's key -> index map instead of scanning the
# whole vocabulary list.
snowflake_index = model1.wv.key_to_index.get('snowflake')
if snowflake_index is not None:
    print('yes', snowflake_index)

In [44]:
# Print the vocabulary index of 'snowflake' in model2 (prints nothing if the
# word is absent). Uses the model's key -> index map instead of scanning the
# whole vocabulary list.
snowflake_index = model2.wv.key_to_index.get('snowflake')
if snowflake_index is not None:
    print('yes', snowflake_index)

In [80]:
model2.wv.similarity("snowflake", "generation")


# Measuring change in words

In [104]:
def cosine_similarity(word):
  """Return 1 minus the cosine distance between `word`'s vector in the
  module-level `model1` and its vector in the module-level `model2`."""
  vector_t1 = model1.wv[word]
  vector_t2 = model2.wv[word]
  cosine_distance = spatial.distance.cosine(vector_t1, vector_t2)
  return 1 - cosine_distance

In [105]:
%%time
# One row per word in model1's vocabulary: the word, its cross-model cosine
# similarity, and its "count" attribute in each model.
word_rows = (
    [w, cosine_similarity(w), model1.wv.get_vecattr(w, "count") , model2.wv.get_vecattr(w, "count") ]
    for w in model1.wv.index_to_key
)
cosine_similarity_df = pd.DataFrame(word_rows, columns = ('Word', 'Cosine_similarity', "Frequency_t1", "Frequency_t2"))

# Derived frequency columns: t1/t2 ratio and t1+t2 total
frequency_t1 = cosine_similarity_df['Frequency_t1']
frequency_t2 = cosine_similarity_df['Frequency_t2']
cosine_similarity_df['FrequencyRatio'] = frequency_t1/frequency_t2
cosine_similarity_df['TotalFrequency'] = frequency_t1 + frequency_t2

# Least similar words first
cosine_similarity_df_sorted = cosine_similarity_df.sort_values(by='Cosine_similarity', ascending=True)

In [106]:
# Entire corpus cosine similarity
# describe() is printed explicitly: only a cell's last expression is displayed
# automatically, and here the last statement is the histogram assignment.
print(cosine_similarity_df['Cosine_similarity'].describe())
hist = cosine_similarity_df['Cosine_similarity'].hist()


In [107]:
# Experiments with frequency change to see frequency change in t1 & t2
'''print(cosine_similarity_df['FrequencyRatio'].describe())
print(cosine_similarity_df[cosine_similarity_df['FrequencyRatio']<1.6]['Cosine_similarity'].describe())
print(cosine_similarity_df[cosine_similarity_df['FrequencyRatio']>1.83]['Cosine_similarity'].describe())
print (cosine_similarity_df[cosine_similarity_df['FrequencyRatio']<1.83]['Cosine_similarity'].describe())'''

In [125]:
# Shouldn't change list of words 
common_100_words = list(cosine_similarity_df.sort_values(by='TotalFrequency',ascending=False).head(100)['Word'])
common_100_words

In [109]:
# Experiments with outlier frequency ratios
'''cosine_similarity_df_sorted[cosine_similarity_df_sorted['FrequencyRatio']>=3.000000].sort_values(by = 'Cosine_similarity', ascending = True).head(50)
cosine_similarity_df_sorted[cosine_similarity_df_sorted['FrequencyRatio']<=1.500000].sort_values(by = 'Cosine_similarity', ascending = True).head(50)

cosine_similarity_df_sorted.tail(100).Word.tolist()'''

In [132]:
# Keep only words used often enough to be meaningful: combined t1 + t2
# frequency strictly greater than 20. The result stays ordered by ascending
# cosine similarity.
combined_frequency = cosine_similarity_df_sorted['Frequency_t1'] + cosine_similarity_df_sorted['Frequency_t2']
has_decent_frequency = combined_frequency > 20
cosine_similarity_df_sorted = (
    cosine_similarity_df_sorted
    .loc[has_decent_frequency]
    .sort_values(by='Cosine_similarity', ascending = True)
)


In [133]:
%%time
# Tag the whole 'Word' column in one nltk.pos_tag call; each result is a
# (word, tag) pair and only the tag is stored.
tagged_words = nltk.pos_tag(cosine_similarity_df_sorted['Word'])
cosine_similarity_df_sorted['wordCategory'] = [tag for _, tag in tagged_words]

# Look up each tag in the tagset help table loaded into `tagdict`
cosine_similarity_df_sorted['wordCategoryDescription'] = [
    tagdict[tag] for tag in cosine_similarity_df_sorted['wordCategory']
]

#cosine_similarity_df_sorted['wordCategory'].value_counts()
#cosine_similarity_df.groupby('wordCategoryDescription', as_index=False)['Cosine_similarity'].mean().sort_values(by='Cosine_similarity', ascending=True)

#Checking for conjunctions and prepositions but lots of spam words were categorized so 
#cosine_similarity_df[cosine_similarity_df['wordCategory']=='IN'].sort_values(by='Cosine_similarity',ascending=True)


In [134]:
# Words we expect little change in (Because of high frequency of usage)
# Manually to eliminate words like 'leave', 'deal'

# Rows for the 100 most frequent words, minus 'deal'.
# 'deal' is excluded since that's one word we have injected in the list of
# words expected to change. Both conditions go into one boolean mask so that
# no in-place drop is performed on a slice of another DataFrame.
is_common_word = cosine_similarity_df_sorted['Word'].isin(common_100_words)
is_deal = cosine_similarity_df_sorted['Word'] == 'deal'
cosine_sim_small_change_expected = cosine_similarity_df_sorted[is_common_word & ~is_deal]
print(cosine_sim_small_change_expected['Cosine_similarity'].describe())

cosine_sim_small_change_expected.head(30)

In [135]:
# Rows for the 100 most frequent words, minus 'deal'.
# 'deal' is excluded since that's one word we have injected in the list of
# words expected to change. Both conditions go into one boolean mask so that
# no in-place drop is performed on a slice of another DataFrame.
is_common_word = cosine_similarity_df_sorted['Word'].isin(common_100_words)
is_deal = cosine_similarity_df_sorted['Word'] == 'deal'
cosine_sim_small_change_expected = cosine_similarity_df_sorted[is_common_word & ~is_deal]

print(cosine_sim_small_change_expected['Cosine_similarity'].describe())
cosine_sim_small_change_expected['Cosine_similarity'].hist()

In [79]:
# Change not expected - 
'''NN     42
JJ     18
VB      8
RB      7
NNS     5
MD      5
IN      3
CD      3
VBP     2
VBD     2
VBN     2
VBZ     1
PRP     1
VBG     1


NN, NNS Nouns- 47%
JJ Adjectives 18%
Verbs - 16%
RB Adverbs 7%
MD Modals 5%
IN Prepositions or conjunctions - 3%
CD - Numerals or cardinals 3% 
Pronouns - 1%'''

# change expected DF list - 
'''NN     14
JJ      7
VBD     4
VBG     3
NNS     2
RB      2
VBP     1
CC      1
VB      1'''
'''(NN, NNS) Nouns  45%, 
(JJ) Adjectives 20% , 
(VBD, VBG, VB, VBP) Verbs 26%, 
RB Adverb 6%'''

In [156]:
'''brexitNeighbours = model1.wv.similar_by_word('brexit', 100) + model2.wv.similar_by_word('brexit',100)
brexitNeighbours = [x[0] for x in brexitNeighbours]

# Using initial list of words 
# Words we expect more change in (Brexit-relevant mainly)
# Identified by scanning articles on the topic and manually scanning low frequency words( >150 though)

brexit_words = ['brexit','leave','remain','europe', 'eurozone', 'euro', 'UKIP', 'borders', 'border', 'lines', 'career',
                'careers', 'population', 'group', 'community','eu','unicorn','backstop','exit','brexiting',
                'bregret','brexitesque', 'allies', 'investors', 'legacy', 'challenging', 'remaining', 'educational',
                'appalling', 'independence', 'ballot','equal','lies', 'promises', 'forget']

brexit_words = brexit_words + brexitNeighbours 
cosine_sim_change_expected = cosine_similarity_df_sorted[cosine_similarity_df_sorted['Word'].isin(brexit_words)]
print(cosine_sim_change_expected['Cosine_similarity'].describe())
cosine_sim_change_expected.head(30)'''

In [187]:
# Hand-picked Brexit vocabulary.
# Every entry must be followed by a comma: two adjacent string literals are
# silently concatenated by Python ('breferendum' 'brextremist' used to become
# the single bogus token 'breferendumbrextremist').
brexit_words = ['leavers','remainers', 'remainiacs', 'milkshake', 'snowflake', 'snowflakes','unicorn', 'unicorns','stockpiling',
                'prorogation', 'soft', 'hard','brexiety', 'breferendum', 'brextremist', 'brexistential', 'brextension',
                'flextension', 'brexiter','brexiteer', 'brexitology', 'brexitsphere','brentry', 'brex',
                'regrexit','remoaners', 'cakeism', 'cake','prorogue', 'prorogation','prorogued', 'managed', 
                'canada', 'norway', 'plus','withdrawal','agreement', 'foreign','surrender', 'bill','war',
                'article','50', 'no', 'deal', 'invoke','trigger', 'invoked','triggered', 'triggering','invoking',
                'oven','ready','control',  'red', 'line', 'cliff','edge', 'gammon', 'blindfold']

# From general knowledge + news sources of topics around Brexit 
other_brexit_words = ['backroom', 'euroscepticism', 'referendum', 'expulsion', 'seaborne', 'eurotunnel']

# The 100 least frequent words among those with a total frequency above 200
uncommon_100_words = list(cosine_similarity_df[cosine_similarity_df['TotalFrequency']>200].sort_values(by='TotalFrequency',ascending=True).head(100)['Word'])


neighbourTuples=[]
brexitNeighbours = []

# Choosing neighbours of only very-brexit words
neighbourKeys = ['brexit','exit', 'leave', 'withdrawal','departure', 'departures', 'remainers','surrender', 
                'brexiteer', 'brexiteers', 'euro']

# For every key, collect its 5 nearest neighbours in each model; only the
# neighbour word (first element of each (word, similarity) tuple) is kept.
for i in neighbourKeys:
    neighbourTuples = model1.wv.similar_by_word(i, 5) + model2.wv.similar_by_word(i,5)
    brexitNeighbours += [x[0] for x in neighbourTuples]



# Including neighbours of words with most semantic shift and of prime importance with the context of Brexit
# Raises the mean cos similarity slightly but we get more diversity

brexit_words += other_brexit_words + uncommon_100_words[0:10] + brexitNeighbours 


# Rows of the frequency-filtered similarity table whose word is in the list
cosine_sim_change_expected = cosine_similarity_df_sorted[cosine_similarity_df_sorted['Word'].isin(brexit_words)]

print(cosine_sim_change_expected['Cosine_similarity'].describe())
#cosine_sim_change_expected['Cosine_similarity'].hist()

In [196]:
notShouldChange = cosine_sim_small_change_expected['Word'].to_list()
shouldChange = cosine_sim_change_expected['Word'].to_list()

In [198]:
dictLists = {'shouldChange': shouldChange, 'notShouldChange': notShouldChange}

In [204]:
len(notShouldChange)

In [195]:
shouldChange

# **BELOW CODE NEED NOT BE EXECUTED**

In [None]:
# Snippets and experiments
words_chosen = ['brexit', 'leave', 'remain', 'europe', 'eurozone', 'euro', 'UKIP', 'population', 'group', 'community']
'''cosine_similarity_chosen = cosine_similarity_df_sorted[cosine_similarity_df_sorted['Word'].isin(words_chosen)]
cosine_similarity_chosen

#Chosen most working reverse - manually analysed ones with small change, to discuss with Chico
#words_chosen_expected_no_change = [ 'population', 'group', 'community', 'said', 'tell', 'told', 'get', 'however', 'year', 'given', 'might']
#cosine_similarity_df_sorted[cosine_similarity_df_sorted['Word'].isin(words_chosen_expected_no_change)]'''

In [None]:
'''# Checking lower similarity words for any brexit related ones
cosine_similarity_df_sorted[cosine_similarity_df_sorted['TotalFrequency']>100].sort_values(by='Cosine_similarity').head(50)

cosine_similarity_df[cosine_similarity_df['TotalFrequency']>200].sort_values(by='TotalFrequency',ascending=True).head(100)['Cosine_similarity'].describe()'''

In [30]:
print( 'A', model1.wv.similar_by_word('withdrawal', 10))
print(' -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x')                       
print(model2.wv.similar_by_word('withdrawal',10))


In [31]:
print( 'A', model1.wv.similar_by_word('snowflake', 10))
print(' -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x')                       
print(model2.wv.similar_by_word('snowflake',10))


In [65]:
print( 'A', model1.wv.similar_by_word('remainers', 10))
print(' -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x')                       
print(model2.wv.similar_by_word('remainers',10))


In [64]:
print( 'A', model1.wv.similar_by_word('surrender', 10))
print(' -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x')                       
print(model2.wv.similar_by_word('surrender',10))


In [66]:
print( 'A', model1.wv.similar_by_word('triggering', 10))
print(' -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x')                       
print(model2.wv.similar_by_word('triggering',10))


In [67]:
print( 'A', model1.wv.similar_by_word('stockpiling', 10))
print(' -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x -x')                       
print(model2.wv.similar_by_word('stockpiling',10))


# Splitting speeches by MPs and time

In [None]:
# Code to create word2Vec models and save them
# Commenting since we have already done that and are going to load them 
'''%%time
# Lets only consider the MPs who have speeches in both T1 & T2 time periods, intersection between two DFs 

t1List = pd.unique(df_t1['mnis_id']).tolist()
t2List = pd.unique(df_t2['mnis_id']).tolist()

intersectedList = list(set(t1List).intersection(t2List))

print(len(intersectedList))
# 651 MPs to be considered


#  --------- new cell --------

# Now modify original df_t1 and df_t2 DFs to only contain speeches from the common MPs 

df_t1 = df_t1[df_t1['mnis_id'].isin(intersectedList)]
df_t2 = df_t2[df_t2['mnis_id'].isin(intersectedList)]


# ---------new cell ------------

%%time
# New code for dividing corpus by time and MPs 

dictSpeechesByMp = {}

for mpId in intersectedList:
    for dfTime in ['df_t1','df_t2']:
        dfName = dfTime + '_'+ mpId
        if(dfTime == 'df_t1'):
            dictSpeechesByMp[dfName]=df_t1[df_t1['mnis_id']==mpId]
        elif (dfTime == 'df_t2'):
            dictSpeechesByMp[dfName]=df_t2[df_t2['mnis_id']==mpId]

# -----------new cell ----------------
# Do not run, for verification only

len(dictSpeechesByMp.keys()) 
# Result 1302 

df_t1['mnis_id'].value_counts()
df_t2['mnis_id'].value_counts()
# Result is 651 each - hence all covered in our dictionary DFs 

# --------------new cell ---------------

os.makedirs('./models-by-mp-and-time')
 
# --------------new cell ---------------

%%time 
models_folder = './models-by-mp-and-time'

count =1
for dframe in dictSpeechesByMp: 
    #print(dframe)
    if (count<652):
        model = gensim.models.Word2Vec(dictSpeechesByMp[dframe]['Lemmas'], min_count=1, vector_size=300, window = 5, sg = 1)
        modelName ='model_'+ dframe
        print('Saving model number', count, modelName)

        model.save(os.path.join(models_folder, modelName))
    elif(count>651):
        print('651 files done and exiting')
        break
    count = count+1
    #if(count==2):
     #   break;
        
# ------------ new cell --------
%%time
!zip -r file.zip /kaggle/working/models-by-mp-and-time

last -adding: kaggle/working/models-by-mp-and-time/model_df_t2_450 (deflated 8%)'''