In [9]:
from pathlib import Path
from top2vec import Top2Vec
import pickle
import numpy as np

from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import strip_tags

In [10]:
# Directory (relative to the working directory) where pickled artefacts live.
DATA_DIR = Path('./data')
def load(filename):
    """Unpickle and return the object stored in ``DATA_DIR/filename``.

    The file is opened in a ``with`` block so the handle is closed even
    when unpickling raises.

    NOTE: ``pickle.load`` can execute arbitrary code; only use this on
    files produced by this project.
    """
    with open(DATA_DIR/filename, "rb") as handle:
        return pickle.load(handle)
    
def save(data, filename):
    """Pickle ``data`` to ``DATA_DIR/filename`` with the highest pickle protocol."""
    target = DATA_DIR / filename
    with target.open('wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:

# Load the pre-processed moties frame. The context manager closes the file
# handle (the original left it open for the lifetime of the kernel).
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("moties_processed_df.pickle", "rb") as file:
    df = pickle.load(file)
print('before removal empty texts',len(df))

# remove moties without text
mask = (df['Text']=='') | (df['Text'].isna())
# .copy() makes the filtered frame independent, so the column assignments
# in later cells do not operate on a slice of the unfiltered frame.
df = df.loc[~mask].copy()
print('after removal empty texts',len(df))

# Raw texts, in row order, used as the training corpus further down.
documents = df['Text'].values

before removal empty texts 29514
after removal empty texts 29484


In [12]:
print(len(df))

# Vote columns contain 'Stem_' in their name; 'Stem_persoon' is excluded.
stem_column = [col for col in df.columns if 'Stem_' in col and col != 'Stem_persoon']
# Same columns with the first five characters (the 'Stem_' prefix) removed.
stem_column_adj = [col[5:] for col in stem_column]

# One list of vote values per motie, in the same column order as stem_column.
stem_array = df[stem_column].values.tolist()
assert len(stem_array[0]) == len(stem_column)

# Per motie: names whose vote value equals 1 (voor) and 0 (tegen).
voor = [[partij for partij, stem in zip(stem_column_adj, motie) if stem == 1] for motie in stem_array]
tegen = [[partij for partij, stem in zip(stem_column_adj, motie) if stem == 0] for motie in stem_array]
df['Partijen_Voor'] = voor
df['Partijen_Tegen'] = tegen

# Positional row number, used later as the index of the production frame.
df['Index'] = list(range(len(df)))

29484


In [18]:
# got the code for making the bigram part from https://lppier.github.io/
# Tokenize every document (HTML tags stripped, accents removed) and fit a
# bigram model on the token lists.
sentence_stream = [
    simple_preprocess(strip_tags(text), deacc=True)
    for text in documents
]
bigram = Phrases(sentence_stream, min_count=10)
bigram_phraser = Phraser(bigram)

# Last whitespace-separated token of each submitter name, lower-cased.
indieners = {
    name_parts[-1].lower()
    for name_parts in df['Indiener_persoon'].str.split()
    if name_parts
}
# Tokens starting with 'x' but not with 'xin'.
# NOTE(review): the name suggests year-like tokens; confirm what these are.
years = {
    token
    for tokens in sentence_stream
    for token in tokens
    if token.startswith('x') and not token.startswith('xin')
}
manual_stopwords = {
    'faber', 'lacin', 'kroger', 'beschikt', 'die', 'vaststelling', 'lid',
    'i', 'ii', 'iii', 'iv', 'vi', 'viii',
}
# Everything the tokenizer below removes before applying the phraser.
stopwords = indieners.union(years, manual_stopwords)
   
def bigram_stopword_preprocess(doc):
    """Tokenize ``doc``, drop stopwords and apply the trained bigram phraser.

    Tags are stripped and accents removed before tokenizing; tokens found in
    the notebook-level ``stopwords`` set are removed, and the remaining
    tokens are passed through ``bigram_phraser``.
    """
    tokens = simple_preprocess(strip_tags(doc), deacc=True)
    kept_tokens = [token for token in tokens if token not in stopwords]
    return bigram_phraser[kept_tokens]

In [19]:
# Train Top2Vec on the raw documents with the custom tokenizer.
# The recorded log below shows this run taking roughly an hour.
model = Top2Vec(
    documents,
    speed='deep-learn',
    tokenizer=bigram_stopword_preprocess,
    workers=4,
)
model.get_num_topics()

2021-03-03 12:21:44,496 - top2vec - INFO - Pre-processing documents for training
2021-03-03 12:23:30,780 - top2vec - INFO - Creating joint document/word embedding
2021-03-03 13:16:48,205 - top2vec - INFO - Creating lower dimension embedding of documents
2021-03-03 13:18:06,879 - top2vec - INFO - Finding dense areas of documents
2021-03-03 13:18:13,374 - top2vec - INFO - Finding topics


247

In [20]:
# Uncomment to persist the freshly trained model; the next cell loads it
# from this same path.
# model.save("data/doc2vec_deep_bigram_enhanced_stopwords")

In [13]:
# Load the previously saved model from disk; this rebinds `model`, replacing
# any model trained earlier in the session.
model = Top2Vec.load("data/doc2vec_deep_bigram_enhanced_stopwords")
# Displayed as the cell output: number of topics in the loaded model.
model.get_num_topics()

247

In [21]:
def get_reduced_topics(num_topics):
    """Reduce the notebook-level ``model`` to ``num_topics`` topics.

    Returns a pair ``(reduced_topics, topic_words)``: the hierarchy returned
    by ``model.hierarchical_topic_reduction`` as a tuple of sorted tuples
    (hashable, so it can be compared with set operations), and the topic
    words reported by ``model.get_topics(reduced=True)``.
    """
    print(f'performing reduction to {num_topics} topics')
    hierarchy = model.hierarchical_topic_reduction(num_topics)
    # Only the words are returned; scores and topic numbers are discarded.
    topic_words, _word_scores, _topic_nums = model.get_topics(reduced=True)
    frozen_hierarchy = tuple(tuple(sorted(members)) for members in hierarchy)
    return frozen_hierarchy, topic_words

def find_diff(reduced1, reduced2, reverse=False):
    """Return the positions of topics present in one reduction but not the other.

    By default the positions refer to ``reduced1`` and mark the topics that
    do not occur in ``reduced2`` (the topics that were merged). With
    ``reverse=True`` the roles are swapped: positions refer to ``reduced2``
    and mark the topics absent from ``reduced1``.
    """
    if reverse:
        source, other = reduced2, reduced1
    else:
        source, other = reduced1, reduced2
    only_in_source = set(source) - set(other)
    return [pos for pos, topic in enumerate(source) if topic in only_in_source]


def print_merge(large, small, num_words=50):
    """Print which topics changed between two reductions.

    ``large`` and ``small`` are ``(reduced_topics, topic_words)`` pairs as
    returned by ``get_reduced_topics``. For every topic of ``large`` that is
    missing from ``small`` the first ``num_words`` topic words are printed
    under 'old topics'; topics of ``small`` missing from ``large`` are
    printed under 'new topic'.
    """
    before_topics, after_topics = large[0], small[0]
    print(f'\ninspecting difference from {len(before_topics)} to {len(after_topics)} topics')
    print('old topics')
    for idx in find_diff(before_topics, after_topics):
        print(large[1][idx][:num_words])
    print('new topic')
    for idx in find_diff(before_topics, after_topics, reverse=True):
        print(small[1][idx][:num_words])

def find_optimal_num_topics(start=17, stop=6):
    """Print, step by step, which topics merge as the topic count shrinks.

    Reductions are computed for every topic count from ``start`` down to
    ``stop + 1``; for each consecutive pair the merge is printed with
    ``print_merge``. The defaults reproduce the previously hard-coded range
    (17 down to 7 topics).

    Parameters
    ----------
    start : int
        Largest number of topics to reduce to.
    stop : int
        Exclusive lower bound; the smallest reduction has ``stop + 1`` topics.

    Every reduction calls ``model.hierarchical_topic_reduction`` on the
    notebook-level model.
    NOTE(review): this presumably leaves the model reduced to the last
    count computed (``stop + 1``) - re-run the chosen reduction afterwards.
    """
    reductions = {n: get_reduced_topics(n) for n in range(start, stop, -1)}
    # Stop one step earlier than above so that reductions[n - 1] always exists.
    for n in range(start, stop + 1, -1):
        print(n)
        print_merge(reductions[n], reductions[n - 1])
# find_optimal_num_topics()

In [14]:
# Make a choice to how many topics to reduce
num_topics = 16
reduced_topics = model.hierarchical_topic_reduction(num_topics)


In [15]:
# inspect number of documents for each topic
topic_sizes, topic_nums = model.get_topic_sizes(reduced=True)
topic_words, word_scores, topic_nums = model.get_topics(reduced=True)

In [16]:
# Print every reduced topic's position together with its topic words, as a
# basis for choosing the human-readable labels.
for i, t in enumerate(topic_words):
    print(i,t)

0 ['onderwijs_cultuur' 'wetenschap_viii' 'onderwijs' 'leerlingen'
 'voortgezet_onderwijs' 'viii' 'scholen' 'primair_onderwijs' 'mbo'
 'volwassenen_educatie' 'leerling' 'hoger_onderwijs' 'passend_onderwijs'
 'studenten' 'leraren' 'diploma' 'docenten' 'hbo' 'vakken' 'onderwijsveld'
 'funderend_onderwijs' 'docent' 'talenten' 'school' 'opleiding' 'celik'
 'vo' 'mbo_studenten' 'opleidingen' 'schooljaar' 'basisonderwijs'
 'hogescholen' 'curriculum' 'vmbo' 'havo' 'student' 'eindtoets'
 'bekostiging' 'leraar' 'voortgezet_speciaal' 'wet_educatie' 'doorstroom'
 'lerarentekort' 'middelbaar' 'schooladvies' 'wetenschap' 'werkdruk'
 'speciaal_onderwijs' 'klas' 'expertisecentra']
1 ['buitenlandse_zaken' 'jbz_raad' 'landen' 'vn' 'humanitaire'
 'buitenlandse_handel' 'mensenrechten' 'afghanistan' 'syrie' 'jemen'
 'vluchtelingen' 'saudi_arabie' 'turkije' 'bestrijding_internationaal'
 'actuele_situatie' 'verenigde_naties' 'internationale'
 'buitenlands_beleid' 'unhcr' 'libie' 'palestijnse_autoriteit' 'oor

In [17]:
# Switch the inspection variables back to the original (non-reduced) topics.
topic_sizes, topic_nums = model.get_topic_sizes(reduced=False)
topic_words, word_scores, topic_nums = model.get_topics(reduced=False)
# Original topic numbers that were merged into reduced topic 10 (labelled
# 'Klimaat & energie' in the `topics` mapping). The previous version appended
# the whole `topic_nums` array for every match instead of the matching number.
climate_idx = [t for t in topic_nums if t in reduced_topics[10]]


In [18]:
# Document ids are the positions 0..len(documents)-1.
doc_ids = list(range(len(documents)))
# Reduced topic assignment for every document; this rebinds topic_nums,
# topic_words and word_scores to per-document values.
topic_nums, topic_score, topic_words, word_scores = model.get_documents_topics(doc_ids,reduced=True)

In [19]:
# Hand-picked label for every reduced topic number.
# NOTE(review): 5 and 12 share the label 'Zorg' - confirm this is intentional.
topics = {
    0: 'Onderwijs',
    1: 'Buitenlandse zaken',
    2: 'Algemene zaken',
    3: 'Natuur & gaswinning',
    4: 'Landbouw & dierenwelzijn',
    5: 'Zorg',
    6: 'Sociale zaken',
    7: 'Justitie',
    8: 'Pensioenstelsel',
    9: 'Europese Unie',
    10: 'Klimaat & energie',
    11: 'Milieu & regelgeving',
    12: 'Zorg',
    13: 'Openbaar vervoer',
    14: 'Financiele sector',
    15: 'Wonen',
}

# Label every document with the name of its reduced topic.
doc_ids = list(range(len(documents)))
topic_nums, topic_score, topic_words, word_scores = model.get_documents_topics(
    doc_ids, reduced=True
)
topic_names = [topics[topic_num] for topic_num in topic_nums]
assert len(topic_nums) == len(df)
df['Topic'] = topic_names

# Also store the original (non-reduced) topic number of every document.
topic_nums, topic_score, topic_words, word_scores = model.get_documents_topics(
    doc_ids, reduced=False
)
df['Topic_initial'] = topic_nums

In [20]:
# Display the original (non-reduced) topic number assigned to each motie.
df['Topic_initial']

2009Z00479     61
2009Z00488     61
2009Z00489     61
2009Z00490     61
2009Z00483     54
             ... 
2020Z25826     33
2020Z25819    104
2020Z25825    104
2020Z25824    104
2020Z25822     30
Name: Topic_initial, Length: 29484, dtype: int32

## Optional: add a climate deep dive

In [21]:
# Label per original (non-reduced) topic number, used for the climate deep dive.
climate_subtopics = {
    128: 'Afhankelijkheid fossiele brandstoffen',
    165: 'CO2 reductie',
    5: 'Voldoen aan Parijs',
    141: 'Electriciteit',
    126: 'Groningen',
    205: 'Zonnepanelen',
    96: 'Energierekening betalen',
    236: 'Biomassa',
    105: 'Kolencentrales',
    239: 'Windturbines - overlast',
    29: 'Windturbines - subsidie',
}

In [22]:
# For rows labelled 'Klimaat & energie', translate the original topic number
# into its subtopic label. Topic numbers missing from climate_subtopics map to
# NaN; rows outside the selection are not part of the mapped Series and are
# left as NaN when it is aligned on the index during assignment.
df['Klimaat'] = df.loc[df['Topic']=='Klimaat & energie', 'Topic_initial'].map(climate_subtopics)

In [23]:
# Persist a sample of the first 1000 rows next to the full frame.
save(df[:1000], 'df_including_topics.pickle')
save(df, 'df_including_topics_full.pickle')

# Prepare slimmed-down versions for production

In [51]:
# Start from the full frame saved above so this cell can be re-run on its own.
df = load("df_including_topics_full.pickle")
print(len(df))

# Keep only the rows of the 'Rutte III' period.
df = df.loc[df['Kamer'] == 'Rutte III']

# important do this only after all row filters have been set
df = df.set_index('Index', drop=False)

stem_column = [name for name in df.columns if 'Stem_' in name and name != 'Stem_persoon']
required_cols = [
    'Kamer',
    'Jaar',
    'Indienende_partij',
    'BesluitSoort',
    'BesluitTekst',
    'Topic_initial',
    'Indienende_persoon_partij',
    'Partijen_Voor',
    'Partijen_Tegen',
]

# streamlit has problems with category type: 
# https://github.com/streamlit/streamlit/issues/47
# for col in stem_column + required_cols[:-4]:
#     df[col] = df[col].astype('category')

# Drop every column that is neither a vote column nor explicitly required.
df = df.loc[:, stem_column + required_cols]
print(len(df))
save(df, "df_production.pickle")


29484
11448


In [50]:
# Debug cell: shows the index of the production frame as a plain list.
# This prints one entry per row; prefer a slice such as df.index[:10] when
# only a quick look is needed.
list(df.index)

[18036,
 18037,
 18038,
 18039,
 18040,
 18041,
 18042,
 18043,
 18044,
 18045,
 18046,
 18047,
 18048,
 18049,
 18050,
 18051,
 18052,
 18053,
 18054,
 18055,
 18056,
 18057,
 18058,
 18059,
 18060,
 18061,
 18062,
 18063,
 18064,
 18065,
 18066,
 18067,
 18068,
 18069,
 18070,
 18071,
 18072,
 18073,
 18074,
 18075,
 18076,
 18077,
 18078,
 18079,
 18080,
 18081,
 18082,
 18083,
 18084,
 18085,
 18086,
 18087,
 18088,
 18089,
 18090,
 18091,
 18092,
 18093,
 18094,
 18095,
 18096,
 18097,
 18098,
 18099,
 18100,
 18101,
 18102,
 18103,
 18104,
 18105,
 18106,
 18107,
 18108,
 18109,
 18110,
 18111,
 18112,
 18113,
 18114,
 18115,
 18116,
 18117,
 18118,
 18119,
 18120,
 18121,
 18122,
 18123,
 18124,
 18125,
 18126,
 18127,
 18128,
 18129,
 18130,
 18131,
 18132,
 18133,
 18134,
 18135,
 18136,
 18137,
 18138,
 18139,
 18140,
 18141,
 18142,
 18143,
 18144,
 18145,
 18146,
 18147,
 18148,
 18149,
 18150,
 18151,
 18152,
 18153,
 18154,
 18155,
 18156,
 18157,
 18158,
 18159,
 18160,


In [46]:
# Sanity check: look up a row by its 'Index' label.
df.loc[18036, 'Jaar']

2017

In [42]:
# Compact view of the index of the production frame.
df.index

Int64Index([18036, 18037, 18038, 18039, 18040, 18041, 18042, 18043, 18044,
            18045,
            ...
            29474, 29475, 29476, 29477, 29478, 29479, 29480, 29481, 29482,
            29483],
           dtype='int64', name='Index', length=11448)

In [115]:
from collections import Counter

import nltk
# Imported under an alias: a bare `stopwords` import would rebind the
# notebook-level `stopwords` set that bigram_stopword_preprocess reads.
from nltk.corpus import stopwords as nltk_stopwords
# WordCloud was used below without ever being imported (NameError on a
# fresh kernel).
from wordcloud import WordCloud

nltk.download('stopwords')
stop_words = nltk_stopwords.words('dutch')
# Set membership keeps the per-token stopword check cheap.
dutch_stopword_set = set(stop_words)

# Frequency of every non-stopword token across all tokenized documents.
count = Counter(
    word
    for doc in sentence_stream
    for word in doc
    if word not in dutch_stopword_set
)
WordCloud(width=600, height=400, background_color='yellow').generate_from_frequencies(count).to_file('wordcloud.png')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jesse\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
