In [126]:
from pathlib import Path
from top2vec import Top2Vec
import pickle
import numpy as np

from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import strip_tags

In [127]:
DATA_DIR = Path('./data')
def load(filename):
    f = open(DATA_DIR/filename,"rb")
    return pickle.load(f)
    
def save(data, filename):
    with open(DATA_DIR/filename, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [128]:
def add_voor_tegen_index(df):
    print(len(df))
    stem_column = [c for c in df.columns if 'Stem_' in c and c != 'Stem_persoon']
    stem_column_adj = [c[5:] for c in df.columns if 'Stem_' in c and c != 'Stem_persoon']
    stem_array = df[stem_column].values.tolist()
    assert len(stem_array[0]) == len(stem_column)
    voor = [[stem_column_adj[i] for i, stem in enumerate(motie) if stem == 1] for motie in stem_array]
    tegen = [[stem_column_adj[i] for i, stem in enumerate(motie) if stem == 0] for motie in stem_array]
    df['Partijen_Voor'] = voor
    df['Partijen_Tegen'] = tegen
    df['Index']=list(range(len(df)))
    return df

In [129]:
def get_df(only_last_kamer):
    file = open("moties_processed_df.pickle","rb")
    df = pickle.load(file)
    print('before removal empty texts',len(df))

    # remove moties without text
    mask = (df['Text']=='') | (df['Text'].isna())
    df = df.loc[~mask]
    print('after removal empty texts',len(df))
    if only_last_kamer:
        df = df[df['Kabinet']==last_kamer]
        print('after selecting latest cabinet', len(df))
    df = add_voor_tegen_index(df)
    df['BesluitTekst'] = df['BesluitTekst'].str.replace('.','')
    return df, df['ClippedText'].values

## Model Training

In [130]:
only_last_kamer = False
last_kamer = 'Rutte IV'

if only_last_kamer:
    df, documents = get_df(True)
else:
    df, documents = get_df(False)
    
# got the code for making the bigram part from https://lppier.github.io/
sentence_stream = [simple_preprocess(strip_tags(doc), deacc=True) for doc in documents]
bigram = Phrases(sentence_stream, min_count=30)
bigram_phraser = Phraser(bigram)

indieners = {indiener[-1].lower() for indiener in df['Indiener_persoon'].str.split() if indiener}
years = {word for doc in sentence_stream for word in doc if word.startswith('x') and not word.startswith('xin')}
manual_stopwords = {'faber', 'lacin', 'kroger', 'iv', 'beschikt', 'die', 'vaststelling', 'lid','vi', 'viii', 'iii', 'iv', 'ii', 'i', 'kamer', 'regering tevens', 'regering', 'gehoord','beraadslaging', 'overweegt', 'overwegende', "gehoord","regering","gesproken","horen","mondkapjes","spreken","regeringen","gesprekken","commissiedebat","discussies","discussie","hoort","regeringsbeleid","besprekingen","overheden","overheid","overheids","overheidsbeleid","gesprek","zeggen","dialoog","geluid","uitspreekt","bureaucratische","spreekt","openbare","audiovisuele","gehoor","bestuursakkoord","stil","minuten","parlement","parlementen","officieel","bureaucratie","democratie","officiele","batenanalyse","parlementair","begrotings","governance","democratische","budgetrecht","democratisch","kabinetsreactie","belastingplan","wetsvoorstellen","wetsvoorstel","thema","dicht","risicoanalyse","bestuursrecht","bestuursorganen","european","nederlandsche","nederlanden","eurogroep","nederlander","nederland","holland","hogescholen","no","kinder","po","nee","verzoek","verzoeken","gevraagde","gevraagd","aangevraagd","verzoekt","claims","euro","eurocommissaris","vergunning","instemming","educatieve","eisen","voorstel","aanvrager","afgewezen","toepassing","voorstelt","aanvragen","vereist","werkloosheid","energietoeslag","ongewenste","aanvraag","behoefte","for","voorgesteld","weigert","automatisch","wachten","binnenkort","graag","gat","dji","mond","non","imago","toch","makkelijk","gestuurd","onmogelijk","nu","pas","gemakkelijk","co","eenvoudig","dende","natuurlijk","vervolg","uitzetten","zeker","opriep","budgettaire","fiscale","gefinancierde","financieel","gefinancierd","duizenden","financiele","financierings","ongekend","whw","ja","nertsen","zo","inderdaad","gevaarlijk","budget","begrotingen","budgetten","begroting","overgebleven","uitgaven","kostenbesparing","subsidies","overtreden","overtreding","onverminderd","overtredingen","weren","obstakels","afgeschaft","thans","rulings","uitsluiting","subsidieren","over","reduceren","overschrijden","onwenselijk","hinder","to","preventief","overschrijding","beperking","overtuiging","voorkomen","bestedingen","promoten","sanctie","verhinderen","nr","je","leeftijdsgrens","kleinere","wel","nibud","niet","novi","verzamelt" ,"fatsoen" ,"imams" ,"imam" ,"minuut" ,"fantastisch" ,"keurig" ,"terechtgekomen" ,"bik" ,"verschijnt" ,"belem" ,"bel" ,"vermo" ,"zins" ,"ima" ,"zoe" ,"sappen" ,"verdien" ,"kundig" ,"straks" ,"viteit" ,"mondig" ,"mooi" ,"letten" ,"combi" ,"geweldig" ,"sak" ,"bene" ,"faso" ,"wijde" ,"hoor" ,"verrekend" ,"prachtige" ,"fao" ,"teneinde" ,"capten" ,"openlijk" ,"openhouden" ,"tege" ,"simpele" ,"tak" ,"simpel" ,"korrels" ,"hww" ,"geleden" ,"meteen" ,"volstaat" ,"lees" ,"weet" ,"ver" ,"eenvoudiger" ,"wetswijzigingen" ,"inmiddels" ,"hbo" ,"ziet" ,"makkelijker" ,"zitten" ,"snelle" ,"drukke" ,"snel" ,"iack" ,"gekeken" ,"kijkt" ,"vindt" ,"dak" ,"btw" ,"lijken" ,"ots" ,"kijken" ,"gemakkelijker" ,"geacht" ,"eenvoudige" ,"zodra" ,"abro" ,"verzamelen" ,"gelukt" ,"goed" ,"mooie" ,"tenzij" ,"hakken" ,"grondwettelijke" ,"korte" ,"spoedig" ,"nh" ,"laat" ,"stank" ,"daaronder" ,"kortst" ,"klaar" ,"krap","uitstekende" ,"ben" ,"begrijpen","daarover","leggen","opties","hierover","toegevoegd","aangetoond","dat","drie","veilig","daarmee","kelijk","beschreven","weten","mogen","hen","gegaan" }
stopwords = indieners | years | manual_stopwords


def bigram_stopword_preprocess(doc):
    sentence_stream = simple_preprocess(strip_tags(doc), deacc=True)
    sentence_stream = [word for word in sentence_stream if word not in stopwords]
    return sentence_stream
    # return bigram_phraser[sentence_stream]

before removal empty texts 39245
after removal empty texts 39244
39244


In [131]:
df['Kamer'].value_counts()

Kamer
Rutte III        12178
Rutte II         10761
Rutte IV          9007
Rutte I           4766
Balkenende IV     2532
Name: count, dtype: int64

In [132]:
# https://github.com/scikit-learn-contrib/hdbscan/issues/607

model = Top2Vec(documents, speed='deep-learn', embedding_model='distiluse-base-multilingual-cased',workers=8, min_count=10, tokenizer=bigram_stopword_preprocess, ngram_vocab=False)
model.get_num_topics()
if only_last_kamer:
    model.save("data/doc2vec_deep_bigram_enhanced_stopwords_last_kamer")
else:
    model.save("data/doc2vec_deep_bigram_enhanced_stopwords_all")

2023-08-07 12:43:29,190 - top2vec - INFO - Pre-processing documents for training


2023-08-07 12:43:36,319 - top2vec - INFO - Downloading distiluse-base-multilingual-cased model
2023-08-07 12:43:37,380 - top2vec - INFO - Creating joint document/word embedding
2023-08-07 13:14:37,197 - top2vec - INFO - Creating lower dimension embedding of documents
2023-08-07 13:14:59,269 - top2vec - INFO - Finding dense areas of documents
2023-08-07 13:15:02,063 - top2vec - INFO - Finding topics


## Reduce topic number to something more manageable

In [176]:
# For the manual analysis it's necessary to reduce the amount of topics, otherwise you will have 50 or so topics to investigate

df, documents = get_df(only_last_kamer)
if only_last_kamer:
    model = Top2Vec.load("data/doc2vec_deep_bigram_enhanced_stopwords_last_kamer")
else:
    model = Top2Vec.load("data/doc2vec_deep_bigram_enhanced_stopwords_all")

before removal empty texts 39245
after removal empty texts 39244
39244


In [177]:
topic_sizes, topic_nums = model.get_topic_sizes()
print(f'{len(topic_nums)} topics. In total there are {sum(topic_sizes)} documents. These are the amount of documents per topic:\n{topic_sizes}')

168 topics. In total there are 39244 documents. These are the amount of documents per topic:
[1956 1878 1540 1518  870  726  691  675  674  649  640  599  587  550
  533  521  520  502  501  477  470  440  440  437  432  427  401  382
  380  375  372  367  346  340  329  318  310  303  298  287  284  282
  281  278  277  264  262  262  248  247  247  243  243  242  236  234
  227  226  218  217  217  209  198  188  187  182  180  177  173  163
  161  159  158  155  154  150  142  138  138  138  137  136  136  134
  131  128  124  121  119  119  118  114  114  111  108  107  105  105
  101  100  100  100   98   98   97   96   94   94   93   91   90   89
   88   86   84   81   80   79   78   78   77   73   73   73   73   71
   71   71   70   69   69   67   67   65   63   61   61   60   60   59
   57   56   56   55   55   53   53   53   52   52   50   50   50   49
   49   48   48   45   45   38   38   38   38   36   33   31   29   26]


In [147]:
# helper to check what happens if you reduce topics
def get_reduced_topics(num_topics):
    print(f'performing reduction to {num_topics} topics')
    reduced_topics = model.hierarchical_topic_reduction(num_topics)
    topic_words, word_scores, topic_nums = model.get_topics(reduced=True)
    reduced_topics = tuple(tuple(sorted(t)) for t in reduced_topics)
    return reduced_topics, topic_words

def find_diff(reduced1, reduced2, reverse=False):
    # find the topics that where merged and return their index
    if not reverse:
        changed = set(reduced1) - set(reduced2)
        return [index for index, topic in enumerate(reduced1) if topic in changed]
    else:
        changed = set(reduced2) - set(reduced1)
        return [index for index, topic in enumerate(reduced2) if topic in changed]


def print_merge(large, small, num_words=50):
    print(f'\ninspecting difference from {len(large[0])} to {len(small[0])} topics')
    print('old topics')
    for i in find_diff(large[0],small[0]):
        print(large[1][i][:num_words])
    print('new topic')
    for i in find_diff(large[0],small[0], reverse=True):
        print(small[1][i][:num_words])

def find_optimal_num_topics(minn, maxx):
    start = minn
    stop = maxx
    reductions = {i: get_reduced_topics(i) for i in range(start, stop, -1)}
    for i in range(start, stop +1,-1):
        print(i)
        print_merge(reductions[i], reductions[i-1])
find_optimal_num_topics(17,12)

performing reduction to 17 topics
performing reduction to 16 topics
performing reduction to 15 topics
performing reduction to 14 topics
performing reduction to 13 topics
17

inspecting difference from 17 to 16 topics
old topics
['begrotingsjaar' 'agro' 'miljarden' 'miljoen' 'miljoenennota' 'miljard'
 'duizend' 'miljoenen' 'voltooien' 'rijksbegroting' 'apk' 'allang' 'kwh'
 'poms' 'kalenderjaar' 'mln' 'congo' 'tkkst' 'volstaan' 'belemme' 'imo'
 'enorm' 'super' 'hebt' 'aio' 'bak' 'mengen' 'ontzettend' 'boete' 'gekend'
 'drankenkartons' 'meireces' 'loo' 'vis' 'doch' 'mondkapje' 'wwz' 'zoveel'
 'atad' 'kei' 'gijs' 'massa' 'ww' 'ggo' 'ontwerpbesluit' 'sowieso'
 'vergemakkelijkt' 'verstaan' 'zoet' 'fijn']
['privacyregels' 'digitale' 'digital' 'cybersecurity' 'cyberveiligheid'
 'digitalisering' 'digitaal' 'cryptovaluta' 'openstaan' 'open' 'agro'
 'duizend' 'tweemaal' 'verstaan' 'drankenkartons' 'voltooien' 'illegaal'
 'moest' 'belemme' 'bezette' 'digitaliseren' 'winkeltijdenwet' 'klap'
 'ontze

In [178]:
# Make a choice to how many topics to reduce
num_topics = 17
reduced_topics = model.hierarchical_topic_reduction(num_topics)


In [149]:
topic_words, word_scores, topic_nums = model.get_topics(reduced=True)
'" ,"'.join(t for t in topic_words[0])


'rechtsstaat" ,"agro" ,"mededingingswet" ,"stakingsrecht" ,"voltooien" ,"duizend" ,"allang" ,"legale" ,"winkeltijdenwet" ,"ontzettend" ,"apk" ,"mijnbouwwet" ,"wettig" ,"belemme" ,"volstaan" ,"overgangsrecht" ,"super" ,"uitstekend" ,"legalisering" ,"kwh" ,"inburgeringswet" ,"zachte" ,"terecht" ,"hebt" ,"fijn" ,"gijs" ,"invoeringswet" ,"mengen" ,"teelt" ,"vorig" ,"congo" ,"vis" ,"boete" ,"gekend" ,"leerplichtwet" ,"dik" ,"verstaan" ,"enorm" ,"imo" ,"indachtig" ,"massa" ,"volstrekt" ,"sowieso" ,"zoveel" ,"miljarden" ,"tkkst" ,"miljoenennota" ,"teniet" ,"meireces" ,"loo'

In [167]:
topic_words, word_scores, topic_nums = model.get_topics(reduced=True)
for i, t in enumerate(topic_words):
    print(i, t)

0 ['begrotingsjaar' 'agro' 'miljarden' 'miljoen' 'miljoenennota' 'miljard'
 'duizend' 'miljoenen' 'voltooien' 'rijksbegroting' 'apk' 'allang' 'kwh'
 'poms' 'kalenderjaar' 'mln' 'congo' 'tkkst' 'volstaan' 'belemme' 'imo'
 'enorm' 'super' 'hebt' 'aio' 'bak' 'mengen' 'ontzettend' 'boete' 'gekend'
 'drankenkartons' 'meireces' 'loo' 'vis' 'doch' 'mondkapje' 'wwz' 'zoveel'
 'atad' 'kei' 'gijs' 'massa' 'ww' 'ggo' 'ontwerpbesluit' 'sowieso'
 'vergemakkelijkt' 'verstaan' 'zoet' 'fijn']
1 ['ziekenhuizen' 'verpleeghuizen' 'ziekenhuiszorg' 'verpleeghuis'
 'gezondheidszorg' 'ziekenhuis' 'medische' 'zorgstandaarden'
 'verpleegkundige' 'verpleeg' 'kliniek' 'medisch' 'klinieken'
 'zorgverzekeraar' 'zorgpersoneel' 'zorgkosten' 'zorgverzekering'
 'klinische' 'koortspatienten' 'ambulance' 'zorgautoriteit'
 'ambulancezorg' 'gezondheidsraad' 'klinisch' 'covidpatienten'
 'volksgezondheid' 'ambulances' 'kankerpatienten' 'coronapatienten'
 'huisartsenzorg' 'geneeskunde' 'patientgegevens' 'artsen'
 'diergenees

In [151]:
topic_words, word_scores, topic_nums = model.get_topics(reduced=False)
# for t in topic_words:
#     print(t)

# get some more stopwords to filter out
from collections import defaultdict
sums = defaultdict(int)

for t, s in zip(topic_words, word_scores):
    for word, score in zip(t,s):
        sums[word] += score + 1
    
scores = [(-score, word)  for word, score in sums.items()]
scores.sort()
'","'.join(s for w,s in scores if '_' not in s)

'migratiebeleid","rijksbegroting","politiesterkte","milieuwetgeving","immigratie","begrotingsbehan","begrotingsjaar","luchtvervuiling","ecoregelingen","migranten","begrotingssteun","buitengrenzen","oorlogsrecht","milieubelasting","afvalbeheerplan","energiebeleid","wereldoorlog","jaarverslag","energiecrisis","milieubeheer","islamisering","jeugdzorgaanbod","budgetplafonds","wapenexport","faunabeheer","verkeersregels","onderwijsbudget","moslimhaat","jihadistische","jihadisme","noordzeeakkoord","rechtsstaat","mededingingswet","paspoort","energiekosten","vluchtelingen","energieprijzen","vluchteling","ecoregeling","islamitisch","islamitische","jihadisten","politiewet","moslims","grondwet","energiemarkt","minderjarigen","veiligheidsraad","rijksuitgaven","dierenpolitie","bouwkosten","kernenergie","landbouwbeleid","energieakkoord","kolenbelasting","energielabels","landbouwakkoord","treinverkeer","milieuregels","spoorwegpolitie","vleessector","islam","schapen","tuinbouwsector","jeugdhulp","immig

In [152]:
topic_words, word_scores, topic_scores, topic_nums = model.search_topics(keywords= ['politie'] , num_topics=4, reduced=True)


In [182]:
topics = {
  0: 'Algemene zaken',  
  1: 'Zorg',  
  2: 'Gemeente',  
  3: 'Onderwijs',  
  4: 'Werk & Pensioen',  
  5: 'Milieu & Klimaat',  
  6: 'Landbouw & Dierenwelzijn',  
  7: 'Rechtsstaat',  
  8: 'Immigratie',  
  9: 'Vervoer',  
  10: 'Overig',  
  11: 'Financien',  
  12: 'Justitie',  
  13: 'Kabinetsbeleid',  
  14: 'Jeugd',  
  15: 'Energie',
  16: 'Digitalisering'
  }
# topics = {
#   0: 'Onderwijs',  
#   1: 'Buitenlandse zaken',  
#   2: 'Algemene zaken',  
#   3: 'Natuur & gaswinning',  
#   4: 'Landbouw & dierenwelzijn',  
#   5: 'Zorg',  
#   6: 'Sociale zaken',  
#   7: 'Justitie',  
#   8: 'Pensioenstelsel',  
#   9: 'Europese Unie',  
#   10: 'Klimaat & energie',  
#   11: 'Milieu & regelgeving',  
#   12: 'Zorg',  
#   13: 'Openbaar vervoer',  
#   14: 'Financiele sector',  
#   15: 'Wonen'
#   }
topic_words, word_scores, topic_nums = model.get_topics()
# topics = {i:' ,'.join(topic_words[i][:3]) for i in range(len(topic_words))}
doc_ids = list(range(len(documents)))
topic_nums, topic_score, topic_words, word_scores = model.get_documents_topics(doc_ids,reduced=True)
topic_names = [topics[t] for t in topic_nums]
assert len(topic_nums) == len(df)
df['Topic'] = topic_names
topic_nums, topic_score, topic_words, word_scores = model.get_documents_topics(doc_ids,reduced=False)
df['Topic_initial'] = topic_nums
df['Topic_score'] = topic_score
# df.sort_values(['Topic_initial', 'Topic_score'], ascending=False, inplace=True)


In [181]:
topics

{0: 'onderwijsniveau ,schoolverlaten ,schoolkosten',
 1: 'ziekenhuizen ,verpleeghuizen ,verpleeghuis',
 2: 'bouwkosten ,gemeentelijk ,woningbouw',
 3: 'pensioenstelsel ,pensioenopbouw ,pensioenakkoord',
 4: 'amnesty ,overtreders ,referendum',
 5: 'januari ,november ,april',
 6: 'landbouwsector ,landbouw ,landbouwbeleid',
 7: 'immigratie ,migranten ,migratiebeleid',
 8: 'miljoenennota ,miljoenen ,miljoen',
 9: 'begrotingsjaar ,kabinetsbesluit ,veranderd',
 10: 'jeugdzorgaanbod ,jeugdautoriteit ,jeugdzorg',
 11: 'belastingen ,tax ,belastingdienst',
 12: 'treinverkeer ,spoorvervoer ,treinverbinding',
 13: 'dierenwelzijns ,dierenwelzijn ,dierenpolitie',
 14: 'minister ,ministerraad ,ministeriele',
 15: 'waterbeheer ,waterveiligheid ,hervatten',
 16: 'kostendekkende ,kostendekkend ,kostenpost',
 17: 'kleinbedrijf ,kleine ,kleinschalig',
 18: 'gevangenisstraf ,strafrechtketen ,strafrechtelijk',
 19: 'vaccinatiegraad ,vaccinatie ,vaccineren',
 20: 'innovatiebox ,innovatiebeleid ,kenniseconomi

In [183]:
df['Topic'].unique()

array(['Immigratie', 'Zorg', 'Rechtsstaat', 'Gemeente', 'Overig',
       'Financien', 'Algemene zaken', 'Kabinetsbeleid', 'Jeugd',
       'Digitalisering', 'Landbouw & Dierenwelzijn', 'Justitie',
       'Werk & Pensioen', 'Vervoer', 'Milieu & Klimaat', 'Energie',
       'Onderwijs'], dtype=object)

## Optional add climate deepdive

In [170]:
reduced_topics

[[54, 41, 161, 133, 74, 22, 5, 8, 33, 23, 11, 16, 84, 115, 122, 29, 9],
 [148, 19, 111, 152, 56, 165, 1],
 [32, 40, 129, 108, 77, 2],
 [114, 156, 0],
 [43, 159, 3],
 [49, 39, 150, 78, 15, 71, 26, 46, 57, 95, 104, 28],
 [68,
  160,
  92,
  100,
  131,
  13,
  136,
  6,
  149,
  134,
  117,
  73,
  107,
  116,
  62,
  55,
  70,
  127,
  44],
 [103, 125, 140, 31, 24],
 [154,
  167,
  81,
  126,
  155,
  141,
  101,
  87,
  76,
  64,
  106,
  63,
  162,
  60,
  99,
  164,
  158,
  128,
  144,
  166,
  7],
 [145, 65, 139, 163, 35, 123, 135, 52, 90, 113, 69, 98, 110, 12],
 [75, 89, 96, 132, 130, 147, 4],
 [112, 138, 20, 118, 48, 27, 88, 61, 94, 30],
 [51, 83, 79, 142, 58, 137, 21, 53, 18],
 [37, 102, 34, 66, 14],
 [67, 119, 10, 91, 72, 97, 47, 80, 42],
 [121, 25, 105, 93, 50, 59, 120, 157, 109, 38],
 [45, 143, 153, 36, 86, 151, 82, 124, 146, 85, 17]]

In [184]:
topic_sizes, topic_nums = model.get_topic_sizes(reduced=False)
topic_words, word_scores, topic_nums = model.get_topics(reduced=False)
climate_idx = [topic_nums for t in topic_nums if t in reduced_topics[5]]
climate_idx


[array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
         26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
         39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
         52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
         65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
         78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
         91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
        104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
        117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
        130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
        143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
        156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167]),
 array([  0,   1,   2,   3,   4,   5,   6,   7,   8,  

In [185]:
topic_sizes, topic_nums = model.get_topic_sizes(reduced=False)
topic_words, word_scores, topic_nums = model.get_topics(reduced=False)
climate_idx = [topic_nums for t in topic_nums if t in reduced_topics[10]]
climate_subtopics = {128: 'Afhankelijkheid fossiele brandstoffen',
 165: 'CO2 reductie',
 5: 'Voldoen aan Parijs',
 141: 'Electriciteit',
 126: 'Groningen',
 205: 'Zonnepanelen',
 96: 'Energierekening betalen',
 236: 'Biomassa',
 105: 'Kolencentrales',
 239: 'Windturbines - overlast',
 29: 'Windturbines - subsidie'}

In [186]:
df['Klimaat'] = df.loc[df['Topic']=='Klimaat & energie', 'Topic_initial'].map(climate_subtopics)

In [187]:
save(df, 'df_including_topics_full.pickle')
save(model, 'doc2vec_deep_bigram_enhanced_stopwords_rutteIV_reduced')

# Prepare slimmed down versions for production

In [157]:
df = load("df_including_topics_full.pickle")
# df, documents = get_df(only_last_kamer)
# df['BesluitTekst'] = df['BesluitTekst'].str.replace('.','')

print(len(df))

if only_last_kamer:
    model = Top2Vec.load("data/doc2vec_deep_bigram_enhanced_stopwords_last_kamer")
else:
    model = Top2Vec.load("data/doc2vec_deep_bigram_enhanced_stopwords_all")
    
doc_ids = list(range(len(df)))
# don't sort the df before this operation
topic_nums, topic_score, topic_words, word_scores = model.get_documents_topics(doc_ids,reduced=False)
topics = [', '.join(topic_words[i][:3]) for i in range(len(topic_words))]

df['Topic_initial'] = topics
df['Topic_score'] = topic_score
df.sort_values(['Topic_initial', 'Topic_score'], ascending=False, inplace=True)
# important do this only after all row filters have been set
df = df[df['Kamer']== last_kamer]
df.index = df['Index']
print(len(df))
stem_column = [c for c in df.columns if 'Stem_' in c and c != 'Stem_persoon']
required_cols = ['Kamer', 'Jaar','Indienende_partij', 'BesluitSoort','BesluitTekst','Topic_initial', 'Topic_score','Indienende_persoon_partij','Partijen_Voor', 'Partijen_Tegen', 'Text']

# streamlit has problems with category type: 
# https://github.com/streamlit/streamlit/issues/47
for col in stem_column + required_cols[:-4]:
    df[col] = df[col].astype('category')

df = df[stem_column + required_cols]
save(df, "df_production.pickle")


39244
9007


In [158]:
def delete_documents(self, doc_ids):
    """
    Delete documents from current model.
    Warning: If document ids were not used in original model, deleting
    documents will change the indexes and therefore doc_ids.
    The documents will be deleted from the current model without changing
    existing document, word and topic vectors. Topic sizes will be updated.
    If deleting a large quantity of documents relative to the current model
    size a new model should be trained for best results.
    Parameters
    ----------
    doc_ids: List of str, int
        A unique value per document that is used for referring to documents
        in search results.
    """
    # make sure documents exist
    self._validate_doc_ids(doc_ids, doc_ids_neg=[])

    # update index
    if self.documents_indexed:
        # delete doc_ids from index
        index_ids = [self.doc_id2index_id(doc_id) for doc_id in doc_ids]
        for index_id in index_ids:
            self.document_index.mark_deleted(index_id)
        # update index_id and doc_ids
        for doc_id in doc_ids:
            self.doc_id2index_id.pop(doc_id)
        for index_id in index_ids:
            self.index_id2doc_id.pop(index_id)

    # get document indexes from ids
    doc_indexes = self._get_document_indexes(doc_ids)

    # delete documents
    if self.documents is not None:
        self.documents = np.delete(self.documents, doc_indexes, 0)

    # delete document ids
    if self.document_ids is not None:
        for doc_id in doc_ids:
            self.doc_id2index.pop(doc_id)
        keys = list(self.doc_id2index.keys())
        self.document_ids = np.array(keys)
        values = list(range(0, len(self.doc_id2index.values())))
        self.doc_id2index = dict(zip(keys, values))

    # delete document vectors
    self._set_document_vectors(np.delete(self._get_document_vectors(norm=False), doc_indexes, 0))

    if self.embedding_model == 'doc2vec':
        num_docs = len(doc_indexes)
        self.model.docvecs.count -= num_docs
        self.model.docvecs.max_rawint -= num_docs
        self.model.docvecs.vectors_docs_norm = None
        self.model.docvecs.init_sims()

    # update topics
    # self._unassign_documents_from_topic(doc_indexes, hierarchy=False)

    if self.hierarchy is not None:
        self._unassign_documents_from_topic(doc_indexes, hierarchy=True)
model = Top2Vec.load("data/doc2vec_deep_bigram_enhanced_stopwords_rutteIV_reduced")

# model.delete_documents = delete_documents
df = load("df_including_topics_full.pickle")


In [159]:
topic_sizes, topic_nums = model.get_topic_sizes()
print(f'{len(topic_nums)} topics. In total there are {sum(topic_sizes)} documents. These are the amount of documents per topic:\n{topic_sizes}')

topic_words, word_scores, topic_nums = model.get_topics(reduced=True)
    
for num, size, word in zip(topic_nums, topic_sizes, topic_words):
    print(num, word[:3], size)
print(sum(topic_sizes))

168 topics. In total there are 39244 documents. These are the amount of documents per topic:
[1956 1878 1540 1518  870  726  691  675  674  649  640  599  587  550
  533  521  520  502  501  477  470  440  440  437  432  427  401  382
  380  375  372  367  346  340  329  318  310  303  298  287  284  282
  281  278  277  264  262  262  248  247  247  243  243  242  236  234
  227  226  218  217  217  209  198  188  187  182  180  177  173  163
  161  159  158  155  154  150  142  138  138  138  137  136  136  134
  131  128  124  121  119  119  118  114  114  111  108  107  105  105
  101  100  100  100   98   98   97   96   94   94   93   91   90   89
   88   86   84   81   80   79   78   78   77   73   73   73   73   71
   71   71   70   69   69   67   67   65   63   61   61   60   60   59
   57   56   56   55   55   53   53   53   52   52   50   50   50   49
   49   48   48   45   45   38   38   38   38   36   33   31   29   26]
0 ['rechtsstaat' 'agro' 'mededingingswet'] 1956
1 ['bo

In [160]:
model.delete_documents(list(range(len(df))))
model.delete_documents = 1
model.save("data/doc2vec_production")

## Check app

In [30]:
df = load("df_production.pickle")
model = Top2Vec.load("data/doc2vec_production")
topic_words, word_scores, topic_scores, topic_nums = model.search_topics(keywords= ['politie'] , num_topics=4, reduced=True)
topic_nums

array([ 9, 11, 12,  8], dtype=int64)

In [31]:
topic_sizes, topic_nums = model.get_topic_sizes()
print(f'{len(topic_nums)} topics. In total there are {sum(topic_sizes)} documents. These are the amount of documents per topic:\n{topic_sizes}')

170 topics. In total there are 0 documents. These are the amount of documents per topic:
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [315]:
topic_words, word_scores, topic_scores, topic_nums = model.search_topics(keywords= ['politie'] , num_topics=4, reduced=True)
topic_nums

array([44, 23, 13, 15], dtype=int64)

In [122]:
topic_words, word_scores, topic_nums = model.get_topics(num_topics=47)


In [125]:
topic_words[45]

array(['politieagenten', 'politietop', 'politiemensen', 'politiewet',
       'politieacademie', 'polisaanbod', 'politie', 'polissen', 'cop',
       'zedenpolitie', 'overheidsbeleid', 'regeringsbeleid',
       'kabinetsbeleid', 'georganiseerde_criminaliteit',
       'openbaar_ministerie', 'asielbeleid', 'nationale_veiligheid',
       'bureaucratische', 'autoriteiten', 'landenbeleid', 'beleidskader',
       'rijksbeleid', 'bureaucratie', 'beleidsopties',
       'ministeriele_regeling', 'politici', 'loonbeleid', 'beleidsinzet',
       'administratieve_lasten', 'sanctiebeleid', 'decentrale_overheden',
       'politieke_partijen', 'politiek', 'politieke', 'beleids',
       'agenten', 'beleid', 'beleidskeuze', 'industriebeleid',
       'beleidskeuzes', 'preventiebeleid', 'beleidsnota', 'agentschap',
       'beleidsregels', 'bestuursorganen', 'beleidsmatige',
       'beleidsvrijheid', 'openbare_orde', 'nederlandse_zorgautoriteit',
       'beleidslijn'], dtype='<U28')

In [141]:
df = load("df_production.pickle")
len(df)

6017

In [144]:
df[(df['Topic_initial']=='verduurzamen') & (df['Kamer']== last_kamer)]['Text'].iloc[0]

'2\nTweede Kamer der Staten-Generaal\nVergaderjaar 2021–2022\n32 813 Kabinetsaanpak Klimaatbeleid\nNr. 1013 MOTIE VAN DE LEDEN BONTENBAL EN GRINWIS\nVoorgesteld 19 april 2022\nDe Kamer,\ngehoord de beraadslaging,\noverwegende dat de verduurzaming van de gebouwde omgeving de\nkomende jaren vooral gefocust moet zijn op het verduurzamen van de\nwarmtevoorziening in de gebouwde omgeving, resulterend in een\nreductie van het aardgasverbruik;\nverzoekt de regering om in het beleid voor de verduurzaming van de\ngebouwde omgeving ook een subdoelstelling voor aardgasreductie op te\nnemen,\nen gaat over tot de orde van de dag.\nBontenbal\nGrinwis\nkst-32813-1013ISSN\n0921 - 7371\n’s-Gravenhage 2022 Tweede Kamer, vergaderjaar 2021–2022, 32 813, nr. 1013'

In [None]:
df = load("df_including_topics_full.pickle")
print(len(df))
df['BesluitTekst'].value_counts()

6017


Aangenomen    3296
Verworpen     2721
Name: BesluitTekst, dtype: int64