In [260]:
from pathlib import Path
from top2vec import Top2Vec
import pickle
import numpy as np

from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import strip_tags

In [261]:
DATA_DIR = Path('./data')
def load(filename):
    """Unpickle and return the object stored at ``DATA_DIR / filename``.

    Parameters
    ----------
    filename : str or path-like
        Name of the pickle file, relative to ``DATA_DIR``.

    Returns
    -------
    object
        Whatever object was pickled into the file.

    Notes
    -----
    The file is opened in a ``with`` block so the handle is closed even when
    unpickling raises.
    SECURITY: ``pickle.load`` can execute arbitrary code while loading; only
    use this on files produced by this project.
    """
    with open(DATA_DIR / filename, "rb") as handle:
        return pickle.load(handle)
    
def save(data, filename):
    """Pickle ``data`` into ``DATA_DIR / filename`` with the highest protocol.

    An existing file with the same name is overwritten (mode ``'wb'``).
    """
    target = DATA_DIR / filename
    with target.open('wb') as out:
        pickle.dump(data, out, protocol=pickle.HIGHEST_PROTOCOL)

In [262]:
def add_voor_tegen_index(df):
    """Add vote summaries and a positional index to ``df`` (modified in place).

    Every column whose name contains ``'Stem_'`` (except ``'Stem_persoon'``)
    is treated as a vote column. Its label is the column name without its
    first five characters.

    Columns added
    -------------
    Partijen_Voor : list of str
        Labels of the vote columns whose value equals 1 for that row.
    Partijen_Tegen : list of str
        Labels of the vote columns whose value equals 0 for that row.
    Index : int
        Row position, ``0 .. len(df) - 1``.

    Side effects: prints ``len(df)``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    print(len(df))
    stem_column = [c for c in df.columns if 'Stem_' in c and c != 'Stem_persoon']
    # Derive the labels from the already-selected columns instead of
    # re-scanning df.columns with a duplicated condition.
    stem_column_adj = [c[5:] for c in stem_column]
    stem_array = df[stem_column].values.tolist()
    # An empty frame has no first row to inspect; only sanity-check the row
    # width when there is at least one row.
    if stem_array:
        assert len(stem_array[0]) == len(stem_column)
    voor = [[stem_column_adj[i] for i, stem in enumerate(motie) if stem == 1] for motie in stem_array]
    tegen = [[stem_column_adj[i] for i, stem in enumerate(motie) if stem == 0] for motie in stem_array]
    df['Partijen_Voor'] = voor
    df['Partijen_Tegen'] = tegen
    df['Index'] = list(range(len(df)))
    return df

In [263]:

# Load the pickled motions; the with-block closes the handle after reading.
# NOTE: pickle.load executes arbitrary code - only load files you trust.
with open("moties_for_llm.pickle", "rb") as file:
    df = pickle.load(file)
print('before removal empty texts',len(df))

# remove moties without text
mask = (df['Text']=='') | (df['Text'].isna())
df = df.loc[~mask]
print('after removal empty texts',len(df))

# .copy() gives an independent frame, so the column assignments below (and
# inside add_voor_tegen_index) do not write to a slice of the loaded frame.
df = df[df['Kabinet']=='Rutte IV'].copy()
print('after selecting latest cabinet', len(df))
df = add_voor_tegen_index(df)
documents = df['ClippedText'].values
# regex=False strips literal dots and avoids the FutureWarning about the
# changing default of `regex` in Series.str.replace.
df['BesluitTekst'] = df['BesluitTekst'].str.replace('.', '', regex=False)

before removal empty texts 6017
after removal empty texts 6017
after selecting latest cabinet 6017
6017


  df['BesluitTekst'] = df['BesluitTekst'].str.replace('.','')


## Model Training

In [227]:
# got the code for making the bigram part from https://lppier.github.io/
# Tokenise every document once to train the bigram detector.
sentence_stream = [simple_preprocess(strip_tags(doc), deacc=True) for doc in documents]
bigram = Phrases(sentence_stream, min_count=10)
bigram_phraser = Phraser(bigram)

# Last word (lower-cased) of every submitter name. dropna() is needed because
# .str.split() leaves missing names as NaN, which is truthy and would make
# indiener[-1] raise a TypeError; `if indiener` still skips empty splits.
indieners = {indiener[-1].lower() for indiener in df['Indiener_persoon'].dropna().str.split() if indiener}
# Every token that starts with 'x' but not with 'xin'.
# NOTE(review): the name suggests these encode years/numerals - confirm.
years = {word for doc in sentence_stream for word in doc if word.startswith('x') and not word.startswith('xin')}
# NOTE(review): 'regering tevens' contains a space; the tokenizer below filters
# token by token, so this entry presumably never matches - confirm and drop.
manual_stopwords = {'faber', 'lacin', 'kroger', 'iv', 'beschikt', 'die', 'vaststelling', 'lid', 'vi', 'viii', 'iii', 'ii', 'i', 'kamer', 'regering tevens', 'regering', 'gehoord', 'beraadslaging'}
stopwords = indieners | years | manual_stopwords
   
def bigram_stopword_preprocess(doc):
    """Tokenise ``doc``, drop stopwords, then apply the bigram phraser.

    Tags are stripped and accents removed during tokenisation. Tokens found
    in the module-level ``stopwords`` set are discarded before the
    module-level ``bigram_phraser`` is applied to what remains.
    """
    tokens = simple_preprocess(strip_tags(doc), deacc=True)
    kept = list(filter(lambda token: token not in stopwords, tokens))
    return bigram_phraser[kept]

In [228]:
# https://github.com/scikit-learn-contrib/hdbscan/issues/607
# Train the topic model on the motion texts with the custom tokenizer.
model = Top2Vec(
    documents,
    embedding_model='distiluse-base-multilingual-cased',
    tokenizer=bigram_stopword_preprocess,
    speed='deep-learn',
    min_count=5,
    ngram_vocab=False,
    workers=8,
)
model.get_num_topics()

# removed 

2023-08-04 11:32:29,496 - top2vec - INFO - Pre-processing documents for training


2023-08-04 11:32:31,006 - top2vec - INFO - Downloading distiluse-base-multilingual-cased model
2023-08-04 11:32:32,141 - top2vec - INFO - Creating joint document/word embedding


KeyboardInterrupt: 

In [157]:
# Persist the trained model; this path is read back with Top2Vec.load in the
# topic-reduction section.
model.save("data/doc2vec_deep_bigram_enhanced_stopwords_rutteIV")

## Reduce topic number to something more manageable

In [303]:

# Load the pickled motions; the with-block closes the handle after reading.
# NOTE: pickle.load executes arbitrary code - only load files you trust.
with open("moties_for_llm.pickle", "rb") as file:
    df = pickle.load(file)
print('before removal empty texts',len(df))

# remove moties without text
mask = (df['Text']=='') | (df['Text'].isna())
df = df.loc[~mask]
print('after removal empty texts',len(df))

# .copy() gives an independent frame, so the column assignments below (and
# inside add_voor_tegen_index) do not write to a slice of the loaded frame.
df = df[df['Kabinet']=='Rutte IV'].copy()
print('after selecting latest cabinet', len(df))
df = add_voor_tegen_index(df)
documents = df['ClippedText'].values
# regex=False strips literal dots and avoids the FutureWarning about the
# changing default of `regex` in Series.str.replace.
df['BesluitTekst'] = df['BesluitTekst'].str.replace('.', '', regex=False)
# Reload the trained model saved by the training section.
model = Top2Vec.load("data/doc2vec_deep_bigram_enhanced_stopwords_rutteIV")
model.get_num_topics()

before removal empty texts 6017
after removal empty texts 6017
after selecting latest cabinet 6017
6017


  df['BesluitTekst'] = df['BesluitTekst'].str.replace('.','')


47

In [304]:
# Sizes and ids of the model's topics as returned by get_topic_sizes().
topic_sizes, topic_nums = model.get_topic_sizes()

In [305]:
def get_reduced_topics(num_topics):
    """Reduce the global ``model`` to ``num_topics`` topics and describe the result.

    Returns a 2-tuple:
      * the reduction returned by ``hierarchical_topic_reduction``, with every
        group sorted and converted to a tuple (so groups are hashable and
        comparable between runs);
      * the topic words returned by ``model.get_topics(reduced=True)``.

    Prints a progress line before reducing.
    """
    print(f'performing reduction to {num_topics} topics')
    hierarchy = model.hierarchical_topic_reduction(num_topics)
    topic_words, _scores, _nums = model.get_topics(reduced=True)
    frozen_hierarchy = tuple(tuple(sorted(group)) for group in hierarchy)
    return frozen_hierarchy, topic_words

def find_diff(reduced1, reduced2, reverse=False):
    """Return the positions of topics present in one reduction but not the other.

    With ``reverse=False`` the result indexes into ``reduced1`` and lists the
    topics that are absent from ``reduced2`` (the topics that were merged).
    With ``reverse=True`` the roles are swapped: the result indexes into
    ``reduced2`` and lists the topics absent from ``reduced1``.
    Topics must be hashable.
    """
    source, other = (reduced2, reduced1) if reverse else (reduced1, reduced2)
    known = set(other)
    return [position for position, topic in enumerate(source) if topic not in known]


def print_merge(large, small, num_words=50):
    """Print which topics differ between two reductions.

    ``large`` and ``small`` are indexable pairs: element 0 holds the reduced
    topics, element 1 the matching topic words (the shape returned by
    ``get_reduced_topics``). For every topic of ``large`` missing from
    ``small`` the first ``num_words`` words are printed under 'old topics';
    for every topic of ``small`` missing from ``large`` they are printed
    under 'new topic'.
    """
    before_topics = large[0]
    after_topics = small[0]
    print(f'\ninspecting difference from {len(before_topics)} to {len(after_topics)} topics')

    print('old topics')
    for position in find_diff(before_topics, after_topics):
        print(large[1][position][:num_words])

    print('new topic')
    for position in find_diff(before_topics, after_topics, reverse=True):
        print(small[1][position][:num_words])

def find_optimal_num_topics(start=46, stop=35):
    """Print, step by step, which topics get merged while reducing the model.

    Parameters
    ----------
    start : int, default 46
        Largest number of topics to reduce to.
    stop : int, default 35
        Exclusive lower bound; the smallest reduction computed is ``stop + 1``.

    For every ``n`` from ``start`` down to ``stop + 2`` the difference between
    the ``n``-topic and the ``n - 1``-topic reduction is printed via
    ``print_merge``. The defaults reproduce the previously hard-coded range.

    NOTE(review): each reduction is run on the global ``model`` through
    ``get_reduced_topics``, so the model is presumably left in its last
    (smallest) reduced state afterwards - re-run the desired reduction.
    """
    reductions = {n: get_reduced_topics(n) for n in range(start, stop, -1)}
    # Stop one step early: every iteration also looks up the n - 1 reduction.
    for n in range(start, stop + 1, -1):
        print(n)
        print_merge(reductions[n], reductions[n - 1])
# find_optimal_num_topics()

In [306]:
# Choose how many topics the model should be reduced to and run the reduction.
# reduced_topics holds the value returned by hierarchical_topic_reduction.
num_topics = 46
reduced_topics = model.hierarchical_topic_reduction(num_topics)


In [307]:
# Fetch words, word scores and ids of the reduced topics.
# (The topic-size lookup below is disabled.)
# topic_sizes, topic_nums = model.get_topic_sizes(reduced=True)
topic_words, word_scores, topic_nums = model.get_topics(reduced=True)

In [308]:
# Sanity check: look up the 4 reduced topics returned for the keyword 'politie'.
# Note that this rebinds topic_words / word_scores / topic_nums.
topic_words, word_scores, topic_scores, topic_nums = model.search_topics(keywords= ['politie'] , num_topics=4, reduced=True)
topic_nums

array([20, 23, 36, 38], dtype=int64)

In [309]:
# Number of topics returned by the keyword search above.
len(topic_nums)

4

In [310]:
# Hand-written labels, kept for reference only. They cover topic ids 0-15 and
# were previously assigned to `topics` and then immediately overwritten, so
# they never reached the dataframe.
manual_topic_labels = {
  0: 'Onderwijs',
  1: 'Buitenlandse zaken',
  2: 'Algemene zaken',
  3: 'Natuur & gaswinning',
  4: 'Landbouw & dierenwelzijn',
  5: 'Zorg',
  6: 'Sociale zaken',
  7: 'Justitie',
  8: 'Pensioenstelsel',
  9: 'Europese Unie',
  10: 'Klimaat & energie',
  11: 'Milieu & regelgeving',
  12: 'Zorg',
  13: 'Openbaar vervoer',
  14: 'Financiele sector',
  15: 'Wonen'
  }
# Name every *reduced* topic after its first three topic words. The names must
# come from the reduced topics because they are indexed below with the reduced
# topic numbers; the unreduced get_topics() result is numbered differently.
topic_words, word_scores, topic_nums = model.get_topics(reduced=True)
topics = {i:' '.join(topic_words[i][:3]) for i in range(len(topic_words))}
# Documents are addressed by position, so df must be in training order.
doc_ids = list(range(len(documents)))
topic_nums, topic_score, topic_words, word_scores = model.get_documents_topics(doc_ids,reduced=True)
topic_names = [topics[t] for t in topic_nums]
assert len(topic_nums) == len(df)
df['Topic'] = topic_names
# Also store the unreduced topic number and score per document.
topic_nums, topic_score, topic_words, word_scores = model.get_documents_topics(doc_ids,reduced=False)
df['Topic_initial'] = topic_nums
df['Topic_score'] = topic_score
# df.sort_values(['Topic_initial', 'Topic_score'], ascending=False, inplace=True)


## Optional: add climate deep dive

In [48]:
topic_sizes, topic_nums = model.get_topic_sizes(reduced=False)
topic_words, word_scores, topic_nums = model.get_topics(reduced=False)
# Unreduced topic ids that belong to reduced topic 10. Collect the matching
# id `t`; the previous version appended the whole `topic_nums` array for
# every match.
climate_idx = [t for t in topic_nums if t in reduced_topics[10]]
# Hand-written labels keyed by unreduced topic id.
# NOTE(review): these ids come from a specific model run - confirm they still
# match the currently loaded model.
climate_subtopics = {128: 'Afhankelijkheid fossiele brandstoffen',
 165: 'CO2 reductie',
 5: 'Voldoen aan Parijs',
 141: 'Electriciteit',
 126: 'Groningen',
 205: 'Zonnepanelen',
 96: 'Energierekening betalen',
 236: 'Biomassa',
 105: 'Kolencentrales',
 239: 'Windturbines - overlast',
 29: 'Windturbines - subsidie'}

In [49]:
# Map the unreduced topic id of rows whose Topic equals 'Klimaat & energie'
# to a climate sub-topic label.
# NOTE(review): this only selects rows if df['Topic'] actually contains the
# label 'Klimaat & energie' - confirm against how Topic is filled.
df['Klimaat'] = df.loc[df['Topic']=='Klimaat & energie', 'Topic_initial'].map(climate_subtopics)

In [311]:
# Store a 1000-row sample and the full dataframe including topic columns.
save(df[:1000], 'df_including_topics.pickle')
save(df, 'df_including_topics_full.pickle')
# Persist the model with Top2Vec's own save(), the counterpart of the
# Top2Vec.load() calls that read this exact path later on, and consistent
# with the other model saves in this notebook (previously a raw pickle dump).
model.save("data/doc2vec_deep_bigram_enhanced_stopwords_rutteIV_reduced")


In [167]:
# Row count of the current dataframe.
len(df)

6017

## Prepare slimmed-down versions for production

In [312]:
df = load("df_including_topics_full.pickle")
# regex=False strips literal dots and avoids the FutureWarning about the
# changing default of `regex` in Series.str.replace.
df['BesluitTekst'] = df['BesluitTekst'].str.replace('.', '', regex=False)

print(len(df))

model = Top2Vec.load("data/doc2vec_deep_bigram_enhanced_stopwords_rutteIV_reduced")
reduced_topics = model.hierarchical_topic_reduction(10)

doc_ids = list(range(len(df)))
# don't sort the df before this operation: documents are addressed by row position
topic_nums, topic_score, topic_words, word_scores = model.get_documents_topics(doc_ids,reduced=True)
# Label every document with the first word of its reduced topic.
topics = [t[0] for t in topic_words]

# NOTE: Topic_initial is overwritten here with the topic's first word.
df['Topic_initial'] = topics
df['Topic_score'] = topic_score
# Reassign instead of inplace=True so re-running the cell stays predictable.
df = df.sort_values(['Topic_initial', 'Topic_score'], ascending=False)

# .copy() so the index assignment below works on an independent frame.
df = df[df['Kamer']=='Rutte IV'].copy()
print(len(df))
# important: do this only after all row filters have been set
df.index = df['Index']

stem_column = [c for c in df.columns if 'Stem_' in c and c != 'Stem_persoon']
required_cols = ['Kamer', 'Jaar','Indienende_partij', 'BesluitSoort','BesluitTekst','Topic_initial', 'Topic_score','Indienende_persoon_partij','Partijen_Voor', 'Partijen_Tegen', 'Text']

# streamlit has problems with category type:
# https://github.com/streamlit/streamlit/issues/47
# for col in stem_column + required_cols[:-4]:
#     df[col] = df[col].astype('category')

# Keep only the vote columns and the columns listed in required_cols.
df = df[stem_column + required_cols]
save(df, "df_production.pickle")


  df['BesluitTekst'] = df['BesluitTekst'].str.replace('.','')


6017
6017


In [313]:

# Sanity check: look up the 4 reduced topics returned for the keyword 'politie'.
topic_words, word_scores, topic_scores, topic_nums = model.search_topics(keywords= ['politie'] , num_topics=4, reduced=True)
topic_nums

array([8, 7, 6, 9], dtype=int64)

In [314]:
def delete_documents(self, doc_ids):
    """
    Delete documents from current model.

    Warning: If document ids were not used in original model, deleting
    documents will change the indexes and therefore doc_ids.

    The documents will be deleted from the current model without changing
    existing document, word and topic vectors.
    If deleting a large quantity of documents relative to the current model
    size a new model should be trained for best results.

    In this copy the non-hierarchical topic unassignment is disabled
    (commented out below); only the topic hierarchy, when present, is
    updated.

    Parameters
    ----------
    self: model object
        Model whose documents are removed; this is a plain function that
        expects the model as first argument.
    doc_ids: List of str, int
        A unique value per document that is used for referring to documents
        in search results.
    """
    # make sure documents exist
    self._validate_doc_ids(doc_ids, doc_ids_neg=[])

    # update index
    if self.documents_indexed:
        # delete doc_ids from index
        # doc_id2index_id is a mapping (it is .pop()-ed below), so it has to
        # be subscripted; calling it raised a TypeError.
        index_ids = [self.doc_id2index_id[doc_id] for doc_id in doc_ids]
        for index_id in index_ids:
            self.document_index.mark_deleted(index_id)
        # update index_id and doc_ids
        for doc_id in doc_ids:
            self.doc_id2index_id.pop(doc_id)
        for index_id in index_ids:
            self.index_id2doc_id.pop(index_id)

    # get document indexes from ids
    doc_indexes = self._get_document_indexes(doc_ids)

    # delete documents
    if self.documents is not None:
        self.documents = np.delete(self.documents, doc_indexes, 0)

    # delete document ids
    if self.document_ids is not None:
        for doc_id in doc_ids:
            self.doc_id2index.pop(doc_id)
        # renumber the remaining ids 0..n-1 in their existing order
        keys = list(self.doc_id2index.keys())
        self.document_ids = np.array(keys)
        values = list(range(0, len(self.doc_id2index.values())))
        self.doc_id2index = dict(zip(keys, values))

    # delete document vectors
    self._set_document_vectors(np.delete(self._get_document_vectors(norm=False), doc_indexes, 0))

    if self.embedding_model == 'doc2vec':
        num_docs = len(doc_indexes)
        self.model.docvecs.count -= num_docs
        self.model.docvecs.max_rawint -= num_docs
        self.model.docvecs.vectors_docs_norm = None
        self.model.docvecs.init_sims()

    # update topics
    # self._unassign_documents_from_topic(doc_indexes, hierarchy=False)

    if self.hierarchy is not None:
        self._unassign_documents_from_topic(doc_indexes, hierarchy=True)
model = Top2Vec.load("data/doc2vec_deep_bigram_enhanced_stopwords_rutteIV_reduced")

# model.delete_documents = delete_documents
df = load("df_including_topics_full.pickle")
# Remove every document (addressed by position 0..len(df)-1) from the model
# before saving the production copy.
model.delete_documents(list(range(len(df))))
# Drop any instance-level override of delete_documents before saving.
# The previous `model.delete_documents = 1` shadowed the method with an int,
# which was then stored in the saved model and made the method uncallable
# on every model loaded from it.
model.__dict__.pop("delete_documents", None)
model.save("data/doc2vec_production")

In [315]:
# Sanity check on the in-memory model after deleting its documents.
topic_words, word_scores, topic_scores, topic_nums = model.search_topics(keywords= ['politie'] , num_topics=4, reduced=True)
topic_nums

array([44, 23, 13, 15], dtype=int64)

In [316]:
# Reload both production artifacts from disk and repeat the keyword search,
# to check the saved model against the in-memory one.
df = load("df_production.pickle")
model = Top2Vec.load("data/doc2vec_production")
topic_words, word_scores, topic_scores, topic_nums = model.search_topics(keywords= ['politie'] , num_topics=4, reduced=True)
topic_nums

array([44, 23, 13, 15], dtype=int64)

In [122]:
# Fetch words, scores and ids for 47 (unreduced) topics.
topic_words, word_scores, topic_nums = model.get_topics(num_topics=47)


In [125]:
# Show the words of the topic at position 45.
topic_words[45]

array(['politieagenten', 'politietop', 'politiemensen', 'politiewet',
       'politieacademie', 'polisaanbod', 'politie', 'polissen', 'cop',
       'zedenpolitie', 'overheidsbeleid', 'regeringsbeleid',
       'kabinetsbeleid', 'georganiseerde_criminaliteit',
       'openbaar_ministerie', 'asielbeleid', 'nationale_veiligheid',
       'bureaucratische', 'autoriteiten', 'landenbeleid', 'beleidskader',
       'rijksbeleid', 'bureaucratie', 'beleidsopties',
       'ministeriele_regeling', 'politici', 'loonbeleid', 'beleidsinzet',
       'administratieve_lasten', 'sanctiebeleid', 'decentrale_overheden',
       'politieke_partijen', 'politiek', 'politieke', 'beleids',
       'agenten', 'beleid', 'beleidskeuze', 'industriebeleid',
       'beleidskeuzes', 'preventiebeleid', 'beleidsnota', 'agentschap',
       'beleidsregels', 'bestuursorganen', 'beleidsmatige',
       'beleidsvrijheid', 'openbare_orde', 'nederlandse_zorgautoriteit',
       'beleidslijn'], dtype='<U28')

In [141]:
# Reload the production dataframe and show its row count.
df = load("df_production.pickle")
len(df)

6017

In [144]:
# Show the text of the first row labelled 'verduurzamen' within 'Rutte IV'.
df[(df['Topic_initial']=='verduurzamen') & (df['Kamer']=='Rutte IV')]['Text'].iloc[0]

'2\nTweede Kamer der Staten-Generaal\nVergaderjaar 2021–2022\n32 813 Kabinetsaanpak Klimaatbeleid\nNr. 1013 MOTIE VAN DE LEDEN BONTENBAL EN GRINWIS\nVoorgesteld 19 april 2022\nDe Kamer,\ngehoord de beraadslaging,\noverwegende dat de verduurzaming van de gebouwde omgeving de\nkomende jaren vooral gefocust moet zijn op het verduurzamen van de\nwarmtevoorziening in de gebouwde omgeving, resulterend in een\nreductie van het aardgasverbruik;\nverzoekt de regering om in het beleid voor de verduurzaming van de\ngebouwde omgeving ook een subdoelstelling voor aardgasreductie op te\nnemen,\nen gaat over tot de orde van de dag.\nBontenbal\nGrinwis\nkst-32813-1013ISSN\n0921 - 7371\n’s-Gravenhage 2022 Tweede Kamer, vergaderjaar 2021–2022, 32 813, nr. 1013'

In [None]:
# Reload the full dataframe and count the values of BesluitTekst.
df = load("df_including_topics_full.pickle")
print(len(df))
df['BesluitTekst'].value_counts()

6017


Aangenomen    3296
Verworpen     2721
Name: BesluitTekst, dtype: int64

## Backup

In [None]:
# Index values used by the production dataframe.
necessary = load("df_production.pickle")
necessary_idx = set(necessary.index)

# 'Index' values of the full dataframe.
full = load("df_including_topics_full.pickle")
full_idx = set(full['Index'])

# Values present in the full set but not in production.
redundant_idx = full_idx - necessary_idx
# Holds only when every production index also occurs in the full set.
assert len(necessary_idx) + len(redundant_idx) == len(full_idx)

In [None]:
# Inspect the topic sizes of the current model.
model.get_topic_sizes()

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0], dtype=int64),
 array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
  

In [None]:
# Scratch: load a model from a different path than the one used above and
# fetch 100 documents for topic 3. Note that this rebinds `documents`.
model = Top2Vec.load("data/doc2vec_deep_bigram_enhanced_stopwords")
documents, document_scores, document_ids = model.search_documents_by_topic(topic_num =3, num_docs= 100)
document_ids[:10]

array([22728,  9991,  5958,  2969,  6021,  8364, 19787,  8385,  8357,
       16140])

In [None]:
# 'Index' values of rows whose Topic_initial is 3 and that are not in production.
to_remove_3 = full[(full['Topic_initial']==3)&(full['Index'].isin(redundant_idx))]['Index']

In [None]:
# Number of rows selected for removal.
len(list(to_remove_3))

278

In [None]:
# Scratch: compare topic sizes before and after deleting the selected documents.
model = Top2Vec.load("data/doc2vec_deep_bigram_enhanced_stopwords")
print(model.get_topic_sizes()[0])
model.delete_documents(list(to_remove_3))
print(model.get_topic_sizes()[0])

[531 458 445 404 401 350 347 333 326 312 309 308 301 287 285 278 276 273
 270 265 263 257 255 240 236 230 230 228 227 222 220 211 205 205 203 199
 197 195 193 192 190 182 181 180 175 174 171 169 166 165 164 163 161 159
 159 159 158 156 155 154 154 151 151 150 150 148 147 143 141 141 136 134
 133 133 132 131 129 127 126 125 124 124 122 121 120 118 118 118 117 116
 114 113 112 110 109 108 108 107 107 105 105 103 103 103 102 100  99  98
  98  97  96  96  96  96  96  96  95  95  95  94  93  91  91  91  89  88
  88  88  87  87  86  86  86  85  85  84  84  84  83  83  81  81  81  81
  80  80  80  80  79  79  79  79  79  79  78  77  76  75  75  75  75  74
  73  72  72  71  71  70  70  69  69  68  68  68  67  67  67  67  65  65
  65  64  63  62  62  61  60  60  59  59  58  58  57  57  57  56  56  56
  56  55  55  55  54  54  53  52  52  52  52  52  50  50  49  48  47  47
  46  46  46  45  45  45  45  45  45  44  44  43  43  42  41  41  40  40
  39  38  38  38  37  36  35  34  34  33  33  32  2

In [None]:
# Scratch: request 404 documents for topic 3 from the current model, without
# returning the document texts. (The reload below is disabled.)
# model = Top2Vec.load("data/doc2vec_deep_bigram_enhanced_stopwords")
document_scores, document_ids = model.search_documents_by_topic(topic_num =3, num_docs= 404,return_documents=False)


ValueError: Invalid number of documents: original topic 3 only has 0 documents.

In [None]:
# Scratch: sum of all topic sizes of the freshly loaded model.
model = Top2Vec.load("data/doc2vec_deep_bigram_enhanced_stopwords")
sums, _ = model.get_topic_sizes()
sum(sums)

29484

In [None]:
# Show the document at position 10.
documents[10]

'2\nTweede Kamer der Staten-Generaal\nVergaderjaar 2019–2020 \n35 420 Noodpakket banen en economie \nNr. 66   MOTIE VAN DE LEDEN BAUDET EN VAN HAGA \nVoorgesteld 28 mei 2020 \nDe Kamer, \ngehoord de beraadslaging, \nconstaterende dat in 2019 de Klimaatwet in werking is getreden; \noverwegende dat de kosten van het Nederlandse klimaatbeleid naar \nverwachting zullen oplopen tot in totaal 1.000 miljard euro; \noverwegende dat het Nederlandse klimaatbeleid, zelfs als de theorie over \nopwarming van de aarde door toedoen van de mens zou kloppen, die \nopwarming met hoogstens 0,00007 graden zou beperken; \noverwegende dat de kosten van het klimaatbeleid in geen enkele \nverhouding staan tot de opbrengsten; \nroept de regering op, om het klimaatbeleid te staken en het geld dat \nhierdoor bespaard wordt in te zetten om de klappen van de coronacrisis \nop te vangen, \nen gaat over tot de orde van de dag. \nBaudet \nVan Haga\n \n \n \n \nkst-35420-66\nISSN 0921 - 7371\n’s-Gravenhage 2020 Tweede

In [None]:
# Number of production rows whose Topic_initial equals 3.
len(necessary[necessary['Topic_initial']==3])

126

In [None]:
# Reload the production dataframe.
df = load('df_production.pickle')

In [None]:
# Distinct values of the 'Jaar' column.
df['Jaar'].unique()

array([2020, 2019, 2018, 2017], dtype=int64)

In [None]:
# Scratch: bare list literal of year strings; it is not assigned to anything.
['2017', '2018', '2019', '2020']