In [1]:
from pathlib import Path
from top2vec import Top2Vec
import pickle
import numpy as np

from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import strip_tags

In [2]:
DATA_DIR = Path('./data')
def load(filename):
    """Unpickle and return the object stored at ``DATA_DIR/filename``.

    NOTE: unpickling can execute arbitrary code; only use this on files
    that were produced by this project.
    """
    # The context manager closes the handle even when unpickling fails;
    # a bare open() would keep it open until garbage collection.
    with open(DATA_DIR/filename, "rb") as handle:
        return pickle.load(handle)
    
def save(data, filename):
    """Pickle ``data`` to ``DATA_DIR/filename`` with the highest pickle protocol."""
    target = DATA_DIR / filename
    with open(target, 'wb') as out:
        pickle.dump(data, out, protocol=pickle.HIGHEST_PROTOCOL)

In [3]:

# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
# The context manager closes the handle once the frame has been read.
with open("moties_processed_df.pickle","rb") as file:
    df = pickle.load(file)
print('before removal empty texts',len(df))

# remove moties without text
mask = (df['Text']=='') | (df['Text'].isna())
# .copy() turns the filtered selection into an independent frame, so adding
# columns to df later does not trigger pandas' SettingWithCopyWarning.
df = df.loc[~mask].copy()
print('after removal empty texts',len(df))

# Raw texts as an array; this is what the topic model is trained on.
documents = df['Text'].values

before removal empty texts 29514
after removal empty texts 29484


In [18]:
# Bigram detection adapted from https://lppier.github.io/
# Tokenise every document (tags stripped, accents removed) and learn which
# word pairs occur together at least 10 times.
sentence_stream = [simple_preprocess(strip_tags(doc), deacc=True) for doc in documents]
bigram = Phrases(sentence_stream, min_count=10)
bigram_phraser = Phraser(bigram)

# Lower-cased last word of every 'Indiener_persoon' value (presumably the
# submitter's surname). dropna() skips missing values: after .str.split() they
# are NaN, which is truthy and would make the [-1] lookup raise TypeError.
indieners = {
    name_parts[-1].lower()
    for name_parts in df['Indiener_persoon'].dropna().str.split()
    if name_parts
}
# Every token that starts with 'x' except those starting with 'xin'.
years = {word for doc in sentence_stream for word in doc if word.startswith('x') and not word.startswith('xin')}
# Hand-picked extra stopwords (the duplicate 'iv' entry was removed).
manual_stopwords = {'faber', 'lacin', 'kroger', 'iv', 'beschikt', 'die', 'vaststelling', 'lid', 'vi', 'viii', 'iii', 'ii', 'i'}
stopwords = indieners | years | manual_stopwords
   
def bigram_stopword_preprocess(doc):
    """Tokenise ``doc``, drop stopwords and apply the bigram phraser.

    The text is stripped of tags and tokenised with accents removed, tokens
    found in the module-level ``stopwords`` set are discarded, and the
    remaining tokens are passed through ``bigram_phraser``.
    """
    kept = []
    for token in simple_preprocess(strip_tags(doc), deacc=True):
        if token not in stopwords:
            kept.append(token)
    return bigram_phraser[kept]

In [19]:
# Train Top2Vec on the raw documents with the custom tokenizer, then show
# how many topics were found.
model = Top2Vec(
    documents,
    speed='deep-learn',
    tokenizer=bigram_stopword_preprocess,
    workers=4,
)
model.get_num_topics()

2021-03-03 12:21:44,496 - top2vec - INFO - Pre-processing documents for training
2021-03-03 12:23:30,780 - top2vec - INFO - Creating joint document/word embedding
2021-03-03 13:16:48,205 - top2vec - INFO - Creating lower dimension embedding of documents
2021-03-03 13:18:06,879 - top2vec - INFO - Finding dense areas of documents
2021-03-03 13:18:13,374 - top2vec - INFO - Finding topics


247

In [20]:
# One-off step: uncomment to persist the trained model to the path that is loaded again below.
# model.save("data/doc2vec_deep_bigram_enhanced_stopwords")

In [4]:
# Load the stored model from disk (same path as the commented-out model.save call)
# and show its number of topics.
model = Top2Vec.load("data/doc2vec_deep_bigram_enhanced_stopwords")
model.get_num_topics()

247

In [5]:
# Hash of the model object before querying it; the next cell prints it again after search_topics.
# NOTE(review): unless Top2Vec defines __hash__, this value is identity-based and
# cannot reveal changes to the model's internal state — confirm.
hash(model)

-9223371931130744909

In [7]:
# Search for 5 topics using the keyword 'klimaat'.
topic_words, word_scores, topic_scores, topic_nums = model.search_topics(keywords=['klimaat'], num_topics=5)
# Show the hash again so it can be compared with the value from before the search.
hash(model)

-9223371931130744909

In [21]:
def get_reduced_topics(num_topics):
    """Reduce the global ``model`` to ``num_topics`` topics.

    Returns a 2-tuple:
      * the groups returned by ``hierarchical_topic_reduction``, each turned
        into a sorted tuple so that groups can be hashed and compared;
      * the topic words of the reduced topics.
    """
    print(f'performing reduction to {num_topics} topics')
    hierarchy = model.hierarchical_topic_reduction(num_topics)
    # Only the words are needed; scores and topic numbers are discarded.
    words_per_topic, _scores, _nums = model.get_topics(reduced=True)
    normalised = tuple(tuple(sorted(group)) for group in hierarchy)
    return normalised, words_per_topic

def find_diff(reduced1, reduced2, reverse=False):
    """Return the positions of topics present in one reduction but not the other.

    With ``reverse=False`` the result holds indices into ``reduced1`` of the
    topics missing from ``reduced2`` (the topics that were merged). With
    ``reverse=True`` the roles are swapped and the indices refer to
    ``reduced2``.
    """
    source, other = (reduced2, reduced1) if reverse else (reduced1, reduced2)
    only_in_source = set(source) - set(other)
    return [position for position, topic in enumerate(source) if topic in only_in_source]


def print_merge(large, small, num_words=50):
    """Print which topics differ between two reductions.

    ``large`` and ``small`` are ``(topic groups, topic words)`` pairs. The
    first ``num_words`` words are printed for every topic of ``large`` that is
    absent from ``small`` and then for every topic of ``small`` that is absent
    from ``large``.
    """
    old_groups, new_groups = large[0], small[0]
    print(f'\ninspecting difference from {len(old_groups)} to {len(new_groups)} topics')
    print('old topics')
    for position in find_diff(old_groups, new_groups):
        print(large[1][position][:num_words])
    print('new topic')
    for position in find_diff(old_groups, new_groups, reverse=True):
        print(small[1][position][:num_words])
# Run the reduction for every topic count from `start` down to `stop + 1`
# and store each result under its topic count.
start = 17
stop = 6
reductions = {}
for i in range(start, stop, -1):
    reductions[i] = get_reduced_topics(i)
# Compare each reduction with the next smaller one to see which topics merged.
for i in range(start, stop + 1, -1):
    print(i)
    print_merge(reductions[i], reductions[i - 1])

In [5]:

# Choose the final number of topics and reduce the model to that many topics.
num_topics = 16
reduced_topics = model.hierarchical_topic_reduction(num_topics)


In [6]:
# Inspect the number of documents and the topic words for each reduced topic.
# Both calls need hierarchical_topic_reduction to have run first; the recorded
# output shows the ValueError raised when it has not.
topic_sizes, topic_nums = model.get_topic_sizes(reduced=True)
# topic_nums is assigned again here, replacing the value from get_topic_sizes.
topic_words, word_scores, topic_nums = model.get_topics(reduced=True)

ValueError: Hierarchical topic reduction has not been performed.

In [7]:
# Print every topic index followed by its topic words.
for i, t in enumerate(topic_words):
    print(i,t)

0 ['onderwijs_cultuur' 'wetenschap_viii' 'onderwijs' 'leerlingen'
 'voortgezet_onderwijs' 'viii' 'scholen' 'primair_onderwijs' 'mbo'
 'volwassenen_educatie' 'leerling' 'hoger_onderwijs' 'passend_onderwijs'
 'studenten' 'leraren' 'diploma' 'docenten' 'hbo' 'vakken' 'onderwijsveld'
 'funderend_onderwijs' 'docent' 'talenten' 'school' 'opleiding' 'celik'
 'vo' 'mbo_studenten' 'opleidingen' 'schooljaar' 'basisonderwijs'
 'hogescholen' 'curriculum' 'vmbo' 'havo' 'student' 'eindtoets'
 'bekostiging' 'leraar' 'voortgezet_speciaal' 'wet_educatie' 'doorstroom'
 'lerarentekort' 'middelbaar' 'schooladvies' 'wetenschap' 'werkdruk'
 'speciaal_onderwijs' 'klas' 'expertisecentra']
1 ['buitenlandse_zaken' 'jbz_raad' 'landen' 'vn' 'humanitaire'
 'buitenlandse_handel' 'mensenrechten' 'afghanistan' 'syrie' 'jemen'
 'vluchtelingen' 'saudi_arabie' 'turkije' 'bestrijding_internationaal'
 'actuele_situatie' 'verenigde_naties' 'internationale'
 'buitenlands_beleid' 'unhcr' 'libie' 'palestijnse_autoriteit' 'oor

In [8]:
# Switch back to the original (unreduced) topics.
topic_sizes, topic_nums = model.get_topic_sizes(reduced=False)
topic_words, word_scores, topic_nums = model.get_topics(reduced=False)
# Original topic numbers that belong to reduced topic 10 (labelled
# 'Klimaat & energie' in the `topics` mapping). The comprehension must yield
# the matching element `t`; yielding `topic_nums` produced one copy of the
# whole array per match.
climate_idx = [t for t in topic_nums if t in reduced_topics[10]]


In [24]:
# Template dict: one empty label per original topic that belongs to reduced topic 3.
d = {
    topic: ''
    for i, r in enumerate(reduced_topics)
    if i == 3
    for topic in r
}
d

{167: '',
 161: '',
 132: '',
 242: '',
 2: '',
 133: '',
 124: '',
 123: '',
 34: '',
 190: '',
 148: '',
 162: '',
 196: '',
 68: '',
 140: '',
 134: '',
 100: '',
 217: '',
 69: '',
 153: '',
 122: '',
 67: '',
 35: '',
 231: '',
 18: ''}

In [28]:
# natuur_subtopics is not assigned in any visible cell, so on a fresh kernel
# this loop raised NameError.
# NOTE(review): rebuilt here from `d` (the members of reduced topic 3), whose key
# order matches the recorded output — confirm this was the original definition.
natuur_subtopics = list(d)
# Print each subtopic number followed by its topic words.
for i in natuur_subtopics:
    print(i, topic_words[i])

167 ['schaliegas' 'winning' 'structuurvisie' 'gas' 'mijnbouw' 'moratorium'
 'ambitieus' 'fossiele_brandstoffen' 'gaswinning' 'winnen'
 'stimulering_duurzame' 'olie' 'vergunningen' 'provincie' 'grondwater'
 'aardbevingen' 'schoon' 'deze_kabinetsperiode' 'draagvlak' 'drinkwater'
 'groningenveld' 'waddenzee' 'nut' 'natuurbeleid' 'noodzaak' 'lokaal'
 'transitie_naar' 'grondgebied' 'radicalisering' 'bodemdaling'
 'duurzame_energie' 'vervuiling' 'friesland' 'onderbouwd' 'voorwaarde'
 'onderzoeksraad_voor' 'aardgas' 'ehs' 'waterbeleid' 'provincies' 'irak'
 'kansrijke' 'kernenergie' 'warmte' 'gaswet' 'subsidiering' 'regime'
 'gaswinning_groningen' 'mer' 'beleidsnota_biotechnologie']
161 ['waddenzee' 'bodemdaling' 'mijnbouw' 'gaswinning' 'gas' 'natuur'
 'winning' 'gaswinning_groningen' 'nam' 'schade' 'aardbevingen' 'noordzee'
 'onderzoeksraad_voor' 'groningen' 'waterbeleid' 'schaliegas' 'vergunning'
 'vergunningen' 'eilanden' 'olie' 'vervuiling' 'stimulering_duurzame'
 'daling' 'bes_fonds' 'han

In [32]:
# Query the reduced-topic assignment of every document, using positional ids 0..len(documents)-1.
# NOTE(review): assumes the model was trained without custom document ids — confirm.
doc_ids = list(range(len(documents)))
topic_nums, topic_score, topic_words, word_scores = model.get_documents_topics(doc_ids,reduced=True)

In [33]:
# Readable labels for the 16 reduced topics, keyed by reduced topic number.
topics = {
    0: 'Onderwijs',
    1: 'Buitenlandse zaken',
    2: 'Algemene zaken',
    3: 'Natuur & gaswinning',
    4: 'Landbouw & dierenwelzijn',
    5: 'Zorg',
    6: 'Sociale zaken',
    7: 'Justitie',
    8: 'Pensioenstelsel',
    9: 'Europese Unie',
    10: 'Klimaat & energie',
    11: 'Milieu & regelgeving',
    12: 'Zorg',
    13: 'Openbaar vervoer',
    14: 'Financiele sector',
    15: 'Wonen',
}

doc_ids = list(range(len(documents)))

# Reduced topic per document, stored as its readable label.
topic_nums, topic_score, topic_words, word_scores = model.get_documents_topics(
    doc_ids,
    reduced=True,
)
topic_names = [topics[number] for number in topic_nums]
assert len(topic_nums) == len(df)
df['Topic'] = topic_names

# Original (unreduced) topic number per document.
topic_nums, topic_score, topic_words, word_scores = model.get_documents_topics(
    doc_ids,
    reduced=False,
)
df['Topic_initial'] = topic_nums

In [34]:
# Display the original (unreduced) topic number stored for each document.
df['Topic_initial']

2009Z00479     61
2009Z00488     61
2009Z00489     61
2009Z00490     61
2009Z00483     54
             ... 
2020Z25826     33
2020Z25819    104
2020Z25825    104
2020Z25824    104
2020Z25822     30
Name: Topic_initial, Length: 29484, dtype: int32

In [35]:
# Hand-written labels keyed by topic number; mapped onto the 'Topic_initial'
# values of the 'Klimaat & energie' documents.
climate_subtopics = {
    128: 'Afhankelijkheid fossiele brandstoffen',
    165: 'CO2 reductie',
    5: 'Voldoen aan Parijs',
    141: 'Electriciteit',
    126: 'Groningen',
    205: 'Zonnepanelen',
    96: 'Energierekening betalen',
    236: 'Biomassa',
    105: 'Kolencentrales',
    239: 'Windturbines - overlast',
    29: 'Windturbines - subsidie',
}

In [39]:
# Label climate documents with their subtopic name. The assignment aligns on
# the index, so rows outside 'Klimaat & energie' (and topic numbers without a
# label) end up as NaN.
df['Klimaat'] = (
    df.loc[df['Topic'].eq('Klimaat & energie'), 'Topic_initial']
    .map(climate_subtopics)
)

In [40]:
# Persist the first 1000 rows as a small sample and the full frame, both via the save() helper.
save(df[:1000], 'df_including_topics.pickle')
save(df, 'df_including_topics_full.pickle')

In [115]:
from collections import Counter
import nltk
# Import under an alias: a bare `stopwords` import would overwrite the global
# stopword set that bigram_stopword_preprocess reads at call time.
from nltk.corpus import stopwords as nltk_stopwords
nltk.download('stopwords')
# A set gives constant-time membership tests; the list was scanned once per word.
stop_words = set(nltk_stopwords.words('dutch'))
# Frequency of every token in the corpus that is not a Dutch stopword.
count = Counter(word for doc in sentence_stream for word in doc if word not in stop_words)
# NOTE(review): WordCloud is not imported in any visible cell; presumably
# `from wordcloud import WordCloud` is needed in the import cell — confirm.
WordCloud(width=600, height=400, background_color='yellow').generate_from_frequencies(count).to_file('wordcloud.png')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jesse\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
