In [4]:
import pickle
import pandas as pd

from datetime import datetime
from gensim.models import TfidfModel
from gensim.models.ldamodel import LdaModel
from gensim.models.nmf import Nmf
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
import numpy as np
from gensim.models.phrases import Phrases
from gensim.utils import simple_preprocess

import pprint
import spacy
from ast import literal_eval
import re

from tabulate import tabulate
from texttable import Texttable
import latextable

In [6]:
def lemmatize_data(prep_tokenized, nlp):
    """
    Turn tokenised texts into lists of lemmas.

    Each element of ``prep_tokenized`` is a sequence of tokens. The tokens are
    joined with single spaces, the resulting string is passed to ``nlp``, and
    the ``lemma_`` attribute of every token in the returned document is
    collected.

    Returns a list with one list of lemmas per input text, in input order.
    """
    return [
        [token.lemma_ for token in nlp(' '.join(tokens))]
        for tokens in prep_tokenized
    ]

def tokenize_clean(texts):
    """
    Lazily tokenise and clean an iterable of texts.

    Every item is converted with ``str()`` and passed to gensim's
    ``simple_preprocess`` with de-accentuation switched on and a token length
    window of 1 to 100 characters. One token list is yielded per input text.
    """
    for raw_text in texts:
        # simple_preprocess removes punctuation and lowercases
        tokens = simple_preprocess(str(raw_text), deacc=True, min_len=1, max_len=100)
        yield tokens

In [28]:
"""
Create lemmatised column
"""
# One-off preprocessing step, kept commented out. When enabled it tokenises the
# `preprocessed_hlead` column, lemmatises the tokens with the spaCy
# `nl_core_news_sm` pipeline and stores them in a `lemmatized_tokenized` column.
# Note that the result is written back to the same TSV file it was read from,
# i.e. the input file is overwritten.
# nlp = spacy.load('nl_core_news_sm')

# df = pd.read_csv('data/complete-clean-preprocessed-data-2010-2020-1.tsv', sep='\t')
# tokenized = tokenize_clean(df['preprocessed_hlead'].to_list())
# lemmatized_tokenized = lemmatize_data(tokenized, nlp)
# df['lemmatized_tokenized'] = lemmatized_tokenized
# df.to_csv('data/complete-clean-preprocessed-data-2010-2020-1.tsv', sep='\t', index=False)

In [7]:
def doc_term_matrix(data, no_below=1, no_above=0.9):
    """
    Create document-term matrices from the lemmatised tokens column.

    Parameters
    ----------
    data : pandas Series (or any iterable) of token lists, one per document.
    no_below : int, passed to ``Dictionary.filter_extremes``; tokens that occur
        in fewer than this many documents are dropped. Default 1 (keep all).
    no_above : float, passed to ``Dictionary.filter_extremes``; tokens that
        occur in more than this fraction of the documents are dropped.
        Default 0.9.

    Returns
    -------
    (corpus_tfidf, corpus, dictionary) : the tf-idf weighted corpus, the plain
        bag-of-words corpus and the gensim ``Dictionary`` both are based on.
    """
    # Materialise the documents once: they are iterated twice below (to build
    # the dictionary and to build the bag-of-words corpus).
    lemmatized_tokenized = data.tolist() if hasattr(data, 'tolist') else list(data)
    dictionary = Dictionary(lemmatized_tokenized)

    # Remove tokens that appear in more than `no_above` (default 90%) of the
    # texts. Only the two thresholds are set here; the other arguments of
    # filter_extremes keep their library defaults.
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)

    # bag-of-words
    corpus = [dictionary.doc2bow(text) for text in lemmatized_tokenized]

    # tf-idf
    tfidf = TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    print("Finished doc_term_matrix")
    return corpus_tfidf, corpus, dictionary

In [8]:
def evaluation(model, data, dictionary, corpus):
    """
    Return the c_v topic coherence of ``model``.

    ``data`` supplies the tokenised texts and ``dictionary`` the vocabulary
    used by the coherence calculation. ``corpus`` is accepted but currently
    unused: it was only needed for the perplexity score, which is disabled
    (unused in thesis).
    """
    # Disabled alternative metric:
    #     perplexity = model.log_perplexity(corpus)
    return CoherenceModel(
        model=model,
        texts=data,
        dictionary=dictionary,
        coherence='c_v',
    ).get_coherence()

In [9]:
def hyperparameter_tuning(data, model, dictionary, corpus, name,
                          topic_counts=(5, 10, 20, 40), passes=20, random_state=123):
    """
    Hyperparameter tuning on the number of topics; each model is saved to disk.

    One model is trained per entry of ``topic_counts``, scored with
    ``evaluation`` (coherence only; perplexity is not collected) and pickled to
    ``models/<name>_<i>.pkl``, where ``i`` is the position of the candidate in
    ``topic_counts``.

    Parameters
    ----------
    data : tokenised texts, forwarded to ``evaluation``.
    model : model *class* (e.g. ``LdaModel`` or ``Nmf``); it is called as
        ``model(corpus, num_topics=..., id2word=..., passes=..., random_state=...)``.
    dictionary : gensim dictionary, used as ``id2word`` and for scoring.
    corpus : corpus the candidates are trained on.
    name : file-name prefix of the pickled models.
    topic_counts : numbers of topics to try. Change depending on the models
        one wants to check.
    passes, random_state : forwarded unchanged to every candidate.

    Returns
    -------
    (coherences, models) : two lists in the order of ``topic_counts``.
    """
    import os  # local import: the notebook's import cell does not provide os

    # Create the output folder up front so a missing directory is noticed
    # before any (expensive) training has happened.
    output_dir = 'models'
    os.makedirs(output_dir, exist_ok=True)

    coherences = []
    models = []
    for num, num_topics in enumerate(topic_counts):
        # `candidate` keeps the trained instance apart from the `model` class
        # parameter, which is needed again on the next iteration.
        candidate = model(corpus, num_topics=num_topics, id2word=dictionary,
                          passes=passes, random_state=random_state)
        coherences.append(evaluation(candidate, data, dictionary, corpus))
        print("Finished evaluating model " + str(num))

        path = output_dir + '/' + name + '_' + str(num) + '.pkl'
        # Context manager guarantees the file is flushed and closed.
        with open(path, 'wb') as handle:
            pickle.dump(candidate, handle)
        models.append(candidate)
    return coherences, models

In [10]:
# Read the pre-processed articles. The `lemmatized_tokenized` column is parsed
# with literal_eval so that each cell becomes a Python object again instead of
# staying a plain string.
df = pd.read_csv(
    'data/complete-clean-preprocessed-data-2010-2020-1.tsv',
    sep='\t',
    converters={'lemmatized_tokenized': literal_eval},
)
data = df['lemmatized_tokenized']

# Build the tf-idf corpus, the bag-of-words corpus and the shared dictionary.
corpus_tfidf, corpus, dictionary = doc_term_matrix(data)

Finished doc_term_matrix


In [33]:
# LDA on the bag-of-words corpus: hyperparameter tune the number of topics using
# coherence. Every candidate is also pickled as models/lda_model_<i>.pkl.
coherences_lda, lda_models = hyperparameter_tuning(data, LdaModel, dictionary, corpus, 'lda_model')
# Get index of highest coherence value. Perplexity could be used instead, but
# evaluation() currently returns the coherence score only.
best_model_index_lda = np.argmax(coherences_lda)
lda_model = lda_models[best_model_index_lda]
# Display the topics of the selected model
lda_model.print_topics(-1)

Finished evaluating model 0
Finished evaluating model 1
Finished evaluating model 2
Finished evaluating model 3


[(0,
  '0.032*"shell" + 0.020*"raffinaderij" + 0.016*"atm" + 0.011*"prinses" + 0.008*"olie" + 0.008*"nigeria" + 0.007*"hydraulisch" + 0.007*"matras" + 0.006*"beatrix" + 0.005*"harmsen"'),
 (1,
  '0.017*"club" + 0.016*"baan" + 0.012*"meierijstad" + 0.010*"new" + 0.009*"circuit" + 0.009*"buitenring" + 0.008*"york" + 0.008*"stein" + 0.007*"voertuig" + 0.007*"ijsbaan"'),
 (2,
  '0.028*"caverne" + 0.007*"amtsvenn" + 0.005*"obc" + 0.003*"lutte" + 0.003*"overbetuw" + 0.002*"dhg" + 0.002*"kadernota" + 0.001*"hef" + 0.001*"nachricht" + 0.001*"westfalisch"'),
 (3,
  '0.021*"dat" + 0.018*"te" + 0.018*"voor" + 0.016*"zijn" + 0.014*"niet" + 0.014*"je" + 0.013*"die" + 0.013*"worden" + 0.012*"met" + 0.012*"als"'),
 (4,
  '0.092*"b" + 0.084*"c" + 0.079*"a" + 0.025*"v" + 0.020*"welk" + 0.016*"arnhem" + 0.014*"nijmeg" + 0.009*"stadion" + 0.008*"nijmeegs" + 0.005*"hoe"'),
 (5,
  '0.005*"dorpshart" + 0.005*"leimuid" + 0.004*"jerrycan" + 0.004*"schreuder" + 0.003*"comol" + 0.002*"rivierslib" + 0.002*"bruid

In [276]:
# NMF on the bag-of-words corpus: hyperparameter tune the number of topics using
# coherence. Every candidate is also pickled as models/nmf_model_<i>.pkl.
coherences_nmf, nmf_models = hyperparameter_tuning(data, Nmf, dictionary, corpus, 'nmf_model')
# Get index of highest coherence value. Perplexity could be used instead, but
# evaluation() currently returns the coherence score only.
best_model_index_nmf = np.argmax(coherences_nmf)
nmf_model = nmf_models[best_model_index_nmf]
# Display the topics of the selected model
nmf_model.print_topics(-1)

Finished evaluating model 0
Finished evaluating model 1
Finished evaluating model 2
Finished evaluating model 3


[(0,
  '0.073*"dat" + 0.070*"ook" + 0.037*"er" + 0.036*"zijn" + 0.034*"met" + 0.032*"worden" + 0.025*"uit" + 0.014*"al" + 0.013*"te" + 0.013*"deze"'),
 (1,
  '0.044*"of" + 0.024*"als" + 0.019*"dan" + 0.016*"niet" + 0.016*"ook" + 0.014*"die" + 0.013*"tot" + 0.010*"kan" + 0.009*"deze" + 0.009*"over"'),
 (2,
  '0.080*"die" + 0.049*"voor" + 0.038*"te" + 0.020*"om" + 0.019*"naar" + 0.018*"zijn" + 0.017*"er" + 0.015*"uit" + 0.013*"over" + 0.012*"worden"'),
 (3,
  '0.037*"met" + 0.036*"was" + 0.034*"die" + 0.025*"nog" + 0.023*"zijn" + 0.019*"hebben" + 0.016*"er" + 0.014*"werd" + 0.010*"grond" + 0.010*"nieuw"'),
 (4,
  '0.063*"je" + 0.033*"te" + 0.027*"zijn" + 0.021*"gebrek" + 0.018*"woning" + 0.015*"bij" + 0.014*"of" + 0.013*"als" + 0.013*"aan" + 0.012*"verkoper"'),
 (5,
  '0.074*"we" + 0.028*"met" + 0.018*"zijn" + 0.017*"maar" + 0.016*"gaan" + 0.015*"om" + 0.015*"die" + 0.013*"naar" + 0.012*"te" + 0.010*"niet"'),
 (6,
  '0.074*"dat" + 0.060*"als" + 0.049*"te" + 0.019*"voor" + 0.016*"niet" + 

In [35]:
# LDA on the tf-idf weighted corpus: hyperparameter tune the number of topics
# using coherence. Every candidate is also pickled as
# models/lda_model_tfidf_<i>.pkl.
coherences_lda_tfidf, lda_models_tfidf = hyperparameter_tuning(data, LdaModel, dictionary, corpus_tfidf, 'lda_model_tfidf')
# Get index of highest coherence value. Perplexity could be used instead, but
# evaluation() currently returns the coherence score only.
best_model_index_lda_tfidf = np.argmax(coherences_lda_tfidf)
lda_model_tfidf = lda_models_tfidf[best_model_index_lda_tfidf]
# Display the topics of the selected model
lda_model_tfidf.print_topics(-1)

Finished evaluating model 0
Finished evaluating model 1
Finished evaluating model 2
Finished evaluating model 3


[(0,
  '0.002*"ik" + 0.002*"we" + 0.002*"gemeente" + 0.002*"provincie" + 0.002*"hij" + 0.002*"grond" + 0.002*"je" + 0.002*"terrein" + 0.001*"ze" + 0.001*"onderzoek"'),
 (1,
  '0.001*"asbestdak" + 0.001*"staalslak" + 0.001*"staalslakken" + 0.000*"hoeve" + 0.000*"tata" + 0.000*"vredenburg" + 0.000*"hvc" + 0.000*"waadhoek" + 0.000*"ballast" + 0.000*"nedam"'),
 (2,
  '0.003*"drugsafval" + 0.002*"wml" + 0.001*"drug" + 0.001*"dader" + 0.001*"meierijstad" + 0.001*"rijsenhout" + 0.001*"xtc" + 0.001*"synthetisch" + 0.001*"bentum" + 0.001*"sulfaat"'),
 (3,
  '0.002*"schaliegas" + 0.001*"coa" + 0.001*"recyclingbedrijf" + 0.001*"tankstation" + 0.001*"roermond" + 0.001*"vitens" + 0.001*"afvalstof" + 0.001*"haarlemmermeer" + 0.001*"miljard" + 0.001*"eindhoven"'),
 (4,
  '0.001*"griftpark" + 0.001*"aldi" + 0.000*"putman" + 0.000*"abn" + 0.000*"amro" + 0.000*"geldrop" + 0.000*"mierlo" + 0.000*"willemsoord" + 0.000*"volgermeer" + 0.000*"zaans"'),
 (5,
  '0.001*"culemborg" + 0.000*"oisterwijk" + 0.000*"

In [36]:
# NMF on the tf-idf weighted corpus: hyperparameter tune the number of topics
# using coherence. Every candidate is also pickled as
# models/nmf_model_tfidf_<i>.pkl.
coherences_nmf_tfidf, nmf_models_tfidf = hyperparameter_tuning(data, Nmf, dictionary, corpus_tfidf, 'nmf_model_tfidf')
# Get index of highest coherence value. Perplexity could be used instead, but
# evaluation() currently returns the coherence score only.
best_model_index_nmf_tfidf = np.argmax(coherences_nmf_tfidf)
nmf_model_tfidf = nmf_models_tfidf[best_model_index_nmf_tfidf]
# Display the topics of the selected model
nmf_model_tfidf.print_topics(-1)

Finished evaluating model 0
Finished evaluating model 1
Finished evaluating model 2
Finished evaluating model 3


[(16,
  '0.008*"we" + 0.006*"je" + 0.004*"ze" + 0.003*"ons" + 0.003*"wat" + 0.003*"wij" + 0.003*"goed" + 0.003*"maar" + 0.003*"onze" + 0.002*"mens"'),
 (33,
  '0.005*"bedrijf" + 0.004*"grondverzet" + 0.004*"boer" + 0.004*"machine" + 0.003*"hij" + 0.003*"werk" + 0.003*"agrarisch" + 0.002*"werken" + 0.002*"uur" + 0.002*"klant"'),
 (31,
  '0.011*"plant" + 0.011*"duizendknoop" + 0.006*"exoot" + 0.005*"wetenschapper" + 0.005*"spitsberg" + 0.005*"expeditie" + 0.004*"japans" + 0.004*"dier" + 0.004*"eiland" + 0.004*"soort"'),
 (37,
  '0.007*"woning" + 0.005*"plan" + 0.005*"bouw" + 0.004*"bouwen" + 0.004*"appartement" + 0.003*"centrum" + 0.003*"ontwikkelaar" + 0.003*"huis" + 0.002*"nieuw" + 0.002*"nieuwbouw"'),
 (32,
  '0.029*"ik" + 0.010*"mijn" + 0.007*"heb" + 0.006*"ben" + 0.005*"hij" + 0.005*"me" + 0.003*"je" + 0.003*"mij" + 0.002*"doornbos" + 0.002*"u"'),
 (13,
  '0.020*"pand" + 0.005*"olafstraat" + 0.004*"gebouw" + 0.004*"stomerij" + 0.004*"verkoop" + 0.004*"eigenaar" + 0.004*"verkopen" + 

In [255]:
# Load in models with highest topic coherence score
def _load_pickled_model(path):
    """
    Load one pickled model from ``path`` and close the file afterwards.

    SECURITY: unpickling executes code stored in the file, so only load
    pickles that were produced by this notebook / a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# The file indices are hard-coded positions in the list of candidate topic
# counts. NOTE(review): confirm they still match the arg-max of the coherence
# scores whenever the tuning cells are re-run.
lda_model = _load_pickled_model('models/lda_model_2.pkl')
nmf_model = _load_pickled_model('models/nmf_model_3.pkl')
lda_model_tfidf = _load_pickled_model('models/lda_model_tfidf_1.pkl')
nmf_model_tfidf = _load_pickled_model('models/nmf_model_tfidf_3.pkl')

In [251]:
def get_topic_lists(model, num_topics, num_words):
    """
    Get topic ids, topic words and probabilities of a model.

    The information is parsed from the formatted strings returned by
    ``model.print_topics``, which look like
    ``'0.032*"shell" + 0.020*"raffinaderij"'``.

    Parameters
    ----------
    model : object with a ``print_topics(num_topics=..., num_words=...)``
        method returning ``(topic_id, formatted_string)`` pairs.
    num_topics : number of topics requested from the model.
    num_words : number of words requested per topic.

    Returns
    -------
    (topic_ids, topic_words_per_topic, topic_probs_per_topic)
        ``topic_ids`` is a list of ids as strings; the other two are lists
        with one inner list per topic, holding the words and the
        probabilities (kept as the strings printed by the model). Words and
        probabilities carry no surrounding whitespace or quotes.

    Raises
    ------
    ValueError if a term is not of the form ``<prob>*"<word>"``.
    """
    topic_ids = []
    topic_words_per_topic = []
    topic_probs_per_topic = []

    for topic_id, formatted_topic in model.print_topics(num_topics=num_topics, num_words=num_words):
        topic_words = []
        topic_probs = []

        # Terms are joined with ' + '. Splitting on the padded separator (and
        # not on a bare '+') keeps tokens that contain a plus sign intact.
        for term in formatted_topic.split(' + '):
            term = term.strip()
            if not term:
                continue  # empty topic string -> no terms
            # Split on the first '*' only, so the word itself may contain '*'.
            topic_prob, separator, topic_word = term.partition('*')
            if not separator:
                raise ValueError('Cannot parse topic term: %r' % term)
            # Strip the padding left over from the formatted string; it would
            # otherwise end up inside the generated tables.
            topic_probs.append(topic_prob.strip())
            topic_words.append(topic_word.replace('"', '').strip())

        topic_ids.append(str(topic_id))
        topic_words_per_topic.append(topic_words)
        topic_probs_per_topic.append(topic_probs)

    return topic_ids, topic_words_per_topic, topic_probs_per_topic


def to_latex(topic_ids, topic_words_per_topic, num_topics, caption, label):
    """
    Print a LaTeX table with one column per topic.

    ``topic_ids`` and ``topic_words_per_topic`` are the first two values
    returned by get_topic_lists(). ``num_topics`` is the number of columns
    (one centred column is configured per topic); ``caption`` and ``label``
    are passed on to the generated table. Nothing is returned, the LaTeX
    source is printed.
    """
    # The input holds one list of words per topic; flip it so that every table
    # row holds the n-th word of each topic.
    word_rows = [list(nth_words) for nth_words in zip(*topic_words_per_topic)]

    table = Texttable()
    table.set_cols_align(["c"] * num_topics)
    table.set_deco(Texttable.HEADER | Texttable.VLINES)
    # Topic ids first, followed by the word rows
    table.add_rows([topic_ids] + word_rows)

    print('\nTexttable Latex:')
    print(latextable.draw_latex(table, caption=caption, use_booktabs=True, label=label))

In [275]:
# Code to create latex tables of best models
# Code to create latex tables of best models
models = [lda_model, nmf_model, lda_model_tfidf, nmf_model_tfidf]
# Placeholder caption/label: every generated table gets the same values, so
# they have to be replaced by hand before the tables are used.
caption = 'a'
label = 'a'
for model in models:
    # 5 topics with 10 words each; the probabilities are retrieved but not used.
    topic_ids, topic_words_per_topic, topic_probs_per_topic = get_topic_lists(model, 5, 10)
    # The column count (5) has to match the number of topics requested above.
    to_latex(topic_ids, topic_words_per_topic, 5, caption, label)


Texttable Latex:
\begin{table}
	\begin{center}
		\begin{tabular}{c c c c c}
			\toprule
			5 & 12 & 9 & 1 & 8 \\
			\midrule
			dorpshart  & dat  & x  & club  & cat  \\
			leimuid  & zijn  & lent  & baan  & wonderland  \\
			jerrycan  & met  & enk  & meierijstad  & isoo  \\
			schreuder  & ik  & waalbrug  & new  & cate  \\
			comol  & te  & kasteeltuin  & circuit  & catalogusbeeld  \\
			rivierslib  & die  & duiveland  & buitenring  & madelon  \\
			bruid  & maar  & overflakkee  & york  & stegeman  \\
			bliemer  & er  & goere  & stein  & baroktuin  \\
			leimuiden  & niet  & lents  & voertuig  & weerspiegelen  \\
			bruidegom & hebben & vierdaags & ijsbaan & tolweg \\
			\bottomrule
		\end{tabular}
	\end{center}
	\caption{a}
	\label{a}
\end{table}

Texttable Latex:
\begin{table}
	\begin{center}
		\begin{tabular}{c c c c c}
			\toprule
			38 & 16 & 25 & 30 & 15 \\
			\midrule
			uur  & te  & grond  & dat  & a  \\
			tot  & door  & pfa  & die  & c  \\
			aan  & met  & stof  & om  & b  

In [45]:
# unused
def format_topics_sentences(ldamodel, corpus, texts):
    """
    Format the dominant topic of every document into a dataframe.

    Parameters
    ----------
    ldamodel : topic model; ``ldamodel[corpus]`` must yield, per document, a
        list of ``(topic_id, proportion)`` pairs and ``ldamodel.show_topic(id)``
        a list of ``(word, weight)`` pairs.
    corpus : corpus to assign topics to.
    texts : original texts, one per document in ``corpus``.

    Returns
    -------
    DataFrame with the columns ``Dominant_Topic``, ``Perc_Contribution``
    (rounded to 4 decimals) and ``Topic_Keywords`` (comma separated), followed
    by a column holding ``texts``. A document without any topic assignment
    gets NaN / NaN / '' so that rows stay aligned with ``texts``.
    """
    records = []

    # Query the model that was passed in, not a notebook-level global.
    for row in ldamodel[corpus]:
        if not row:
            # Keep a placeholder; skipping the row would shift every later
            # document against `texts`.
            records.append((float('nan'), float('nan'), ''))
            continue

        # Dominant topic = pair with the highest proportion (first one on ties)
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        wp = ldamodel.show_topic(topic_num)
        topic_keywords = ", ".join(word for word, _ in wp)
        records.append((int(topic_num), round(prop_topic, 4), topic_keywords))

    # Build the frame in one go instead of appending row by row
    # (DataFrame.append is slow for this and no longer exists in pandas 2).
    sent_topics_df = pd.DataFrame(
        records,
        columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'],
    )

    # Add original text to the end of the output
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)


# Dominant topic per document for the LDA model on the bag-of-words corpus,
# with the `preprocessed_hlead` text attached as the last column.
df_topic_sents_keywords = format_topics_sentences(lda_model, corpus, df['preprocessed_hlead'])

# Format: reset_index() turns the row index into a leading column, after which
# all five columns are renamed.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show the first 10 documents
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,17.0,0.2945,"dat, te, worden, er, niet, voor, zijn, met, om...",In februari 1945 verging de Duitse onderzeeboo...
1,1,26.0,0.5079,"grond, dat, worden, zijn, er, vervuilen, met, ...",De voormalige stortplaats in de Kanaalpolder b...
2,2,12.0,0.7969,"met, voor, zijn, we, te, die, dat, ook, aan, er",Tijdens het prinsenbal van de Peelpluimen in d...
3,3,10.0,0.393,"worden, voor, aan, met, er, te, gemeente, dat,...",Het plan voor de bouw van twaalf woningen op h...
4,4,17.0,0.6951,"dat, te, worden, er, niet, voor, zijn, met, om...","De weersvoorspellingen waren goed, het zag er ..."
5,5,26.0,0.6987,"grond, dat, worden, zijn, er, vervuilen, met, ...",De sterk vervuilde grond in de Doornse Heuvelw...
6,6,26.0,0.66,"grond, dat, worden, zijn, er, vervuilen, met, ...","De woonwijk het Franse Gat, grotendeels gebouw..."
7,7,26.0,0.3555,"grond, dat, worden, zijn, er, vervuilen, met, ...",Voor bewoners van de Veenendaalse wijk het Fra...
8,8,17.0,0.2971,"dat, te, worden, er, niet, voor, zijn, met, om...",Het grote (financiële) knelpunt is de aanwezig...
9,9,23.0,0.52,"ik, dat, zijn, niet, hij, je, maar, ze, die, h...",Ogenschijnlijk doen de bewoners van Franse Gat...
