In [None]:
import pickle
import pandas as pd

from datetime import datetime
from gensim.models import TfidfModel
from gensim.models.ldamodel import LdaModel
from gensim.models.nmf import Nmf
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
import numpy as np
from gensim.models.phrases import Phrases
from gensim.utils import simple_preprocess

import pprint
import pyLDAvis.gensim_models
import spacy
from ast import literal_eval

In [None]:
def lemmatize_data(prep_tokenized, nlp=None):
    """Lemmatize already-tokenized documents with a spaCy pipeline.

    Parameters
    ----------
    prep_tokenized : iterable of list of str
        One token list per document. Any iterable works, including a
        generator such as the one returned by ``tokenize_clean``.
    nlp : callable, optional
        A loaded spaCy pipeline (anything that maps a string to an iterable
        of tokens exposing ``lemma_``). When omitted or ``None`` the Dutch
        ``nl_core_news_sm`` model is loaded once for this call.

    Returns
    -------
    list of list of str
        The lemma of every token produced by the pipeline, one inner list
        per input document, in input order.
    """
    # Previously the argument was overwritten unconditionally, so a pipeline
    # supplied by the caller was ignored and the model was reloaded each call.
    if nlp is None:
        nlp = spacy.load('nl_core_news_sm')

    # Tokens are re-joined with spaces because the pipeline expects raw text.
    return [
        [token.lemma_ for token in nlp(' '.join(text))]
        for text in prep_tokenized
    ]

def tokenize_clean(texts):
    """Lazily tokenize each text with gensim's ``simple_preprocess``.

    Every item is first converted with ``str()`` and then tokenized with
    accent stripping enabled (``deacc=True``) and token lengths bounded to
    1..100 characters. Results are yielded one document at a time, so the
    input is only consumed as the caller iterates.
    """
    def _to_tokens(raw):
        # removes punctuation, lowercases
        return simple_preprocess(str(raw), deacc=True, min_len=1, max_len=100)

    yield from map(_to_tokens, texts)

In [None]:
# df = pd.read_csv('data/complete-clean-preprocessed-data-2010-2020.tsv', sep='\t')
# tokenized = tokenize_clean(df['preprocessed_hlead'].to_list())
# lemmatized_tokenized = lemmatize_data(tokenized, nlp)
# df['lemmatized_tokenized'] = lemmatized_tokenized
# df.to_csv('data/complete-clean-preprocessed-data-2010-2020.tsv', sep='\t', index=False)

In [None]:
def doc_term_matrix(data):
    """Build the gensim dictionary plus bag-of-words and TF-IDF corpora.

    Parameters
    ----------
    data : object with ``.tolist()``
        Tokenized documents; ``data.tolist()`` must give one token list
        per document (the notebook passes a pandas Series).

    Returns
    -------
    tuple
        ``(corpus_tfidf, corpus, dictionary)``: the TF-IDF view of the
        corpus, the plain bag-of-words corpus, and the filtered dictionary.
    """
    documents = data.tolist()

    # Vocabulary, filtered with no_below=1 / no_above=0.9.
    vocabulary = Dictionary(documents)
    vocabulary.filter_extremes(no_below=1, no_above=0.9)

    # Term-document frequencies, then the TF-IDF re-weighting of the same corpus.
    bow_corpus = list(map(vocabulary.doc2bow, documents))
    weighted_corpus = TfidfModel(bow_corpus)[bow_corpus]

    print("Finished doc_term_matrix")
    return weighted_corpus, bow_corpus, vocabulary

In [None]:
def evaluation(model, data, dictionary, corpus):
    """Return the ``c_v`` coherence of ``model`` measured on ``data``.

    ``data`` is passed to gensim's ``CoherenceModel`` as ``texts`` and
    ``dictionary`` as its dictionary. ``corpus`` is accepted but currently
    unused: it is only needed by the disabled perplexity computation below.
    """
    # Perplexity is switched off; re-enable together with the second return value.
    # perplexity = model.log_perplexity(corpus)
    scorer = CoherenceModel(
        model=model,
        texts=data,
        dictionary=dictionary,
        coherence='c_v',
    )
    return scorer.get_coherence()  # , perplexity

In [None]:
def hyperparameter_tuning(data, model, dictionary, corpus, name,
                          num_topics_list=(5, 10, 20, 40)):
    """Train one topic model per candidate topic count and score each one.

    Parameters
    ----------
    data : tokenized documents
        Forwarded to ``evaluation`` for the coherence computation.
    model : callable
        Model factory, called as ``model(corpus, num_topics=..., id2word=...,
        passes=20, random_state=123)`` (the notebook passes ``LdaModel`` or
        ``Nmf``).
    dictionary : gensim dictionary
        Passed as ``id2word`` to the factory and forwarded to ``evaluation``.
    corpus : corpus
        Training corpus, also forwarded to ``evaluation``.
    name : str
        File-name prefix; candidate ``i`` is pickled to
        ``models/<name>_<i>.pkl``. The ``models/`` directory must exist.
    num_topics_list : iterable of int, optional
        Topic counts to try, in order. Defaults to the previously
        hard-coded ``(5, 10, 20, 40)``.

    Returns
    -------
    tuple
        ``(coherences, models)``, two lists in candidate order.
    """
    # All candidates are trained up front, then evaluated and saved in order.
    models = [
        model(corpus, num_topics=num_topics, id2word=dictionary,
              passes=20, random_state=123)
        for num_topics in num_topics_list
    ]

    coherences = []
#     perplexities = []
    # `candidate` avoids shadowing the `model` factory parameter.
    for num, candidate in enumerate(models):
        coherence = evaluation(candidate, data, dictionary, corpus)
        coherences.append(coherence)
#         perplexities.append(perplexity)
        print("Finished evaluating model " + str(num))
        path = 'models/' + name + '_' + str(num) + '.pkl'
        # Context manager guarantees the file is flushed and closed.
        with open(path, 'wb') as handle:
            pickle.dump(candidate, handle)
    return coherences, models

In [None]:
# Load the preprocessed articles. The 'lemmatized_tokenized' column is parsed
# with literal_eval while reading, then used to build the dictionary and corpora.
df = pd.read_csv(
    'data/complete-clean-preprocessed-data-2010-2020.tsv',
    sep='\t',
    converters={'lemmatized_tokenized': literal_eval},
)
data = df['lemmatized_tokenized']
corpus_tfidf, corpus, dictionary = doc_term_matrix(data)

In [None]:
# LDA on raw bag-of-words counts: train every candidate topic count and
# score each candidate by coherence.
coherences_lda, lda_models = hyperparameter_tuning(
    data, LdaModel, dictionary, corpus, 'lda_model'
)
# Keep the candidate with the highest coherence.
best_model_index_lda = np.argmax(coherences_lda)
lda_model = lda_models[best_model_index_lda]
# Disabled: separate dump of the winning model.
# pickle.dump(lda_model, open('models/lda_model.pkl', 'wb'))
lda_model.print_topics()

In [None]:
# NMF on raw bag-of-words counts: train every candidate topic count and
# score each candidate by coherence.
coherences_nmf, nmf_models = hyperparameter_tuning(
    data, Nmf, dictionary, corpus, 'nmf_model'
)
# Keep the candidate with the highest coherence.
best_model_index_nmf = np.argmax(coherences_nmf)
nmf_model = nmf_models[best_model_index_nmf]
# Disabled: separate dump of the winning model.
# pickle.dump(nmf_model, open('models/nmf_model.pkl', 'wb'))
nmf_model.print_topics()

In [None]:
# LDA on the TF-IDF weighted corpus: train every candidate topic count and
# score each candidate by coherence.
coherences_lda_tfidf, lda_models_tfidf = hyperparameter_tuning(
    data, LdaModel, dictionary, corpus_tfidf, 'lda_model_tfidf'
)
# Keep the candidate with the highest coherence.
best_model_index_lda_tfidf = np.argmax(coherences_lda_tfidf)
lda_model_tfidf = lda_models_tfidf[best_model_index_lda_tfidf]
# Disabled: separate dump of the winning model.
# pickle.dump(lda_model_tfidf, open('models/lda_model_tfidf.pkl', 'wb'))
lda_model_tfidf.print_topics()

In [None]:
# NMF on the TF-IDF weighted corpus: train every candidate topic count and
# score each candidate by coherence.
coherences_nmf_tfidf, nmf_models_tfidf = hyperparameter_tuning(
    data, Nmf, dictionary, corpus_tfidf, 'nmf_model_tfidf'
)
# Keep the candidate with the highest coherence.
best_model_index_nmf_tfidf = np.argmax(coherences_nmf_tfidf)
nmf_model_tfidf = nmf_models_tfidf[best_model_index_nmf_tfidf]
# Disabled: separate dump of the winning model.
# pickle.dump(nmf_model_tfidf, open('models/nmf_model_tfidf.pkl', 'wb'))
nmf_model_tfidf.print_topics()

In [None]:
# Preview only the first rows instead of rendering the whole frame.
df.head()

In [None]:
from collections import Counter
from matplotlib import pyplot as plt
import matplotlib.colors as mcolors

# One subplot is drawn per topic id in range(N_TOPICS_TO_PLOT).
N_TOPICS_TO_PLOT = 2

# Corpus-wide frequency of every lemma. `data` is the 'lemmatized_tokenized'
# column loaded earlier in the notebook; the name `lemmatized_tokenized` is not
# defined at notebook scope and raised a NameError on a fresh kernel.
counter = Counter(w for w_list in data for w in w_list)

# Query each plotted topic by id with show_topic(). show_topics() may return
# only a subset of the topics, so ids 0..N-1 were not guaranteed to be present.
n_plots = min(N_TOPICS_TO_PLOT, lda_model_tfidf.num_topics)
out = [
    [word, topic_id, weight, counter[word]]
    for topic_id in range(n_plots)
    for word, weight in lda_model_tfidf.show_topic(topic_id)
]

# Stored under its own name so the article frame `df` is not overwritten
# (later cells still read df['preprocessed_hlead']).
df_topic_keywords = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])

# Plot Word Count and Weights of Topic Keywords
# squeeze=False keeps `axes` a 2-D array even when only one subplot is drawn.
fig, axes = plt.subplots(n_plots, 1, figsize=(16,10), sharey=True, dpi=160, squeeze=False)
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]
for i, ax in enumerate(axes.flatten()):
    topic_rows = df_topic_keywords.loc[df_topic_keywords.topic_id==i, :]
    colour = cols[i % len(cols)]
    ax.bar(x='word', height="word_count", data=topic_rows, color=colour, width=0.5, alpha=0.3, label='Word Count')
    # Second y-axis so counts and weights can use independent scales.
    ax_twin = ax.twinx()
    ax_twin.bar(x='word', height="importance", data=topic_rows, color=colour, width=0.2, label='Weights')
    ax.set_ylabel('Word Count', color=colour)
    ax_twin.set_ylim(0, 0.030); ax.set_ylim(0, 3500)
    ax.set_title('Topic: ' + str(i), color=colour, fontsize=16)
    ax.tick_params(axis='y', left=False)
    # Rotate the labels the bar plot already created instead of overriding
    # them with set_xticklabels(), which needs fixed tick positions.
    plt.setp(ax.get_xticklabels(), rotation=30, horizontalalignment='right')
    ax.legend(loc='upper left'); ax_twin.legend(loc='upper right')

fig.tight_layout(w_pad=2)
fig.suptitle('Word Count and Importance of Topic Keywords', fontsize=22, y=1.05)
plt.show()

In [None]:
def format_topics_sentences(ldamodel, corpus, texts):
    """Tabulate the dominant topic of every document next to its text.

    Parameters
    ----------
    ldamodel : topic model
        Must support ``ldamodel[corpus]`` (yielding, per document, a list of
        ``(topic_id, proportion)`` pairs) and ``ldamodel.show_topic(topic_id)``
        (returning ``(word, weight)`` pairs).
    corpus : corpus
        Documents to score, in the same order as ``texts``.
    texts : sequence or pandas.Series
        Original text of each document.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic``, ``Perc_Contribution`` (rounded to four
        decimals) and ``Topic_Keywords``, followed by one column holding
        ``texts``. Row ``i`` describes document ``i`` of ``corpus``.
    """
    keywords_by_topic = {}  # topic id -> "w1, w2, ..." (computed once per topic)
    records = []

    # Use the model that was passed in; the previous code read the notebook
    # global `lda_model` here and ignored the argument.
    for doc_topics in ldamodel[corpus]:
        if not doc_topics:
            # Placeholder row keeps later rows lined up with their texts.
            records.append((float('nan'), float('nan'), ''))
            continue

        # Dominant topic = highest proportion (first one wins on ties).
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        records.append((topic_num, round(prop_topic, 4), keywords_by_topic[topic_num]))

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # growing a frame row by row is quadratic.
    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    )

    # Add original text to the end of the output. The index is reset so the
    # texts are matched to documents by position, not by index label.
    contents = pd.Series(texts).reset_index(drop=True)
    return pd.concat([sent_topics_df, contents], axis=1)


# Dominant topic per article for the selected bag-of-words LDA model.
df_topic_sents_keywords = format_topics_sentences(
    ldamodel=lda_model,
    corpus=corpus,
    texts=df['preprocessed_hlead'],
)

# reset_index() turns the row index into the leading column, which becomes
# the document number once the columns are relabelled.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = [
    'Document_No',
    'Dominant_Topic',
    'Topic_Perc_Contrib',
    'Keywords',
    'Text',
]

# Preview the first ten documents.
df_dominant_topic.head(10)

In [None]:
# Documents whose dominant topic is topic 10.
df_dominant_topic.loc[df_dominant_topic['Dominant_Topic'].eq(10)]