In [None]:
import pandas as pd

from transformers import AutoModelForMaskedLM, AutoTokenizer
from transformers import RobertaTokenizer
from bertopic import BERTopic
from umap import UMAP

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
import numpy as np
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import strip_tags
from gensim.models import KeyedVectors
import pickle

In [None]:
def run_bertopic(df_col, umap_model, embedding_model=None, dv=None):
    """Fit a BERTopic model on the documents in `df_col`.

    Parameters
    ----------
    df_col : object exposing ``tolist()``
        The documents to model (converted to a plain list before fitting).
    umap_model :
        Dimensionality-reduction model handed to BERTopic.
    embedding_model : optional
        Embedding model handed to BERTopic. Only used when `dv` is None.
    dv : optional
        Precomputed document embeddings. When given, they are passed to
        ``fit_transform`` as its second positional argument and
        `embedding_model` is not forwarded to BERTopic.

    Returns
    -------
    tuple
        ``(topic_model, topics, probs)`` as produced by ``fit_transform``.
    """
    documents = df_col.tolist()

    if dv is not None:
        # Precomputed-embedding path: BERTopic is built without an embedding model.
        fitted = BERTopic(umap_model=umap_model)
        assignments, probabilities = fitted.fit_transform(documents, dv)
        return fitted, assignments, probabilities

    fitted = BERTopic(umap_model=umap_model, embedding_model=embedding_model)
    assignments, probabilities = fitted.fit_transform(documents)
    return fitted, assignments, probabilities

In [None]:
def t2v_d2v(df_col):
    """Train a doc2vec model on `df_col`, save it to disk and return it.

    Each document has its tags stripped, is tokenised with
    ``simple_preprocess`` (``deacc=True``) and is tagged with its position in
    `df_col`. The trained model is written to ``embedding_models/d2v.model``.

    Parameters
    ----------
    df_col : iterable of str
        The raw documents.

    Returns
    -------
    The trained ``Doc2Vec`` model.
    """
    tagged_docs = []
    for position, text in enumerate(df_col):
        words = simple_preprocess(strip_tags(text), deacc=True)
        tagged_docs.append(TaggedDocument(words, [position]))

    # Training configuration; hierarchical softmax (hs=1) is used with
    # negative sampling switched off (negative=0).
    d2v = Doc2Vec(
        documents=tagged_docs,
        vector_size=300,
        min_count=1,
        window=15,
        sample=1e-5,
        negative=0,
        hs=1,
        epochs=40,
        dm=0,
        dbow_words=1,
    )

    d2v.save("embedding_models/d2v.model")

    print("Model Saved")
    return d2v

def get_doc_vecs(d2v_model):
    """Extract the document vectors from a doc2vec model and save them.

    The vectors object is written to ``embedding_models/d2v.docvectors``.

    Parameters
    ----------
    d2v_model :
        A trained doc2vec model exposing its document vectors as ``dv``
        (gensim 4.x name) or ``docvecs`` (gensim 3.x name).

    Returns
    -------
    The document-vectors object taken from the model.
    """
    # Prefer the current attribute name; fall back to the 3.x name so models
    # from either gensim generation work without deprecation warnings.
    dv = d2v_model.dv if hasattr(d2v_model, "dv") else d2v_model.docvecs
    dv.save("embedding_models/d2v.docvectors")
    return dv

In [None]:
def coherence_file(coherences, file_name):
    """Write one ``<index>: <coherence>`` line per score to a text file.

    The file is ``evaluation/<file_name>.txt``; the target directory is
    created if it does not exist yet.

    Parameters
    ----------
    coherences : iterable
        Coherence scores, written in iteration order with a 0-based index.
    file_name : str
        File name without directory or ``.txt`` extension.
    """
    import os

    path = 'evaluation/' + file_name + '.txt'
    os.makedirs(os.path.dirname(path), exist_ok=True)
    # The context manager closes the file even if a write fails.
    with open(path, 'w') as textfile:
        for i, coherence in enumerate(coherences):
            textfile.write(str(i) + ': ' + str(coherence) + "\n")
    print('Saved to ' + path)

In [None]:
def prep_topic_words(topic_model, docs, topics):
    """Build the inputs needed to score a fitted topic model's coherence.

    Documents are concatenated per topic (joined with a space), passed through
    the model's ``_preprocess_text`` and tokenised with the analyzer of the
    model's ``vectorizer_model``.

    Parameters
    ----------
    topic_model :
        Fitted topic model exposing ``_preprocess_text``, ``vectorizer_model``
        and ``get_topic``.
    docs : sequence of str
        The documents the model was fitted on.
    topics : sequence of int
        Topic id per document, aligned with `docs`; ``-1`` is treated as the
        outlier topic.

    Returns
    -------
    tuple
        ``(topic_words, tokens, corpus, dictionary)``: the words of every
        non-outlier topic, the token list of every per-topic document
        (outliers included), the bag-of-words corpus, and the gensim
        dictionary built from the tokens.
    """
    documents = pd.DataFrame({"Document": docs,
                              "ID": range(len(docs)),
                              "Topic": topics})
    documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
    cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

    # Tokenise with the same analyzer the topic model's vectorizer uses.
    analyzer = topic_model.vectorizer_model.build_analyzer()
    tokens = [analyzer(doc) for doc in cleaned_docs]

    # Extract features for Topic Coherence evaluation
    dictionary = Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]

    # Enumerate the real topic ids instead of assuming that an outlier topic
    # (-1) is always present; otherwise the last topic is lost whenever every
    # document was assigned to a topic.
    topic_ids = sorted(topic for topic in set(topics) if topic != -1)
    topic_words = [[word for word, _ in topic_model.get_topic(topic)]
                   for topic in topic_ids]

    return topic_words, tokens, corpus, dictionary

def evaluation(topic_words, data, dictionary, corpus):
    """Return the ``c_v`` coherence of `topic_words`.

    Parameters
    ----------
    topic_words : list of list of str
        Words describing each topic.
    data : list of list of str
        Tokenised texts the coherence is measured against.
    dictionary :
        Dictionary built from `data`.
    corpus :
        Accepted but not used; only the disabled perplexity computation
        would need it.

    Returns
    -------
    The value of ``CoherenceModel.get_coherence()``.
    """
    return CoherenceModel(
        topics=topic_words,
        texts=data,
        dictionary=dictionary,
        coherence='c_v',
    ).get_coherence()

def hyperparameter_tuning(df, name, embedding_model=None, dv=None, umap_models=None):
    """Fit and score one BERTopic model per UMAP configuration.

    For every UMAP model a BERTopic model is fitted on
    ``df['preprocessed_hlead']``, its coherence is computed, and the
    ``[model, topics, probs]`` triple is pickled to
    ``models/<name>_<index>.pkl`` (the directory is created when missing).

    Parameters
    ----------
    df :
        Data containing a ``'preprocessed_hlead'`` column of documents.
    name : str
        Prefix of the pickle file names.
    embedding_model : optional
        Forwarded to ``run_bertopic``.
    dv : optional
        Precomputed document embeddings, forwarded to ``run_bertopic``.
    umap_models : iterable, optional
        UMAP models to evaluate. Defaults to the built-in grid:
        ``n_neighbors`` in (5, 10, 15) for ``n_components`` 5, then 10.

    Returns
    -------
    tuple
        ``(coherences, models)``: the coherence per configuration and the
        matching ``[model, topics, probs]`` lists, in evaluation order.
    """
    import os

    if umap_models is None:
        # Change depending on models one wants to check
        umap_models = [UMAP(n_neighbors=n_neighbors, n_components=n_components,
                            metric='cosine', low_memory=False, random_state=42)
                       for n_components in (5, 10)
                       for n_neighbors in (5, 10, 15)]

    docs = df['preprocessed_hlead']
    coherences = []
    models = []
    for num, umap_model in enumerate(umap_models):
        model, topics, probs = run_bertopic(docs, umap_model=umap_model,
                                            embedding_model=embedding_model, dv=dv)
        bertopic_model = [model, topics, probs]
        models.append(bertopic_model)

        topic_words, tokens, corpus, dictionary = prep_topic_words(model, docs, topics)
        coherences.append(evaluation(topic_words, tokens, dictionary, corpus))
        print("Finished evaluating model " + str(num))

        path = 'models/' + name + '_' + str(num) + '.pkl'
        os.makedirs(os.path.dirname(path), exist_ok=True)
        # Close the file deterministically instead of leaking the handle.
        with open(path, 'wb') as model_file:
            pickle.dump(bertopic_model, model_file)
    return coherences, models

In [None]:
df = pd.read_csv('data/complete-clean-preprocessed-data-2010-2020.tsv', sep='\t')

# Running these two calls (re)trains doc2vec and writes
# embedding_models/d2v.docvectors, which is loaded just below.
# d2v_model = t2v_d2v(df['preprocessed_hlead'])
# dv = get_doc_vecs(d2v_model)

# doc2vec model
# gensim 3.x exposes the document-vector matrix as `vectors_docs`; gensim 4.x
# renamed it to `vectors`, so fall back to that when the old name is absent.
doc_keyed_vectors = KeyedVectors.load("embedding_models/d2v.docvectors")
doc_vectors = getattr(doc_keyed_vectors, "vectors_docs", None)
if doc_vectors is None:
    doc_vectors = doc_keyed_vectors.vectors

# NOTE(review): the three models below are bare masked-LM objects that are
# later passed to BERTopic as `embedding_model` - confirm the installed
# BERTopic version accepts this object type as an embedding backend.
# fine-tuned BERT model 256 tokens
trained_model_256 = AutoModelForMaskedLM.from_pretrained("embedding_models/robbert-v2-dutch-base-finetuned-model/checkpoint-5000")
# fine-tuned BERT model 128 tokens
trained_model_128 = AutoModelForMaskedLM.from_pretrained("embedding_models/robbert-v2-dutch-base-finetuned-model/checkpoint-10000")
# vanilla BERT model
vanilla_model = AutoModelForMaskedLM.from_pretrained("pdelobelle/robbert-v2-dutch-base")

In [None]:
# Tune UMAP with the precomputed doc2vec vectors as embeddings, store the
# coherence per configuration, and keep the [model, topics, probs] triple of
# the best-scoring run.
dv_coherences, dv_models = hyperparameter_tuning(df, name='dv', dv=doc_vectors)
coherence_file(coherences=dv_coherences, file_name='dv_models_coherences')
dv_best_index = np.argmax(dv_coherences)
best_dv = dv_models[dv_best_index]

In [None]:
# Tune UMAP with the model loaded as `trained_model_128`, store the coherence
# per configuration, and keep the [model, topics, probs] triple of the
# best-scoring run.
trained_model_128_coherences, trained_model_128_models = hyperparameter_tuning(
    df, name='trained_model_128', embedding_model=trained_model_128)
coherence_file(coherences=trained_model_128_coherences, file_name='128_models_coherences')
trained_model_128_best_index = np.argmax(trained_model_128_coherences)
best_trained_model_128 = trained_model_128_models[trained_model_128_best_index]

In [None]:
# Tune UMAP with the model loaded as `trained_model_256`, store the coherence
# per configuration, and keep the [model, topics, probs] triple of the
# best-scoring run.
trained_model_256_coherences, trained_model_256_models = hyperparameter_tuning(
    df, name='trained_model_256', embedding_model=trained_model_256)
coherence_file(coherences=trained_model_256_coherences, file_name='256_models_coherences')
trained_model_256_best_index = np.argmax(trained_model_256_coherences)
best_trained_model_256 = trained_model_256_models[trained_model_256_best_index]

In [None]:
# Tune UMAP with the model loaded as `vanilla_model`, store the coherence per
# configuration, and keep the [model, topics, probs] triple of the
# best-scoring run.
vanilla_coherences, vanilla_models = hyperparameter_tuning(
    df, name='vanilla', embedding_model=vanilla_model)
coherence_file(coherences=vanilla_coherences, file_name='vanilla_coherences')
vanilla_best_index = np.argmax(vanilla_coherences)
best_vanilla = vanilla_models[vanilla_best_index]

In [None]:
# Unpack the best vanilla run into model / topic assignments / probabilities.
topic_model_vanilla, topics_vanilla, probs_vanilla = best_vanilla

# Only the last expression of a cell is rendered automatically, so the topic
# table is displayed explicitly; otherwise its output is silently dropped.
display(topic_model_vanilla.get_topic_info())
topic_model_vanilla.visualize_topics()

In [None]:
# Unpack the best `trained_model_256` run into model / topic assignments / probabilities.
topic_model_256, topics_256, probs_256 = best_trained_model_256

# Only the last expression of a cell is rendered automatically, so the topic
# table is displayed explicitly; otherwise its output is silently dropped.
display(topic_model_256.get_topic_info())
topic_model_256.visualize_topics()

In [None]:
# Unpack the best `trained_model_128` run into model / topic assignments / probabilities.
topic_model_128, topics_128, probs_128 = best_trained_model_128

# Only the last expression of a cell is rendered automatically, so the topic
# table is displayed explicitly; otherwise its output is silently dropped.
display(topic_model_128.get_topic_info())
topic_model_128.visualize_topics()

In [None]:
# Unpack the best doc2vec run into model / topic assignments / probabilities.
topic_model_dv, topics_dv, probs_dv = best_dv

# Only the last expression of a cell is rendered automatically, so the topic
# table is displayed explicitly; otherwise its output is silently dropped.
display(topic_model_dv.get_topic_info())
topic_model_dv.visualize_topics()

In [None]:
# Build the bar-chart figure for the best vanilla model and write it to
# 'test.html' in the current working directory.
fig = topic_model_vanilla.visualize_barchart()
fig.write_html('test.html')

In [None]:
?topic_model

In [None]:
# pandas display settings: row limit, column limit and output width.
_display_options = {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 1000,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

In [None]:
# Print the word/score pairs of topics 0-9.
# The original referenced `topic_model`, which is never assigned at notebook
# level (NameError on a fresh kernel); the vanilla model is used here. Swap in
# topic_model_256 / topic_model_128 / topic_model_dv to inspect another run.
for i in range(10):
    print('Topic ' + str(i) + ':')
    print(topic_model_vanilla.get_topic(i))
    print(" ")