In [None]:
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import regex as re
import utilities.helpers as hp

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation

from tmtoolkit.topicmod.evaluate import metric_coherence_gensim

import pyLDAvis.sklearn

plt.style.use(style='seaborn')
%matplotlib inline
sns.set()


# ALL cases topic modelling: 

# Import data set:

In [None]:
# Disable pandas' chained-assignment warning for the whole session.
# NOTE(review): this hides SettingWithCopy problems in every later cell.
pd.options.mode.chained_assignment = None

# Load the full dataset through the project helper, requesting UTF-8 decoding.
df_full = hp.import_dataset(encoding='utf-8')
# Show the dimensions of the loaded dataset.
df_full.shape


# Dataset preprocessing:

All cases except for Unknown Homeland (Ukendt hjemland) are kept.

Below we see the shape and head of the dataset.

In [None]:
# Keep every case whose country is not "Ukendt hjemland" (unknown homeland)
# and drop the redundant 'hasText' column. Building a new frame instead of
# mutating a slice in place keeps this cell safe to re-run and does not rely
# on the chained-assignment warning being silenced.
df_UH = (
    df_full
    .loc[df_full.country != "Ukendt hjemland"]
    .drop(columns='hasText')
)

# Keep only the 'text' column, cast to str, as an independent frame.
# NOTE(review): astype(str) turns missing values into the literal string
# "nan" -- confirm that no rows with missing text reach this point.
df = df_UH[['text']].astype({'text': str})

df.shape

In [None]:

# Preview the first rows of the text-only frame.
df.head()

Check for potential duplicates and drop them:

In [None]:
# Mark every row whose 'text' occurs more than once; keep=False flags all
# occurrences, not only the repeats.
is_repeated_text = df.duplicated(subset=['text'], keep=False)
duplicates_df = df.loc[is_repeated_text]

# Display all duplicated rows:
duplicates_df

New shape of the dataset after dropping duplicate entries:

In [None]:
# Drop duplicates from original dataframe (the first occurrence of each
# duplicated row is the one that is kept, as per the pandas default):
df = df.drop_duplicates()

# Check shape again:
df.shape

# Use a pipeline to pre-process the texts:

The pipeline consists of the following steps:

- Lowercase all characters
- Drop numbers
- Remove punctuation
- Remove stopwords. The list of stopwords can be found [here](https://github.com/jethronap/AsylumData_KU/blob/main/misc/stopwords_dk.txt).
- Drop single-letter words
- Tokenization
- Lemmatization: the process of reducing each word to its root form (lemma).

Steps can be added or removed from the pipeline.

In [None]:
# Ordered pre-processing steps; each callable is applied in sequence by
# hp.process. Add or remove entries here to change the pipeline.
pipeline = [
    str.lower,
    hp.drop_numbers,
    hp.remove_punctuation,
    hp.remove_stopwords,
    hp.drop_single_letter_words,
    hp.tokenize,
    hp.lemmatize,
]


In [None]:
# Run every document through the pre-processing pipeline:
df['tokens'] = df['text'].apply(lambda raw_text: hp.process(raw_text, pipeline=pipeline))

# Record how many tokens each document ended up with:
df['num_tokens'] = df['tokens'].apply(len)

In [None]:
# Preview the frame with the new 'tokens' and 'num_tokens' columns.
df.head()

# Analysis:

In [None]:
# Re-assemble each token list into one space-separated string, the input
# format expected by the scikit-learn vectorizers.
df['tokens_joined'] = df['tokens'].map(' '.join)

In [None]:
# Both vectorizers share the same document-frequency bounds.
doc_freq_bounds = {'min_df': 2, 'max_df': 0.7}
joined_docs = df['tokens_joined']

# Raw term counts (tf) feed the LDA model.
count_text_vectorizer = CountVectorizer(**doc_freq_bounds)
count_text_vectors = count_text_vectorizer.fit_transform(joined_docs)

# tf-idf weights feed the NMF and SVD models.
tfidf_text_vectorizer = TfidfVectorizer(**doc_freq_bounds)
tfidf_text_vectors = tfidf_text_vectorizer.fit_transform(joined_docs)

## Topic Modelling:

### Latent Dirichlet Allocation (LDA):

In [None]:
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Terms ordered by feature index, i.e. aligned with the columns of
# count_text_vectors and of the model's components_. vocabulary_.keys() is in
# first-seen order instead, which would pair topic weights with the wrong words.
vocab_LDA = count_text_vectorizer.get_feature_names_out()

# Tokenised documents as a plain list for the coherence computation.
texts_LDA = df['tokens'].tolist()

best_num_LDA = None
best_score_LDA = float('-inf')

# Compute the mean coherence score for each number of topics
for i in range(2, 11):

    # Create LDA model with i topics
    LDA_text_model = LatentDirichletAllocation(n_components=i, random_state=42)
    W_LDA_text_matrix = LDA_text_model.fit_transform(count_text_vectors)
    H_LDA_text_matrix = LDA_text_model.components_

    # Obtain the per-topic c_v coherence scores. `dtm` is intentionally not
    # passed: the document-topic matrix is not a document-term matrix, and
    # c_v is computed from `texts`.
    topic_coherences_LDA = metric_coherence_gensim(measure='c_v',
                        top_n=25,
                        topic_word_distrib=H_LDA_text_matrix,
                        vocab=vocab_LDA,
                        texts=texts_LDA)

    # Rank models by their mean (unrounded) coherence, so that one unusually
    # coherent topic cannot decide the number of topics on its own.
    coherence_score_LDA = float(np.mean(topic_coherences_LDA))
    if coherence_score_LDA > best_score_LDA:
        best_num_LDA = i
        best_score_LDA = coherence_score_LDA

# Fail here, with a clear message, rather than in the cell that builds the
# final model.
if best_num_LDA is None:
    raise RuntimeError('No valid coherence score could be computed for LDA.')

print(f'The coherence score for LDA ({np.around(best_score_LDA, 2)}) is highest with {best_num_LDA} topics.')

Build the model with the best number of topics and see them:

In [None]:
# Refit LDA on the count matrix with the selected number of topics.
LDA_model = LatentDirichletAllocation(n_components=best_num_LDA, random_state=42)
# Output of fit_transform (document-topic matrix).
W_LDA_model_matrix = LDA_model.fit_transform(count_text_vectors)
# Fitted components_ (topic-term matrix).
H_LDA_model_matrix = LDA_model.components_

In [None]:
# Show the LDA topics via the project helper, labelled with the count
# vectorizer's feature names (the vectorizer this model was fitted with).
hp.display_topics(LDA_model, count_text_vectorizer.get_feature_names_out())

Visualise the topics produced by LDA:

In [None]:
# Silence Future/Deprecation warnings raised while preparing the display.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Prepare the pyLDAvis data from the fitted LDA model, the count matrix and
# its vectorizer. sort_topics=False presumably keeps the model's own topic
# order -- confirm against the pyLDAvis documentation.
LDA_display = pyLDAvis.sklearn.prepare(LDA_model, count_text_vectors, count_text_vectorizer, sort_topics=False)

# Render the interactive visualisation as the cell output.
pyLDAvis.display(LDA_display)

In [None]:
# Export the LDA visualisation to 'LDA_ALL.html' (path relative to the
# working directory).
pyLDAvis.save_html(LDA_display, 'LDA_ALL.html')

Each bubble in the plot represents a topic. The size of the bubble represents the proportion of cases that contain the topic, with a larger bubble corresponding to a higher proportion. 

The distance between the bubbles represents the similarity between the topics; the shorter the distance, the more similar the topics.

The bars in the bar chart represent the term frequency for each of the words. The blue bars show the overall term frequency in the collection of documents, whereas the red bars show the term frequency for the selected topic.

### Non-Negative Matrix Factorization:

In [None]:
warnings.filterwarnings("ignore", category=FutureWarning)

# Terms ordered by feature index, i.e. aligned with the columns of
# tfidf_text_vectors and of the model's components_. vocabulary_.keys() is in
# first-seen order instead, which would pair topic weights with the wrong words.
vocab_NMF = tfidf_text_vectorizer.get_feature_names_out()

# Tokenised documents as a plain list for the coherence computation.
texts_NMF = df['tokens'].tolist()

best_num_NMF = None
best_score_NMF = float('-inf')

# Compute the mean coherence score for each number of topics
for i in range(2, 11):

    # Create NMF model with i topics
    NMF_text_model = NMF(n_components=i, random_state=42, max_iter=2000)
    W_NMF_text_matrix = NMF_text_model.fit_transform(tfidf_text_vectors)
    H_NMF_text_matrix = NMF_text_model.components_

    # Obtain the per-topic c_v coherence scores. `dtm` is intentionally not
    # passed: the document-topic matrix is not a document-term matrix, and
    # c_v is computed from `texts`.
    topic_coherences_NMF = metric_coherence_gensim(measure='c_v',
                        top_n=25,
                        topic_word_distrib=H_NMF_text_matrix,
                        vocab=vocab_NMF,
                        texts=texts_NMF)

    # Rank models by their mean (unrounded) coherence, so that one unusually
    # coherent topic cannot decide the number of topics on its own.
    coherence_score_NMF = float(np.mean(topic_coherences_NMF))
    if coherence_score_NMF > best_score_NMF:
        best_num_NMF = i
        best_score_NMF = coherence_score_NMF

# Fail here, with a clear message, rather than in the cell that builds the
# final model.
if best_num_NMF is None:
    raise RuntimeError('No valid coherence score could be computed for NMF.')

print(f'The coherence score for NMF ({np.around(best_score_NMF, 2)}) is highest with {best_num_NMF} topics.')

Build the model with the best number of topics and see them:

In [None]:
# Refit NMF on the tf-idf matrix with the selected number of topics;
# max_iter is raised to 2000 to give the solver more iterations to converge.
NMF_model = NMF(n_components=best_num_NMF, random_state=42, max_iter=2000)
# Output of fit_transform (document-topic matrix).
W_NMF_model_matrix = NMF_model.fit_transform(tfidf_text_vectors)
# Fitted components_ (topic-term matrix).
H_NMF_model_matrix = NMF_model.components_

The numbers inside the parentheses are the percentages with which the words contribute to the topics.

In [None]:
# Show the NMF topics via the project helper. The model was fitted on the
# tf-idf matrix, so its columns must be labelled with the tf-idf vectorizer's
# feature names rather than the count vectorizer's.
hp.display_topics(NMF_model, tfidf_text_vectorizer.get_feature_names_out())

Visualise the topics produced by NMF:

In [None]:
# Silence Future/Deprecation warnings raised while preparing the display.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)


# Prepare the pyLDAvis data from the fitted NMF model, the tf-idf matrix and
# its vectorizer.
# NOTE(review): pyLDAvis.sklearn.prepare is given an NMF model here rather
# than an LDA model -- confirm the resulting plot is meaningful.
NMF_display = pyLDAvis.sklearn.prepare(NMF_model, tfidf_text_vectors, tfidf_text_vectorizer, sort_topics=False)

# Render the interactive visualisation as the cell output.
pyLDAvis.display(NMF_display)

In [None]:
# Export the NMF visualisation to 'NMF_ALL.html' (path relative to the
# working directory).
pyLDAvis.save_html(NMF_display, 'NMF_ALL.html')

### Latent Semantic Analysis/Indexing:

In [None]:
warnings.filterwarnings("ignore", category=FutureWarning)

# Terms ordered by feature index, i.e. aligned with the columns of
# tfidf_text_vectors and of the model's components_. vocabulary_.keys() is in
# first-seen order instead, which would pair topic weights with the wrong words.
vocab_SVD = tfidf_text_vectorizer.get_feature_names_out()

# Tokenised documents as a plain list for the coherence computation.
texts_SVD = df['tokens'].tolist()

best_num_SVD = None
best_score_SVD = float('-inf')

# Compute the mean coherence score for each number of topics
for i in range(2, 11):

    # Create SVD model with i topics
    SVD_text_model = TruncatedSVD(n_components=i, random_state=42)
    W_SVD_text_matrix = SVD_text_model.fit_transform(tfidf_text_vectors)
    H_SVD_text_matrix = SVD_text_model.components_

    # Obtain the per-topic c_v coherence scores. `dtm` is intentionally not
    # passed: the document-topic matrix is not a document-term matrix, and
    # c_v is computed from `texts`.
    topic_coherences_SVD = metric_coherence_gensim(measure='c_v',
                        top_n=25,
                        topic_word_distrib=H_SVD_text_matrix,
                        vocab=vocab_SVD,
                        texts=texts_SVD)

    # Rank models by their mean (unrounded) coherence, so that one unusually
    # coherent topic cannot decide the number of topics on its own.
    coherence_score_SVD = float(np.mean(topic_coherences_SVD))
    if coherence_score_SVD > best_score_SVD:
        best_num_SVD = i
        best_score_SVD = coherence_score_SVD

# Fail here, with a clear message, rather than in the cell that builds the
# final model.
if best_num_SVD is None:
    raise RuntimeError('No valid coherence score could be computed for SVD.')

print(f'The coherence score for SVD ({np.around(best_score_SVD, 2)}) is highest with {best_num_SVD} topics.')

Build the model with the best number of topics and see them:

In [None]:
# Refit truncated SVD on the tf-idf matrix with the selected number of
# components.
SVD_model = TruncatedSVD(n_components=best_num_SVD, random_state=42)
# Output of fit_transform (documents projected onto the components).
W_SVD_model_matrix = SVD_model.fit_transform(tfidf_text_vectors)
# Fitted components_ (component-term matrix).
H_SVD_model_matrix = SVD_model.components_

In [None]:
# Show the SVD topics via the project helper, labelled with the tf-idf
# vectorizer's feature names (the vectorizer this model was fitted with).
hp.display_topics(SVD_model, tfidf_text_vectorizer.get_feature_names_out())

In [None]:
# SVD_display = pyLDAvis.sklearn.prepare(SVD_model, tfidf_text_vectors, tfidf_text_vectorizer, sort_topics=False)

# pyLDAvis.display(SVD_display)

# Visualizations:

## Wordclouds:

## WordClouds from the LDA model:

In [None]:
# Draw word clouds for the LDA model via the project helper, using the count
# vectorizer's feature names as the term labels.
hp.wordcloud_topics(LDA_model, count_text_vectorizer.get_feature_names_out())

## WordClouds from the NMF model:

In [None]:
# Draw word clouds for the NMF model via the project helper, using the tf-idf
# vectorizer's feature names as the term labels.
hp.wordcloud_topics(NMF_model, tfidf_text_vectorizer.get_feature_names_out())

## WordClouds from the SVD model:

In [None]:
# Draw word clouds for the SVD model via the project helper, using the tf-idf
# vectorizer's feature names as the term labels.
hp.wordcloud_topics(SVD_model, tfidf_text_vectorizer.get_feature_names_out())