*I applied a <span style="color:#e74c3c;"> Latent Dirichlet Allocation (LDA) </span> model for topic modelling [1].*

* I applied several <b>preprocessing</b> operations (cleaning, dropping stop words, lemmatizing).
* I chose <b>20 as the number of topics</b>. (I tried topic counts from 5 to 45, but none of them improved the Coherence Score.)
* Topic modelling visualization with the <b>pyLDAvis</b> library [2].
* An LDA model evaluation with <b>the Coherence Score</b>.


## My Other Projects
* [Manufacturing Question-Answer w/Fine-Tuning Gemma 7B (LoRA)](https://www.kaggle.com/code/banddaniel/manufacturing-question-answer-w-gemma-7b-lora)
* [News Analysis w/Tensorflow (DistilBERT)](https://www.kaggle.com/code/banddaniel/news-analysis-w-tensorflow-distilbert)
* [Complaint Analysis w/Ensemble Model (CatBoost, LR)](https://www.kaggle.com/code/banddaniel/complaint-analysis-w-ensemble-model-catboost-lr)


## References
1. https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation
2. https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0

In [1]:
import os
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import  stopwords
import string
import spacy
import pprint

from gensim.models import LdaModel, CoherenceModel
from gensim.corpora import Dictionary

import pyLDAvis.gensim
import pickle 
import pyLDAvis

# Seed passed as random_state to the row shuffle and to LdaModel.
SEED = 32

# <span style="color:#e74c3c;"> Reading </span> Data

In [2]:
# Load the articles, keep one row per distinct article text, then shuffle the
# rows with a fixed seed and renumber the index from zero.
data_raw = (
    pd.read_csv('/kaggle/input/bbc-full-text-document-classification/bbc_data.csv')
    .drop_duplicates(subset=['data'])
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
)
data_raw.head()

Unnamed: 0,data,labels
0,Wilkinson to miss Ireland match England will ...,sport
1,Ore costs hit global steel firms Shares in st...,business
2,Virgin Radio offers 3G broadcast UK broadcast...,tech
3,Sainsburys Labour election gift Science Minis...,politics
4,Celts savour Grand Slam prospect The Six Nati...,sport


# <span style="color:#e74c3c;"> Preprocessing </span>

In [3]:
# preprocessing functions

# NLTK's English stop-word list, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Only token lemmas are read from the spaCy pipeline (see `lemmatization`), so
# the dependency parser and the named-entity recogniser are switched off:
# neither feeds the lemmatizer, and skipping them shortens preprocessing.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def text_preprocessing(text):
    """Normalise one raw document into lowercase, symbol-free text.

    Steps, in order:
      1. cast to ``str`` and lowercase;
      2. remove ``[...]`` spans and ``@mention`` tokens;
      3. remove URLs (``http(s)://...``, ``www....``) and any leftover ``http``;
      4. remove ``<...>`` tags;
      5. replace every non-word character and underscore with a space;
      6. remove every token that contains a digit.

    Each removed span is replaced by a single space, so the result may contain
    runs of spaces; callers are expected to tokenise with ``str.split()``.

    Parameters
    ----------
    text : Any
        Raw document; it is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned text.
    """
    text = str(text).lower()

    text = re.sub(r'\[.*?\]', ' ', text)                # bracketed spans
    text = re.sub(r'@\w+\s*', ' ', text)                # @mentions
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)  # URLs
    text = re.sub(r'http', ' ', text)                   # stray "http" fragments
    text = re.sub(r'<.*?>+', ' ', text)                 # markup tags

    # The original pattern r'\\W' matched a literal backslash followed by "W",
    # which can never occur in lowercased text, so the intended non-word
    # cleanup never ran. `[\W_]` performs it and also covers ASCII punctuation,
    # newlines, emoji and non-ASCII symbols such as "£" or "–". It must run
    # after the URL/tag patterns above, which rely on ':', '/', '.', '<', '>'.
    text = re.sub(r'[\W_]', ' ', text)

    text = re.sub(r'\w*\d\w*', ' ', text)               # tokens containing digits
    return text

def drop_stopwords(text):
    """Return ``text`` without the whitespace-delimited tokens found in ``stop_words``.

    The remaining tokens are joined back together with single spaces.
    """
    kept_tokens = filter(lambda token: token not in stop_words, text.split())
    return ' '.join(kept_tokens)

def lemmatization(text):
    """Run ``text`` through the spaCy pipeline and return its lemmas joined by spaces."""
    return ' '.join(token.lemma_ for token in nlp(text))

def delete_one_characters(text):
    """Remove every single-character token from ``text``.

    The text is split on whitespace, tokens of length 1 are discarded, and the
    remaining tokens are joined with single spaces. Short tokens are filtered
    out rather than replaced by empty strings, so the result carries no
    leading, trailing or doubled spaces.

    Parameters
    ----------
    text : str
        Whitespace-separated tokens.

    Returns
    -------
    str
        The tokens longer than one character, space-joined ('' if none remain).
    """
    kept_tokens = [word for word in text.split() if len(word) > 1]
    return ' '.join(kept_tokens)

# Run the four cleaning stages, in order, over the raw article text and store
# the result in a new column of a copy of the raw frame.
data = data_raw.copy()
preprocessed = data['data']
for stage in (text_preprocessing, drop_stopwords, lemmatization, delete_one_characters):
    preprocessed = preprocessed.apply(stage)
data['preprocessed_data'] = preprocessed

In [4]:
# data after preprocessing functions
data.head()

Unnamed: 0,data,labels,preprocessed_data
0,Wilkinson to miss Ireland match England will ...,sport,wilkinson miss ireland match england take irel...
1,Ore costs hit global steel firms Shares in st...,business,ore cost hit global steel firm share steel fir...
2,Virgin Radio offers 3G broadcast UK broadcast...,tech,virgin radio offer broadcast uk broadcaster vi...
3,Sainsburys Labour election gift Science Minis...,politics,sainsbury labour election gift science ministe...
4,Celts savour Grand Slam prospect The Six Nati...,sport,celt savour grand slam prospect six nation her...


# <span style="color:#e74c3c;"> Creating </span> Dictionary, Corpus

In [5]:
# one whitespace-tokenised list per preprocessed article
text_corpus = data['preprocessed_data'].values
nested_document_tokens = list(map(str.split, text_corpus))

# dictionary: token <-> id mapping, pruned by document frequency
id2word = Dictionary(nested_document_tokens)
id2word.filter_extremes(no_below=5, no_above=0.5)

# corpus: bag-of-words representation of every document
corpus = list(map(id2word.doc2bow, nested_document_tokens))

# <span style="color:#e74c3c;"> Latent Dirichlet Allocation </span> Model

In [6]:
# training
NUM_TOPICS = 20

lda_model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=NUM_TOPICS,
    passes=10,
    iterations=500,
    random_state=SEED,
)

In [7]:
# show the first three entries of the topic listing
first_topics = lda_model.print_topics()[:3]
pprint.pprint(first_topics)

[(0,
  '0.028*"music" + 0.014*"release" + 0.014*"song" + 0.014*"use" + '
  '0.014*"record" + 0.014*"apple" + 0.013*"mail" + 0.013*"information" + '
  '0.011*"album" + 0.011*"search"'),
 (1,
  '0.016*"mr" + 0.015*"government" + 0.011*"bill" + 0.011*"plan" + '
  '0.009*"blair" + 0.007*"go" + 0.007*"tory" + 0.007*"minister" + '
  '0.006*"patent" + 0.006*"school"'),
 (2,
  '0.027*"people" + 0.016*"broadband" + 0.013*"one" + 0.012*"report" + '
  '0.011*"uk" + 0.011*"online" + 0.010*"become" + 0.010*"net" + 0.010*"family" '
  '+ 0.010*"many"')]


# <span style="color:#e74c3c;"> Topic Modelling </span> Visualization

In [8]:
pyLDAvis.enable_notebook()
LDAvis_data_filepath = '/kaggle/working/ldavis.html'

# Build the visualisation data directly from the trained model. The earlier
# version pickled this object into the .html path under an always-true
# `if 1 == 1:` guard and immediately read it back, only for save_html to
# overwrite that same file; the round trip added nothing and is removed.
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)

# Write the standalone HTML page, then show the visualisation inline as the cell output.
pyLDAvis.save_html(LDAvis_prepared, LDAvis_data_filepath)
LDAvis_prepared

# <span style="color:#e74c3c;"> LDA </span> Evaluation

In [9]:
# score the trained model with the 'c_v' coherence measure over the tokenised documents
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=nested_document_tokens,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()

print("Coherence Score \t:{0:.7f}".format(coherence_lda))

Coherence Score 	:0.4362164
