In [45]:
import pandas as pd
import numpy as np
import scipy.sparse
import nltk
import warnings
import re
import scipy.sparse

warnings.filterwarnings('ignore')

from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import matutils, models
from nltk.corpus import stopwords
from sklearn.decomposition import TruncatedSVD
from gensim import matutils, models
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
from nltk.tokenize import word_tokenize

from utils.system import *

In [None]:
data = pd.read_parquet(get_data() / 'clean_data.parquet.brotli')

In [48]:
nltk.download('punkt')
tokenized_texts = [word_tokenize(doc.lower()) for doc in data['cleaned_article']]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\weigfan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [50]:
id2word = Dictionary(tokenized_texts)
corpus = [id2word.doc2bow(text) for text in tokenized_texts]

In [61]:
# LDA model
num_topic = 10
lda_model = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topic, passes=50)
lda_model.print_topics()

[(0,
  '0.106*"i" + 0.062*"." + 0.035*"to" + 0.024*"n\'t" + 0.022*"and" + 0.021*"do" + 0.020*"it" + 0.020*"me" + 0.019*"\'m" + 0.016*"just"'),
 (1,
  '0.123*"*" + 0.112*"!" + 0.067*"?" + 0.027*";" + 0.021*"you" + 0.021*"#" + 0.018*"x200b" + 0.017*":" + 0.017*")" + 0.015*"]"'),
 (2,
  '0.044*"." + 0.040*"i" + 0.035*"and" + 0.031*"was" + 0.030*"she" + 0.027*"me" + 0.025*"her" + 0.023*"," + 0.023*"to" + 0.019*"a"'),
 (3,
  '0.065*"i" + 0.049*"." + 0.032*"and" + 0.030*"my" + 0.028*"," + 0.026*"to" + 0.023*"a" + 0.022*"the" + 0.018*"of" + 0.016*"in"'),
 (4,
  '0.028*"youtube" + 0.024*"17" + 0.013*"meet" + 0.012*"live" + 0.011*"politics" + 0.011*"circuitry" + 0.010*"mostly" + 0.010*"anyways" + 0.009*"enjoy" + 0.008*"prefer"'),
 (5,
  '0.090*"," + 0.028*"i" + 0.026*"a" + 0.023*"." + 0.021*"of" + 0.020*"to" + 0.018*"and" + 0.013*")" + 0.013*"(" + 0.011*"in"'),
 (6,
  '0.061*"," + 0.040*"you" + 0.039*"." + 0.036*"the" + 0.027*"to" + 0.022*"and" + 0.022*"of" + 0.021*"a" + 0.021*"that" + 0.016*"i

In [62]:
coherence_model = CoherenceModel(model=lda_model, texts=tokenized_texts, dictionary=id2word, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print(f'Coherence Score: {coherence_score}')

Coherence Score: 0.3887702534700664


In [65]:
# Get topic distribution for each document
topic_distributions = [lda_model.get_document_topics(bow) for bow in corpus]

doc_topics = []
for distribution in topic_distributions:
    topics = [0] * num_topic
    for topic, weight in distribution:
        topics[topic] = weight
    
    doc_topics.append(topics)

# Convert to DataFrame
topics_df = pd.DataFrame(doc_topics, columns=[f'Topic_{i}' for i in range(num_topic)])
topics_df.index = data.index

In [66]:
topics_df.to_parquet(get_data() / 'lda_topic.parquet.brotli', compression='brotli')