In [1]:
%load_ext autoreload
%autoreload 2

import json
import numpy as np
import pandas as pd

from Latent_Dirichlet_Allocation import normalization_string, create_stop_word
from Latent_Dirichlet_Allocation import LDA_In_EM, LDA_In_VB, LDA_In_CGS

%matplotlib inline

In [None]:
def _read_tweet_csv(csv_path):
    """Read one tweet CSV and strip a single leading '[' and trailing ']' from each `text` value."""
    frame = pd.read_csv(csv_path)
    frame['text'] = frame['text'].str.removeprefix('[').str.removesuffix(']')
    return frame


# Load both exports; the per-file frames stay available under their own names.
df_tweet1 = _read_tweet_csv('csv_data/20250127_tweet_data.csv')
df_tweet2 = _read_tweet_csv('csv_data/20250304_tweet_data.csv')

# Stack the two files, keep only the `text` column, run the project's
# normalization over it, then drop repeated texts (first occurrence wins).
df_tweet = (
    normalization_string(
        pd.concat([df_tweet1, df_tweet2]).reset_index(drop=True)[['text']],
        'text',
    )
    .drop_duplicates(subset='text', keep='first', ignore_index=True)
)

# Persist the cleaned corpus, then show it as the cell output.
df_tweet.to_parquet('./parquet_data/20250227_tweet_normalized.parquet')
df_tweet

In [None]:
# Load the hand-maintained stop-word lists; each file stores them under the
# 'stopwords' key.
# The encoding is given explicitly because open() otherwise falls back to the
# platform locale encoding, which breaks on the Japanese list wherever the
# locale is not UTF-8 (e.g. Windows).
with open('json_data/custom_stopwords_en.json', 'r', encoding='utf-8') as f:
    stop_word_en = json.load(f)['stopwords']

with open('json_data/custom_stopwords_ja.json', 'r', encoding='utf-8') as f:
    stop_word_ja = json.load(f)['stopwords']

def _char_span(first, last):
    """Return every character from `first` through `last` (inclusive) in code-point order."""
    return list(map(chr, range(ord(first), ord(last) + 1)))


# Numeric tokens: plain 0..9999, plus zero-padded 2-, 3- and 4-digit forms.
stop_word_digit1 = list(map(str, range(10000)))
stop_word_digit2 = [f'{num:02d}' for num in range(100)]
stop_word_digit3 = [f'{num:03d}' for num in range(1000)]
stop_word_digit4 = [f'{num:04d}' for num in range(10000)]

# Single-character tokens, one list per script / case.
stop_word_alpha = _char_span('a', 'z')
stop_word_ALPHA = _char_span('A', 'Z')
stop_word_hira = _char_span('あ', 'ん')
stop_word_kata = _char_span('ァ', 'ン')
stop_word_kanji = ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十']
stop_word_greece = _char_span('α', 'ω')
stop_word_GREECE = _char_span('Α', 'Ω')
stop_word_cyril = _char_span('а', 'я')
stop_word_CYRIL = _char_span('А', 'Я')
stop_word_symbol = ['・', '゚', '!', '。', "'", '_', '%']

# Concatenate everything, in this fixed order, into one flat list.
stop_word_custom = [
    *stop_word_digit1,
    *stop_word_digit2,
    *stop_word_digit3,
    *stop_word_digit4,
    *stop_word_alpha,
    *stop_word_ALPHA,
    *stop_word_hira,
    *stop_word_kata,
    *stop_word_kanji,
    *stop_word_greece,
    *stop_word_GREECE,
    *stop_word_cyril,
    *stop_word_CYRIL,
    *stop_word_symbol,
]

# Build the final stop-word collection from the English and Japanese JSON lists
# plus the generated single-token list. To leave out the generated tokens, drop
# `stop_word_custom` from the sum below.
# NOTE(review): the trailing 5 is passed positionally; presumably a frequency
# threshold, but its meaning is defined in create_stop_word — confirm there.
stop_word = create_stop_word(
    df_tweet,
    'text',
    stop_word_en + stop_word_ja + stop_word_custom,
    5,
)


In [None]:
# Build the EM-variant LDA model over the cleaned tweets (topic_num=10) and fit it.
model = LDA_In_EM(
    df_tweet,
    stop_word=stop_word,
    topic_num=10,
)
model.fit()

print('topic of EM algorithm: finished')

In [None]:
# Export the two tables returned by stats_info() for the EM run.
# This reads whichever `model` was bound most recently, so run it directly
# after the EM fitting cell.
em_tables = model.stats_info()
pd_θ, pd_Φ = em_tables
pd_θ.to_parquet('./parquet_data/20250227_tweet_EM_θ.parquet')
pd_Φ.to_parquet('./parquet_data/20250227_tweet_EM_Φ.parquet')

In [None]:
# Build the variational-Bayes LDA model over the cleaned tweets (topic_num=10)
# and fit it. This rebinds `model`, replacing the previously fitted one.
model = LDA_In_VB(
    df_tweet,
    stop_word=stop_word,
    topic_num=10,
)
model.fit()

print('topic of Variational Bayes algorithm: finished')

In [None]:
# Export the two tables returned by stats_info() for the VB run.
# This reads whichever `model` was bound most recently, so run it directly
# after the VB fitting cell.
vb_tables = model.stats_info()
pd_θ, pd_Φ = vb_tables
pd_θ.to_parquet('./parquet_data/20250227_tweet_VB_θ.parquet')
pd_Φ.to_parquet('./parquet_data/20250227_tweet_VB_Φ.parquet')

In [None]:
# Build the CGS-variant LDA model over the cleaned tweets (topic_num=10) and
# fit it. This rebinds `model`, replacing the previously fitted one.
model = LDA_In_CGS(
    df_tweet,
    stop_word=stop_word,
    topic_num=10,
)
model.fit()

# The status line previously said "Variational Bayes" (copied from the VB
# cell); it now names the algorithm this cell actually runs.
print('topic of Collapsed Gibbs Sampling algorithm: finished')

In [None]:
# Export the two tables returned by stats_info() for the CGS run.
# This reads whichever `model` was bound most recently, so run it directly
# after the CGS fitting cell.
cgs_tables = model.stats_info()
pd_θ, pd_Φ = cgs_tables
pd_θ.to_parquet('./parquet_data/20250227_tweet_CGS_θ.parquet')
pd_Φ.to_parquet('./parquet_data/20250227_tweet_CGS_Φ.parquet')