## Data Preparation

Concatenate all the CSV files into a single dataset.

In [None]:
import os
import re
import glob
from pprint import pprint

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import pandas as pd

import gensim
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim

In [None]:
# Root directory of the raw CSV files, relative to the notebook's working directory.
DATA_PATH = "../data"

In [None]:
# Collect the "Jan-Sep-2020.csv" file from every sub-directory of DATA_PATH.
# glob returns matches in arbitrary, OS-dependent order, so sort them to keep
# the row order of the concatenated dataset reproducible between runs.
file_paths = sorted(glob.glob(os.path.join(DATA_PATH, "*", "Jan-Sep-2020.csv")))

In [None]:
# This file does not follow the "Jan-Sep-2020.csv" naming used by the glob
# pattern, so add it explicitly. Build the path from DATA_PATH so that every
# input location derives from the same root.
file_paths = file_paths + [
    os.path.join(DATA_PATH, "detik", "news_01_Jan_2020-30_Sep_2020.csv")
]

In [None]:
# Start from an empty frame; the loading loop below concatenates each file's rows onto it.
d_all = pd.DataFrame([])

In [None]:
# Show the list of CSV paths that will be loaded.
file_paths

In [None]:
# Load every CSV, give it the common column names, and collect the frames.
# Concatenating once at the end avoids re-copying the accumulated frame on
# every iteration, which grows quadratically with the number of rows.
frames = []
for file in file_paths:
    d_data = pd.read_csv(file)
    # Every file is expected to have exactly three columns; this assignment
    # raises a ValueError if the column count differs.
    d_data.columns = ["title", "url", "date"]
    frames.append(d_data)

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
d_all = pd.concat([d_all, *frames], axis=0, ignore_index=True)

In [None]:
# Show (rows, columns) of the combined dataset.
d_all.shape

In [None]:
# Uncomment this to save the restructured dataset
# d_all.to_json("../data/dataset/jan_sep_2020.json", orient="records")

In [None]:
# Create the Sastrawi stemmer; its `stem` method is applied to every title below.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [None]:
# Work on a subset only. `.loc` slicing is label-based and end-inclusive, so
# with the 0..n-1 index produced by `ignore_index=True` this keeps at most the
# first 10,000 rows.
d_selected = d_all.loc[:9999]

In [None]:
# Count missing values per column.
# NOTE(review): the stemming step calls `stemmer.stem` on every title, so
# confirm that `title` has no missing values before continuing.
d_selected.isna().sum()

In [None]:
# Stem every title and store the result in a new `title_stem` column.
title_stems = d_selected["title"].apply(stemmer.stem)
d_selected = d_selected.assign(title_stem=title_stems)

In [None]:
# Preview the first rows, now including the `title_stem` column.
d_selected.head()

In [None]:
def _drop_short_words(text):
    """Return *text* without the whitespace-separated words of two characters or fewer."""
    kept = [word for word in text.split() if len(word) > 2]
    return " ".join(kept)


# Remove very short words from the stemmed titles; result goes into `title_filtered`.
d_selected = d_selected.assign(
    title_filtered=d_selected["title_stem"].apply(_drop_short_words)
)

In [None]:
# Preview the first rows, now including the `title_filtered` column.
d_selected.head()

In [None]:
# Take the filtered titles as the documents for topic modelling.
titles = d_selected["title_filtered"]

In [None]:
# Tokenise: split every title on whitespace, giving one list of words per title.
titles = list(map(str.split, titles))

In [None]:
# Number of tokenised titles (documents).
len(titles)

In [None]:
# Inspect the tokens of the first title.
titles[0]

In [None]:
# Build the gensim Dictionary (token <-> integer id mapping) from the tokenised titles.
id2word = Dictionary(titles)

In [None]:
# Exploratory only: print the built-in help for the Dictionary object.
help(id2word)

In [None]:
# Convert every tokenised title into its bag-of-words representation.
corpus = list(map(id2word.doc2bow, titles))

In [None]:
# Show the bag-of-words entries of the first document.
print(corpus[:1])

In [None]:
# Same first document, with each token id looked up in the dictionary for readability.
[
    [(id2word[token_id], count) for token_id, count in bow]
    for bow in corpus[:1]
]

In [None]:
# Fit an 8-topic LDA model on the bag-of-words corpus.
#
# Paper referenced for gensim's LDA model:
# https://papers.nips.cc/paper/2010/file/71f6278d140af599e06ad9bf1ba03cb0-Paper.pdf
lda_model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=8,
    chunksize=10000,
    alpha="auto",
    per_word_topics=True,
    random_state=123,  # fixed seed
)

In [None]:
# Pretty-print the topics returned by the model's `print_topics()`.
pprint(lda_model.print_topics())

In [None]:
# Apply the trained model to the corpus.
# NOTE(review): `doc_lda` is not used again in the cells shown here.
doc_lda = lda_model[corpus]

In [None]:
# Turn on pyLDAvis' notebook mode before building the visualisation below.
pyLDAvis.enable_notebook()

In [None]:
# Prepare the pyLDAvis visualisation from the model, corpus and dictionary;
# the bare `p` on the last line makes the notebook display it.
p = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
p

In [None]:
# Inject a CSS override that sets the fill colour of `.jp-Button path` and
# `text.terms` elements to #616161.
# NOTE(review): presumably this keeps the pyLDAvis labels legible under the
# current notebook theme — confirm.
#
# `display` is imported explicitly instead of relying on the name IPython
# injects into the interactive namespace, so the cell also runs when the
# notebook is exported as a script.
from IPython.display import HTML, display

css_str = (
    '<style> '
    '.jp-Button path { fill: #616161;} '
    'text.terms { fill: #616161;} '
    '</style>'
)
display(HTML(css_str))

In [None]:
# Score the fitted model with the 'c_v' coherence measure, using the tokenised
# titles as the reference texts, then report the value.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=titles,
    dictionary=id2word,
    coherence="c_v",
)
coherence_lda = coherence_model_lda.get_coherence()
print("Coherence Score: ", coherence_lda)