## Data Preparation

Concatenate the per-source CSV files into a single dataset (one DataFrame, optionally saved as one JSON file).

In [1]:
import os
import re
import glob
from pprint import pprint

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import pandas as pd

import gensim
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim

In [2]:
# Root directory of the input data, relative to the notebook's working
# directory; each source keeps its CSV export in its own sub-folder.
DATA_PATH = "../data"

In [3]:
# Every source folder directly under DATA_PATH that contains "Jan-Sep-2020.csv".
# NOTE(review): glob returns matches in arbitrary order, and a later cell keeps
# only the first rows of the concatenated frame — consider sorting the paths
# if the selection must be reproducible across machines.
csv_pattern = os.path.join(DATA_PATH, "*", "Jan-Sep-2020.csv")
file_paths = glob.glob(csv_pattern)

In [4]:
# The detik export uses a different file name, so the glob pattern misses it;
# add it explicitly. The path is built from DATA_PATH so the data root is
# configured in one place, and the membership check keeps this cell
# re-runnable: without it every re-run would list the file again and its rows
# would be loaded more than once.
detik_path = os.path.join(DATA_PATH, "detik", "news_01_Jan_2020-30_Sep_2020.csv")
if detik_path not in file_paths:
    file_paths = file_paths + [detik_path]

In [5]:
# Start from an empty frame; the loading cell below fills d_all with the rows
# of every source file.
d_all = pd.DataFrame([])

In [6]:
# Show the resolved list of input files as a sanity check before loading.
file_paths

['../data/kompas/Jan-Sep-2020.csv',
 '../data/liputan6/Jan-Sep-2020.csv',
 '../data/tribunnews/Jan-Sep-2020.csv',
 '../data/cnnindonesia/Jan-Sep-2020.csv',
 '../data/detik/news_01_Jan_2020-30_Sep_2020.csv']

In [7]:
# Load every source file, normalise its header, and concatenate once.
#
# Collecting the frames in a list and calling pd.concat a single time avoids
# re-copying all previously loaded rows on every iteration. Because d_all is
# rebuilt from scratch, re-running this cell no longer duplicates the data.
frames = []
for file in file_paths:
    d_data = pd.read_csv(file)
    # Positional rename: assumes each CSV has exactly these three columns in
    # this order (pandas raises on a length mismatch) — TODO confirm per source.
    d_data.columns = ["title", "url", "date"]
    frames.append(d_data)

# pd.concat rejects an empty list, so fall back to an empty frame with the
# expected columns when no input file was found.
if frames:
    d_all = pd.concat(frames, axis=0, ignore_index=True)
else:
    d_all = pd.DataFrame(columns=["title", "url", "date"])

In [8]:
# (rows, columns) of the combined dataset.
d_all.shape

(699666, 3)

In [10]:
# # uncomment this to save the restructured dataset
# d_all.to_json("../data/dataset/jan_sep_2020.json", orient="records")

In [11]:
# Create a single Sastrawi stemmer instance up front; the same object is
# reused for every title in the stemming cell below.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [12]:
# Work on a 10,000-row subset (presumably to keep stemming and LDA fast).
# .loc slices by label and includes the end label, so with the 0..n-1 index
# produced by ignore_index=True this keeps rows 0 through 9999.
# NOTE(review): these are simply the first rows in load order, not a random
# sample, so the subset depends on the order of file_paths.
d_selected = d_all.loc[:9999]

In [13]:
# Count missing values per column before any text processing.
d_selected.isna().sum()

title    0
url      0
date     0
dtype: int64

In [14]:
# Stem every headline and store the result in a new `title_stem` column;
# the original `title` column is left untouched.
stemmed_titles = d_selected["title"].apply(stemmer.stem)
d_selected = d_selected.assign(title_stem=stemmed_titles)

In [15]:
# Preview the stemmed titles next to the original ones.
d_selected.head()

Unnamed: 0,title,url,date,title_stem
0,Paslon Jekek-Setyo Pilih Tak Sengketakan Kata ...,https://regional.kompas.com/read/2020/09/30/23...,2020-09-30,paslon jekek-setyo pilih tak sengketa kata nya...
1,Mobil Terbakar di Depan SPBU Kota Kediri,https://regional.kompas.com/read/2020/09/30/23...,2020-09-30,mobil bakar di depan spbu kota diri
2,Desa Hargobinangun Sleman Lahirkan Petani Muda,https://regional.kompas.com/read/2020/09/30/23...,2020-09-30,desa hargobinangun sleman lahir tani muda
3,"Kisah RFZ, Bocah yang Disiksa Ayah dan Dibuang...",https://regional.kompas.com/read/2020/09/30/23...,2020-09-30,kisah rfz bocah yang siksa ayah dan buang ibu ...
4,Bio Farma Sebut Tak Ditemukan Efek Samping dar...,https://nasional.kompas.com/read/2020/09/30/23...,2020-09-30,bio farma sebut tak temu efek samping dari suk...


In [16]:
def _drop_short_tokens(text):
    """Return ``text`` without its whitespace-separated tokens of one or two characters."""
    kept = (token for token in text.split() if len(token) > 2)
    return " ".join(kept)


# Remove very short tokens (e.g. "di") from the stemmed titles and keep the
# result in a new `title_filtered` column.
d_selected = d_selected.assign(
    title_filtered=d_selected["title_stem"].apply(_drop_short_tokens)
)

In [17]:
# Preview the filtered titles next to the stemmed ones.
d_selected.head()

Unnamed: 0,title,url,date,title_stem,title_filtered
0,Paslon Jekek-Setyo Pilih Tak Sengketakan Kata ...,https://regional.kompas.com/read/2020/09/30/23...,2020-09-30,paslon jekek-setyo pilih tak sengketa kata nya...,paslon jekek-setyo pilih tak sengketa kata nya...
1,Mobil Terbakar di Depan SPBU Kota Kediri,https://regional.kompas.com/read/2020/09/30/23...,2020-09-30,mobil bakar di depan spbu kota diri,mobil bakar depan spbu kota diri
2,Desa Hargobinangun Sleman Lahirkan Petani Muda,https://regional.kompas.com/read/2020/09/30/23...,2020-09-30,desa hargobinangun sleman lahir tani muda,desa hargobinangun sleman lahir tani muda
3,"Kisah RFZ, Bocah yang Disiksa Ayah dan Dibuang...",https://regional.kompas.com/read/2020/09/30/23...,2020-09-30,kisah rfz bocah yang siksa ayah dan buang ibu ...,kisah rfz bocah yang siksa ayah dan buang ibu ...
4,Bio Farma Sebut Tak Ditemukan Efek Samping dar...,https://nasional.kompas.com/read/2020/09/30/23...,2020-09-30,bio farma sebut tak temu efek samping dari suk...,bio farma sebut tak temu efek samping dari suk...


In [18]:
# Cleaned title strings, one per document.
titles = d_selected.title_filtered

In [19]:
# Tokenise each cleaned title into a list of words.
# Reading from the DataFrame column (rather than from `titles` itself) keeps
# this cell re-runnable: splitting the already tokenised lists a second time
# would raise AttributeError because lists have no .split().
titles = [title.split() for title in d_selected.title_filtered]

In [20]:
# Number of tokenised documents.
len(titles)

10000

In [21]:
# Tokens of the first document.
titles[0]

['paslon',
 'jekek-setyo',
 'pilih',
 'tak',
 'sengketa',
 'kata',
 'nyawiji',
 'bawaslu']

In [22]:
# Build the gensim vocabulary: a mapping between the tokens found in `titles`
# and integer ids.
id2word = Dictionary(titles)

In [23]:
# Print the Dictionary API reference.
# NOTE(review): exploratory output — consider dropping this cell from the
# final notebook.
help(id2word)

Help on Dictionary in module gensim.corpora.dictionary object:

class Dictionary(gensim.utils.SaveLoad, collections.abc.Mapping)
 |  Dictionary(documents=None, prune_at=2000000)
 |  
 |  Dictionary encapsulates the mapping between normalized words and their integer ids.
 |  
 |  Notable instance attributes:
 |  
 |  Attributes
 |  ----------
 |  token2id : dict of (str, int)
 |      token -> tokenId.
 |  id2token : dict of (int, str)
 |      Reverse mapping for token2id, initialized in a lazy manner to save memory (not created until needed).
 |  cfs : dict of (int, int)
 |      Collection frequencies: token_id -> how many instances of this token are contained in the documents.
 |  dfs : dict of (int, int)
 |      Document frequencies: token_id -> how many documents contain this token.
 |  num_docs : int
 |      Number of documents processed.
 |  num_pos : int
 |      Total number of corpus positions (number of processed words).
 |  num_nnz : int
 |      Total number of non-zeroes in th

In [24]:
# Convert every tokenised title into its bag-of-words form:
# a list of (token_id, count) pairs.
corpus = list(map(id2word.doc2bow, titles))

In [25]:
# First document as (token_id, count) pairs.
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]]


In [None]:
# Same bag-of-words entry, with the token ids translated back to their tokens.
[[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:1]]

In [31]:
# Build the LDA topic model.
#
# Paper behind gensim's LDA implementation:
# https://papers.nips.cc/paper/2010/file/71f6278d140af599e06ad9bf1ba03cb0-Paper.pdf
#
# NOTE(review): the printed topics below share almost the same top words,
# including function words such as "dan" and "yang" — consider removing stop
# words before building the dictionary.
lda_params = dict(
    num_topics=15,         # number of topics to fit
    random_state=123,      # fixed seed so the fit is repeatable
    chunksize=10000,       # documents per training chunk (the corpus has 10,000 documents)
    alpha='auto',          # let gensim learn the document-topic prior
    per_word_topics=True,
)
lda_model = LdaModel(corpus=corpus, id2word=id2word, **lda_params)

In [32]:
# Show the highest-weighted words of each topic.
pprint(lda_model.print_topics())

[(0,
  '0.027*"covid-19" + 0.026*"dan" + 0.015*"yang" + 0.013*"kasus" + '
  '0.010*"pilkada" + 0.009*"polisi" + 0.009*"orang" + 0.008*"positif" + '
  '0.007*"jakarta" + 0.007*"tak"'),
 (1,
  '0.018*"covid-19" + 0.009*"indonesia" + 0.009*"kasus" + 0.008*"tak" + '
  '0.008*"dan" + 0.008*"ini" + 0.007*"minta" + 0.007*"update" + 0.006*"pasien" '
  '+ 0.006*"dari"'),
 (2,
  '0.015*"covid-19" + 0.015*"yang" + 0.015*"dan" + 0.009*"polisi" + '
  '0.008*"orang" + 0.007*"warga" + 0.007*"ini" + 0.007*"tinggal" + '
  '0.007*"psbb" + 0.006*"positif"'),
 (3,
  '0.040*"covid-19" + 0.014*"dan" + 0.014*"positif" + 0.012*"2020" + '
  '0.011*"pilkada" + 0.010*"ini" + 0.010*"tak" + 0.010*"kpu" + 0.010*"yang" + '
  '0.009*"dari"'),
 (4,
  '0.018*"covid-19" + 0.011*"yang" + 0.009*"pilkada" + 0.009*"ini" + '
  '0.008*"untuk" + 0.007*"rumah" + 0.006*"dan" + 0.006*"minta" + 0.005*"kasus" '
  '+ 0.005*"saat"'),
 (5,
  '0.026*"dan" + 0.024*"covid-19" + 0.013*"yang" + 0.011*"kasus" + '
  '0.010*"pilkada" + 0.010*

In [33]:
# Apply the trained model to the whole corpus.
# NOTE(review): doc_lda is not used by any cell visible in this notebook.
doc_lda = lda_model[corpus]

In [34]:
# Let pyLDAvis render its visualisations inline in the notebook.
pyLDAvis.enable_notebook()

In [37]:
# Prepare and display the interactive topic visualisation.
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# pyLDAvis.gensim_models — confirm against the installed version.
p = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
p

In [36]:
from IPython.display import HTML

# Inject a small stylesheet that recolours the button icons and the term
# labels (presumably so the pyLDAvis output stays readable with the notebook
# theme in use).
css_str = (
    '<style> '
    '.jp-Button path { fill: #616161;} '
    'text.terms { fill: #616161;} '
    '</style>'
)
display(HTML(css_str))

In [None]:
# Compute the topic coherence score.
#
# 'c_mass' is not a coherence measure gensim supports (the supported names
# are 'u_mass', 'c_v', 'c_uci', 'c_npmi' and 'c_w2v'), so CoherenceModel
# raised ValueError and this cell could never run. 'u_mass' is used here;
# switch to 'c_v' if the sliding-window measure was intended.
coherence_model_lda = CoherenceModel(model=lda_model, texts=titles, dictionary=id2word, coherence='u_mass')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)