In [2]:
import pandas as pd

import nltk
# Fetch the NLTK resources this notebook needs:
#   - stopwords: source of the Indonesian stop-word list
#   - punkt:     tokenizer models read by word_tokenize on older NLTK releases
#   - punkt_tab: tokenizer tables read by word_tokenize on newer NLTK releases
#                (without it, word_tokenize raises LookupError on a fresh install)
for _nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

# Tokens to discard when vectorizing: NLTK's Indonesian stop words followed by
# every single ASCII punctuation character.
sw_indo = [*stopwords.words('indonesian'), *punctuation]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/habibmudafiq/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/habibmudafiq/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Import Data

In [3]:
# Load the corpus CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer="data/kompas.csv")
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,teks
0,Ginandjar Tetap Ditahan. Jaksa Agung Dilaporka...
1,Jakarta Dikangkangi Para Preman\nKALAU tak pun...
2,Penyimpangan di Setpres Seolah Terjadi Sekaran...
3,"Dibayarkan, Rapel Kenaikan Gaji Pegawai Pos\nK..."
4,"Stop Kekerasan, Elite agar Duduk Bersama\nSeju..."


# Extract BoW

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
# Bag-of-words extractor: NLTK tokenization, unigrams + bigrams, stop words and
# punctuation removed, and only terms present in at least 5 documents kept.
bow = CountVectorizer(
    tokenizer=word_tokenize,
    stop_words=sw_indo,
    ngram_range=(1, 2),
    min_df=5,
)

# Learn the vocabulary and build the document-term count matrix in one pass.
bow_matrix = bow.fit_transform(df["teks"])




# Topic Modeling

In [6]:
# scikit-learn 1.0 deprecated CountVectorizer.get_feature_names() and 1.2 removed it.
# Prefer get_feature_names_out(); fall back to the old method only on releases
# that predate the new one.
_feature_names = getattr(bow, "get_feature_names_out", None) or bow.get_feature_names
# Keep a plain list of terms, indexable by column position, as the old API returned.
vocab = list(_feature_names())



# LSA

In [7]:
from sklearn.decomposition import TruncatedSVD

In [8]:
# LSA: truncated SVD of the bag-of-words matrix, keeping 10 components (topics).
# random_state is fixed so the randomized solver gives repeatable results.
lsa = TruncatedSVD(
    n_components=10,
    n_iter=10,
    random_state=42,
)
# Fit on the counts and keep the per-document component scores.
lsa_matrix = lsa.fit_transform(bow_matrix)

In [9]:
def get_topic(model, n_words=6, feature_names=None):
    """Return the highest-weighted vocabulary terms for each topic of a fitted model.

    Parameters
    ----------
    model : object
        Fitted model exposing ``components_``; every row is treated as one
        topic holding a weight per vocabulary index.
    n_words : int, default 6
        Number of highest-weighted indices inspected per topic.
    feature_names : sequence of str, optional
        Term for each column index. When omitted, the notebook-level ``vocab``
        is used.

    Returns
    -------
    list of list of str
        One list per topic, ordered from highest to lowest weight. Terms that
        are not purely alphanumeric (punctuation tokens, bigrams containing a
        space) are dropped *after* the top-``n_words`` cut, so a topic may
        contain fewer than ``n_words`` terms.

    Raises
    ------
    ValueError
        If ``n_words`` is negative.
    """
    if n_words < 0:
        raise ValueError("n_words must be non-negative")

    names = vocab if feature_names is None else feature_names

    topics = []
    for comp in model.components_:
        # Indices by descending weight. Slicing the reversed order (rather than
        # taking [-n_words:]) keeps n_words=0 from selecting the whole row.
        top_idx = comp.argsort()[::-1][:n_words]
        topics.append([names[idx] for idx in top_idx if names[idx].isalnum()])
    return topics


In [10]:
# Show the top terms of each LSA component.
get_topic(model=lsa)

[['presiden', 'indonesia', 'pemerintah', 'dpr'],
 ['presiden', 'dpr', 'ketua', 'partai', 'mpr', 'tandjung'],
 ['pemerintah', 'rp', 'indonesia', 'bank', 'persen', 'utang'],
 ['rp', 'tandjung', 'dana', 'bulog', 'hukum', 'harga'],
 ['presiden', 'air', 'banjir', 'harga', 'rp', 'dpr'],
 ['harga', 'beras', 'rp', 'bbm'],
 ['mpr', 'konstitusi', 'bppn', 'uud'],
 ['indonesia', 'mpr', 'konstitusi', 'uud', 'perubahan', '1945'],
 ['pemerintah', 'dpr', 'israel', 'bppn', 'kota', 'aceh'],
 ['massa', 'rupiah', 'bunga', 'mpr', 'bank', 'suku']]

# LDA

In [11]:
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# LDA with 10 topics on the same bag-of-words counts used for LSA.
lda = LatentDirichletAllocation(n_components=10, max_iter=10, random_state=42)
# Fit the LDA model itself. This line previously called lsa.fit_transform, which
# re-fit LSA and left `lda` unfitted (no components_ for get_topic to read).
lda_matrix = lda.fit_transform(bow_matrix)

In [None]:
# Show the top terms of each LDA topic.
get_topic(model=lda)