In [6]:
import os
import json
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from pathlib import Path
import random
from concurrent.futures import ProcessPoolExecutor
import time
from collections import defaultdict
from itertools import combinations
import math
from collections import defaultdict

In [7]:
# 1. Folder input (preprocessed per tahun) & folder output
INPUT_DIR = Path('./../../data/preprocessing/lemmatization')
OUTPUT_DIR = Path('./../../output/lda_lemmatization')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [8]:
# 2. Inisialisasi Gibbs Sampling LDA (manual)
def lda_gibbs(docs, K=3, alpha=0.1, beta=0.01, iterations=500):
    # Bangun vocab & mapping
    vocab   = sorted({w for doc in docs for w in doc})
    W       = len(vocab)
    word2id = {w:i for i,w in enumerate(vocab)}
    D       = len(docs)

    # Hitung prior + init matriks count
    doc_topic  = np.zeros((D,K), dtype=int)   # Nd,k
    topic_word = np.zeros((K,W), dtype=int)   # Nk,w
    topic_sum  = np.zeros(K, dtype=int)       # Nk,.
    Z          = []                           # assignments

    # Inisialisasi acak
    for d, doc in enumerate(docs):
        z_d = []
        for w in doc:
            t = random.randrange(K)
            z_d.append(t)
            doc_topic[d,t]    += 1
            topic_word[t,word2id[w]] += 1
            topic_sum[t]      += 1
        Z.append(z_d)

    # Iterasi Gibbs Sampling
    for it in range(iterations):
        for d, doc in enumerate(docs):
            for i, w in enumerate(doc):
                t_old = Z[d][i]
                wid   = word2id[w]

                # decrement
                doc_topic[d,t_old]      -= 1
                topic_word[t_old,wid]   -= 1
                topic_sum[t_old]        -= 1

                # hitung p(z = k)
                p_z = (doc_topic[d] + alpha) * \
                      (topic_word[:,wid] + beta) / \
                      (topic_sum + beta*W)

                p_z = p_z / p_z.sum()
                t_new = np.random.choice(K, p=p_z)

                # increment
                Z[d][i]                   = t_new
                doc_topic[d,t_new]       += 1
                topic_word[t_new,wid]    += 1
                topic_sum[t_new]         += 1

    # Ekstrak top-10 kata per topik
    top_words = {
        k: [vocab[i] for i in topic_word[k].argsort()[-10:][::-1]]
        for k in range(K)
    }
    return top_words, vocab


In [9]:
def compute_coherence(top_words, docs, epsilon=1e-12):
    """
    Menghitung coherence score (UMass-like) secara manual.
    top_words : dict of {topic_id: [word1, word2, ..., wordN]}
    docs      : list of list of tokens
    """
    # Hitung dokumen yang mengandung setiap kata
    doc_freq = defaultdict(int)
    for doc in docs:
        unique_words = set(doc)
        for word in unique_words:
            doc_freq[word] += 1

    # Hitung coherence tiap topik
    coherence_scores = []

    for topic, words in top_words.items():
        score = 0.0
        for i in range(1, len(words)):
            for j in range(i):
                w_i = words[i]
                w_j = words[j]
                # Hitung co-occurence di dokumen
                D_wi = doc_freq.get(w_i, 0)
                D_wi_wj = 0
                for doc in docs:
                    if w_i in doc and w_j in doc:
                        D_wi_wj += 1

                # UMass-like: log( (D(w_i, w_j) + epsilon) / D(w_j) )
                score += math.log((D_wi_wj + epsilon) / (doc_freq.get(w_j, 1)))
        coherence_scores.append(score)

    # Rata-rata antar topik
    return sum(coherence_scores) / len(coherence_scores)


In [10]:
# 3. Loop per file JSON (per tahun)
for path in sorted(INPUT_DIR.glob("preprocessed_abstracts_lemmatization_*.json")):
    year = path.stem.split("_")[-1]
    out_year_dir = OUTPUT_DIR / year
    out_year_dir.mkdir(parents=True, exist_ok=True)
    print(f"\n→ Memproses tahun {year}")

    # Baca list dict, ambil saja abstrak (string)
    with open(path, 'r', encoding='utf-8') as f:
        records = json.load(f)

    # Tokenisasi: setiap string split on whitespace
    docs = [ rec["abstract"].split() 
             for rec in records 
             if rec.get("abstract","").strip() ]

    if not docs:
        print(f"  ⚠️  Tidak ada abstrak valid untuk {year}, dilewati.")
        continue
        
    start_time = time.time()
    
    top_words, vocab = lda_gibbs(docs, K=3, iterations=100)

    coherence = compute_coherence(top_words, docs)
    print(f"✔️ Coherence Score (UMass-like) untuk {year}: {coherence:.4f}")

    # 4a. Bar chart per topik
    for k, words in top_words.items():
        freqs = [sum(doc.count(w) for doc in docs) for w in words]
        plt.figure(figsize=(6, 4))
        plt.barh(words[::-1], freqs[::-1])
        plt.title(f"[{year}] Topik #{k+1}")
        plt.xlabel("Frekuensi")
        plt.tight_layout()
        plt.savefig(out_year_dir / f"bar_topic{(k+1):02d}_{year}.png")
        plt.close()
    
    # 4b. Word Cloud per topik
        wc = WordCloud(width=800, height=400, background_color='white')
        wc.generate_from_frequencies(dict(zip(words, freqs)))
        wc.to_file(out_year_dir / f"wordcloud_topic{(k+1):02d}_{year}.png")

    end_time = time.time()
    print(f"⏱️ Waktu proses {year}: {end_time - start_time:.2f} detik")


→ Memproses tahun 2013
✔️ Coherence Score (UMass-like) untuk 2013: -112.1953
⏱️ Waktu proses 2013: 18.73 detik

→ Memproses tahun 2014
✔️ Coherence Score (UMass-like) untuk 2014: -105.5897
⏱️ Waktu proses 2014: 38.78 detik

→ Memproses tahun 2015
✔️ Coherence Score (UMass-like) untuk 2015: -77.1048
⏱️ Waktu proses 2015: 75.01 detik

→ Memproses tahun 2016
✔️ Coherence Score (UMass-like) untuk 2016: -55.7656
⏱️ Waktu proses 2016: 173.25 detik

→ Memproses tahun 2017
✔️ Coherence Score (UMass-like) untuk 2017: -48.7339
⏱️ Waktu proses 2017: 907.14 detik

→ Memproses tahun 2018
✔️ Coherence Score (UMass-like) untuk 2018: -48.5206
⏱️ Waktu proses 2018: 2086.15 detik

→ Memproses tahun 2019


KeyboardInterrupt: 