In [28]:
import json
import os
import re
import string
import time
from collections import defaultdict
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
from pathlib import Path
from spacy.lang.en.stop_words import STOP_WORDS

In [29]:
try:
    spacy.require_gpu()
    print("✅ GPU digunakan.")
except Exception as e:
    print("⚠️ GPU tidak tersedia. Menggunakan CPU.")

⚠️ GPU tidak tersedia. Menggunakan CPU.


In [30]:
import nltk
from nltk.tokenize import word_tokenize

# Tambahkan path ke resource lokal
nltk.data.path.append('Stemming-vs-Lemmatization-on-LDA-Topic-Modelling/envtextminning/lib/python3.10/site-packages/nltk_data')

# Download jika belum tersedia
nltk.download('punkt_tab', download_dir='Stemming-vs-Lemmatization-on-LDA-Topic-Modelling/envtextminning/lib/python3.10/site-packages/nltk_data')
nltk.download('stopwords', download_dir='Stemming-vs-Lemmatization-on-LDA-Topic-Modelling/envtextminning/lib/python3.10/site-packages/nltk_data')



[nltk_data] Downloading package punkt_tab to Stemming-vs-
[nltk_data]     Lemmatization-on-LDA-Topic-
[nltk_data]     Modelling/envtextminning/lib/python3.10/site-
[nltk_data]     packages/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to Stemming-vs-
[nltk_data]     Lemmatization-on-LDA-Topic-
[nltk_data]     Modelling/envtextminning/lib/python3.10/site-
[nltk_data]     packages/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [31]:
# Load model SpaCy
nlp = spacy.load("en_core_web_sm")
stop_words = set(STOP_WORDS)

In [32]:
def basic_preprocess(text):
    if not isinstance(text, str):
        return []
    # Lowercase dan bersihkan whitespace
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    # Hapus angka
    text = re.sub(r'\d+', '', text)
    # Hapus tanda baca
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenisasi
    tokens = word_tokenize(text)
    # Hapus stopwords dan kata pendek (<=2)
    tokens = [t for t in tokens if t not in stop_words and len(t) > 2]
    return tokens

In [33]:
def preprocess_lemmatization(text):
    cleaned_tokens = basic_preprocess(text)  
    if not cleaned_tokens:
        return []
    doc = nlp(" ".join(cleaned_tokens))
    lemmas = [token.lemma_ for token in doc if token.lemma_ not in stop_words and len(token.lemma_) > 2]
    return lemmas


In [34]:
# Path folder input cleaned abstracts
cleaned_folder = Path('./../../data/cleaned')
# Path folder output preprocessing lemmatization
output_folder = Path('./../../data/preprocessing/lemmatization')
output_folder.mkdir(parents=True, exist_ok=True)

In [35]:
total_start_time = time.time()

In [25]:
# List semua file di folder cleaned yang namanya diawali 'cleaned_abstracts_'
files = [f for f in os.listdir(cleaned_folder) if f.startswith('cleaned_abstracts_') and f.endswith('.json')]

In [37]:
for file_path in cleaned_folder.glob('cleaned_abstracts_*.json'):
   # Ambil tahun dari nama file
    try:
        year = file_path.stem.split('_')[-1]
    except IndexError:
        print(f"❌ Format nama file salah: {file_path.name}")
        continue

    # Load file JSON
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except Exception as e:
        print(f"❌ Gagal membaca file {file_path.name}: {e}")
        continue

    processed_abstracts = []
    start_time = time.time()

    for idx, item in enumerate(data):
        abs_text = item.get('abstract', '')
        if isinstance(abs_text, str) and abs_text.strip():
            try:
                lemmas = preprocess_lemmatization(abs_text)
                processed_text = " ".join(lemmas)
                processed_abstracts.append({
                    "year": year,
                    "abstract": processed_text
                })
            except Exception as e:
                print(f"⚠️ Error saat memproses abstrak #{idx + 1}: {e}")

    end_time = time.time()

    # Simpan hasil
    out_file = output_folder / f'preprocessed_abstracts_lemmatization_{year}.json'
    try:
        with open(out_file, 'w', encoding='utf-8') as fout:
            json.dump(processed_abstracts, fout, indent=2)
        print(f"✅ Disimpan: {len(processed_abstracts)} abstrak tahun {year} -> {out_file}")
        print(f"⏱️ Waktu proses {year}: {end_time - start_time:.2f} detik")
    except Exception as e:
        print(f"❌ Gagal menyimpan file {out_file}: {e}")


✅ Disimpan: 241 abstrak tahun 2015 -> ../../data/preprocessing/lemmatization/preprocessed_abstracts_lemmatization_2015.json
⏱️ Waktu proses 2015: 4.79 detik
✅ Disimpan: 121 abstrak tahun 2014 -> ../../data/preprocessing/lemmatization/preprocessed_abstracts_lemmatization_2014.json
⏱️ Waktu proses 2014: 2.17 detik
✅ Disimpan: 800 abstrak tahun 2025 -> ../../data/preprocessing/lemmatization/preprocessed_abstracts_lemmatization_2025.json
⏱️ Waktu proses 2025: 16.11 detik
✅ Disimpan: 6102 abstrak tahun 2018 -> ../../data/preprocessing/lemmatization/preprocessed_abstracts_lemmatization_2018.json
⏱️ Waktu proses 2018: 99.99 detik
✅ Disimpan: 10000 abstrak tahun 2019 -> ../../data/preprocessing/lemmatization/preprocessed_abstracts_lemmatization_2019.json
⏱️ Waktu proses 2019: 167.25 detik
✅ Disimpan: 6624 abstrak tahun 2023 -> ../../data/preprocessing/lemmatization/preprocessed_abstracts_lemmatization_2023.json
⏱️ Waktu proses 2023: 122.98 detik
✅ Disimpan: 66 abstrak tahun 2013 -> ../../data/