In [1]:
import json
from pathlib import Path
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
import time

In [2]:
cleaned_folder = Path('./../../data/cleaned')
output_folder = Path('./../../data/preprocessing/stemming')
output_folder.mkdir(parents=True, exist_ok=True)

In [3]:
import nltk
from nltk.tokenize import word_tokenize

# Tambahkan path ke resource lokal
nltk.data.path.append('Stemming-vs-Lemmatization-on-LDA-Topic-Modelling/envtextminning/lib/python3.10/site-packages/nltk_data')

# Download jika belum tersedia
nltk.download('punkt_tab', download_dir='Stemming-vs-Lemmatization-on-LDA-Topic-Modelling/envtextminning/lib/python3.10/site-packages/nltk_data')
nltk.download('stopwords', download_dir='Stemming-vs-Lemmatization-on-LDA-Topic-Modelling/envtextminning/lib/python3.10/site-packages/nltk_data')


[nltk_data] Downloading package punkt_tab to Stemming-vs-
[nltk_data]     Lemmatization-on-LDA-Topic-
[nltk_data]     Modelling/envtextminning/lib/python3.10/site-
[nltk_data]     packages/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to Stemming-vs-
[nltk_data]     Lemmatization-on-LDA-Topic-
[nltk_data]     Modelling/envtextminning/lib/python3.10/site-
[nltk_data]     packages/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
punct_table = str.maketrans('', '', string.punctuation)

In [5]:
def preprocess_stemming(text):
    tokens = word_tokenize(text.lower())
    tokens = [t.translate(punct_table) for t in tokens]          # hapus tanda baca
    tokens = [t for t in tokens if t.isalpha()]                  # hanya huruf
    tokens = [t for t in tokens if t not in stop_words]          # hapus stopword
    stems  = [stemmer.stem(t) for t in tokens]                   # stemming
    return stems

In [6]:
# Loop semua file JSON cleaned abstracts
for file_path in cleaned_folder.glob('cleaned_abstracts_*.json'):
    print(f'🔍 Processing file: {file_path.name}')
    
    try:
        year = file_path.stem.split('_')[-1]
    except IndexError:
        print(f"⚠️  Skipping file {file_path.name} (invalid format)")
        continue

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except Exception as e:
        print(f"❌ Error reading {file_path.name}: {e}")
        continue

    processed_abstracts = []
    start_time = time.time()
    for item in data:
        abs_text = item.get('abstract', '')
        if isinstance(abs_text, str) and abs_text.strip():
            try:
                stems = preprocess_stemming(abs_text)
                if isinstance(stems, list):
                    processed_text = " ".join(stems)
                    processed_abstracts.append({
                        "year": year,
                        "abstract": processed_text
                    })
            except Exception as e:
                print(f"⚠️  Error processing abstract: {e}")
                continue
                
     # Simpan hasil
    out_file = output_folder / f'preprocessed_abstracts_stemming_{year}.json'
    try:
        with open(out_file, 'w', encoding='utf-8') as fout:
            json.dump(processed_abstracts, fout, indent=2)
        elapsed = time.time() - start_time
        print(f"✅ Saved {len(processed_abstracts)} abstracts for year {year} to {out_file}")
        print(f"⏱️ Waktu proses {year}: {elapsed:.2f} detik\n")
    except Exception as e:
        print(f"❌ Error saving file {out_file}: {e}")

🔍 Processing file: cleaned_abstracts_2015.json
✅ Saved 241 abstracts for year 2015 to ../../data/preprocessing/stemming/preprocessed_abstracts_stemming_2015.json
⏱️ Waktu proses 2015: 1.75 detik

🔍 Processing file: cleaned_abstracts_2014.json
✅ Saved 121 abstracts for year 2014 to ../../data/preprocessing/stemming/preprocessed_abstracts_stemming_2014.json
⏱️ Waktu proses 2014: 0.87 detik

🔍 Processing file: cleaned_abstracts_2025.json
✅ Saved 800 abstracts for year 2025 to ../../data/preprocessing/stemming/preprocessed_abstracts_stemming_2025.json
⏱️ Waktu proses 2025: 6.91 detik

🔍 Processing file: cleaned_abstracts_2018.json
✅ Saved 6102 abstracts for year 2018 to ../../data/preprocessing/stemming/preprocessed_abstracts_stemming_2018.json
⏱️ Waktu proses 2018: 46.22 detik

🔍 Processing file: cleaned_abstracts_2019.json
✅ Saved 10000 abstracts for year 2019 to ../../data/preprocessing/stemming/preprocessed_abstracts_stemming_2019.json
⏱️ Waktu proses 2019: 77.91 detik

🔍 Processing fi