In [None]:
import pandas as pd
import json
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [None]:
# Output location for the stemmed abstracts written at the end of the notebook.
clean_path = './../../data/cleaned/cleaned_abstracts_stemming.json'
# Input location of the raw JSON that is loaded below.
raw_path = './../../data/raw/abstracts_only.json'

In [None]:
# Read the raw file as UTF-8 text and parse the JSON document into `data`.
with open(raw_path, mode="r", encoding="utf-8") as raw_file:
    data = json.loads(raw_file.read())

In [None]:
# Wrap the parsed JSON in a DataFrame for column-wise processing.
df = pd.DataFrame(data)

# Later cells read the 'abstract' column, so fail early if it is absent.
has_abstract = 'abstract' in df.columns
if not has_abstract:
    raise KeyError("'abstract' column not found in JSON.")

In [None]:
# Stemmer and stop-word set shared by the preprocessing function below.
stemmer = PorterStemmer()

# On a fresh environment the NLTK data packages are not installed and the
# corpus lookup raises LookupError; download once and retry so the notebook
# survives "Restart Kernel -> Run All".
try:
    stop_words = set(stopwords.words("english"))
except LookupError:
    nltk.download("stopwords", quiet=True)
    stop_words = set(stopwords.words("english"))

# nltk.word_tokenize needs the Punkt tokenizer models ("punkt" on older NLTK
# releases, "punkt_tab" on newer ones). Probe once and fetch both if missing.
try:
    nltk.word_tokenize("probe")
except LookupError:
    for resource in ("punkt", "punkt_tab"):
        nltk.download(resource, quiet=True)

In [None]:
def preprocess_text(text):
    """Lowercase, tokenize, filter and stem a single abstract.

    Args:
        text: Raw abstract text. Any value that is not a non-blank ``str``
            (for example ``None`` or a float NaN standing in for a missing
            abstract) is treated as empty.

    Returns:
        A space-joined string of Porter-stemmed tokens. Tokens that are not
        purely alphabetic or that appear in ``stop_words`` are dropped.
        Returns ``""`` for missing or blank input.
    """
    # Missing abstracts may show up as None/NaN in the DataFrame; calling
    # .lower() on them would raise AttributeError and abort the whole apply.
    if not isinstance(text, str) or not text.strip():
        return ""

    tokens = nltk.word_tokenize(text.lower())
    stemmed_tokens = [
        stemmer.stem(word)
        for word in tokens
        if word.isalpha() and word not in stop_words
    ]
    return " ".join(stemmed_tokens)

In [None]:
# Run the preprocessing function over every abstract, element by element,
# and store the result alongside the raw text.
df['cleaned_abstract'] = df['abstract'].map(preprocess_text)

In [None]:
# Sanity check: print the first five abstracts next to their cleaned versions.
print("\n--- Before and After Preprocessing ---")
preview = df.head(5)
for i, raw_text, cleaned_text in zip(
    preview.index, preview['abstract'], preview['cleaned_abstract']
):
    print(f"\nData {i+1}:")
    print("Raw       :", raw_text)
    print("Cleaned   :", cleaned_text)

In [None]:
# Create the destination folder if needed (no error when it already exists).
output_dir = os.path.dirname(clean_path)
os.makedirs(output_dir, exist_ok=True)

# Write only the cleaned column, as an indented JSON array of records.
cleaned_only = df[['cleaned_abstract']]
cleaned_only.to_json(clean_path, orient="records", lines=False, indent=2)

print("Preprocessing finished, saved to:", clean_path)