In [10]:
import pandas as pd
import json
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [11]:
# Input: JSON file holding the raw abstracts (read by the loading cell).
raw_path = "./../../data/raw/abstracts_only.json"
# Output: JSON file that receives the stemmed abstracts (written by the final cell).
clean_path = "./../../data/cleaned/cleaned_abstracts_stemming.json"

In [12]:
# Read the whole raw file as UTF-8 text and parse it into Python objects.
with open(raw_path, mode="r", encoding="utf-8") as f:
    data = json.loads(f.read())

In [13]:
# Wrap the parsed JSON in a DataFrame so the abstracts can be processed column-wise.
df = pd.DataFrame(data)

# The cleaning cell reads df['abstract'], so stop here with a clear error
# if the loaded JSON did not provide that column.
abstract_missing = 'abstract' not in df.columns
if abstract_missing:
    raise KeyError("'abstract' column not found in JSON.")

In [14]:
# Single Porter stemmer instance reused for every token in preprocess_text.
stemmer = PorterStemmer()

# The English stop-word list ships in NLTK's separately downloaded "stopwords"
# corpus. On an environment where it has never been fetched, the lookup raises
# LookupError, so download it once on demand and retry. If the download cannot
# succeed (e.g. offline), the retry raises the same LookupError as before.
try:
    stop_words = set(stopwords.words("english"))
except LookupError:
    nltk.download("stopwords", quiet=True)
    stop_words = set(stopwords.words("english"))

In [18]:
def preprocess_text(text) -> str:
    """Normalize one abstract into a space-separated string of stemmed tokens.

    The text is lowercased and tokenized with ``nltk.word_tokenize``; tokens
    that are not purely alphabetic or that appear in ``stop_words`` are
    dropped, and the remaining tokens are reduced with ``stemmer.stem``.

    Args:
        text: The raw abstract. Anything that is not a ``str`` (for example
            ``None`` or a float NaN standing in for a missing abstract) and
            any blank string is treated as "no text".

    Returns:
        The stemmed tokens joined by single spaces, or ``""`` when there is
        no text to process.
    """
    # A record without an abstract shows up in the column as None/NaN, which
    # has no .lower(); return an empty result instead of raising AttributeError.
    if not isinstance(text, str) or not text.strip():
        return ""

    tokens = nltk.word_tokenize(text.lower())
    stemmed_tokens = [
        stemmer.stem(word)
        for word in tokens
        # isalpha() discards punctuation, numbers and mixed tokens such as "3-d".
        if word.isalpha() and word not in stop_words
    ]
    return " ".join(stemmed_tokens)

In [19]:
# Clean every abstract element-wise and store the result next to the raw text.
df['cleaned_abstract'] = df['abstract'].map(preprocess_text)

In [8]:
# Show the first five abstracts side by side with their cleaned versions.
print("\n--- Before and After Preprocessing ---")
preview = df.head(5)
for i, raw_text, cleaned_text in zip(
    preview.index, preview['abstract'], preview['cleaned_abstract']
):
    # The heading number is the row's index label plus one.
    print(f"\nData {i+1}:")
    print("Raw       :", raw_text)
    print("Cleaned   :", cleaned_text)


--- Before and After Preprocessing ---

Data 1:
Raw       : Technical progress in the open-source self replicating rapid prototyper (RepRap) community has enabled a distributed form of additive manufacturing to expand rapidly using polymer-based materials. However, the lack of an open-source metal alternative and the high capital costs and slow throughput of proprietary commercialized metal 3-D printers has severely restricted their deployment. The applications of commercialized metal 3-D printers are limited to only rapid prototyping and expensive finished products. This severely restricts the access of the technology for small and medium enterprises, the developing world and for use in laboratories. This paper reports on the development of a<$2000open-source metal 3-D printer. The metal 3-D printer is controlled with an open-source micro-controller and is a combination of a low-cost commercial gas-metal arc welder and a derivative of the Rostock, a deltabot RepRap. The bill of mater

In [20]:
# Make sure the destination folder exists; exist_ok keeps re-runs from failing.
output_dir = os.path.dirname(clean_path)
os.makedirs(output_dir, exist_ok=True)

# Write only the cleaned column, as an indented JSON array of records.
cleaned_only = df[['cleaned_abstract']]
cleaned_only.to_json(clean_path, orient="records", lines=False, indent=2)

print("Preprocessing finished, saved to:", clean_path)

Preprocessing finished, saved to: ./../../data/cleaned/cleaned_abstracts_stemming.json
