In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import json

In [None]:
# Fetch the NLTK data packages used by this notebook: the stop-word lists,
# the WordNet database, and the Open Multilingual WordNet package
# (presumably needed by the WordNet reader in recent NLTK releases — confirm).
for _resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_resource)

In [None]:
# Path of the raw review file that the pipeline reads.
INPUT_FILE = "Cell_Phones_and_Accessories_5.json"
# Path of the CSV that receives the cleaned text and sentiment labels.
OUTPUT_FILE = "cleaned_amazon_reviews.csv"

In [None]:
def load_data(file_path):
    """Load reviews from a JSON-lines file into a DataFrame.

    Every line of the file is parsed as an independent JSON document. A
    record is kept only when it is a JSON object that contains both the
    'reviewText' and 'overall' keys.

    Args:
        file_path: Path of the UTF-8 encoded file with one JSON document
            per line.

    Returns:
        pandas.DataFrame with the columns 'text' (from 'reviewText') and
        'stars' (from 'overall'). The two columns are present even when no
        record was kept, so downstream column look-ups do not fail.

    Raises:
        OSError: If the file cannot be opened.
    """
    records = []
    print(f"Starting to load {file_path}...")
    with open(file_path, 'r', encoding='utf-8') as f:
        # Count from 1 so the warning matches the line number an editor shows.
        for line_no, line in enumerate(f, start=1):
            try:
                review = json.loads(line)
            except json.JSONDecodeError:
                print(f"Warning: Skipping JSON on line {line_no}")
                continue

            # A line may be valid JSON without being an object (e.g. a bare
            # number); a key membership test on such a value would raise.
            if not isinstance(review, dict):
                continue

            if 'reviewText' in review and 'overall' in review:
                records.append({
                    'text': review['reviewText'],
                    'stars': review['overall']
                })

    print(f"Loaded {len(records)} reviews.")
    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(records, columns=['text', 'stars'])

In [None]:
def create_multiclass_label(stars):
    """Map a star rating to one of three class labels.

    Args:
        stars: The rating; anything ``float()`` accepts (number or numeric
            string).

    Returns:
        0 for a rating of 1 or 2, 1 for a rating of 3, 2 for a rating of
        4 or 5. None for any other value, including ratings that cannot be
        converted to a float (e.g. None or free text), so that callers can
        drop unusable rows instead of crashing.
    """
    try:
        rating = float(stars)
    except (TypeError, ValueError):
        # Missing or non-numeric rating: treat it like any other
        # unrecognised value.
        return None

    if rating in (1.0, 2.0):
        return 0
    if rating == 3.0:
        return 1
    if rating in (4.0, 5.0):
        return 2
    return None

In [None]:
# Matches every character that is neither an ASCII letter nor whitespace.
# Compiled once with explicit flags: passing the flags as the fourth
# positional argument of re.sub would be interpreted as `count`.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]', flags=re.I | re.A)

# Lazily-built NLTK helpers shared by every clean_text call.
_CLEAN_TEXT_RESOURCES = {}


def _clean_text_resources():
    """Return the (lemmatizer, stop-word set) pair, building it on first use."""
    if not _CLEAN_TEXT_RESOURCES:
        # Build both before storing either, so a failed corpus look-up does
        # not leave a half-populated cache behind.
        lemmatizer = WordNetLemmatizer()
        stop_words = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_RESOURCES.update(lemmatizer=lemmatizer, stop_words=stop_words)
    return _CLEAN_TEXT_RESOURCES['lemmatizer'], _CLEAN_TEXT_RESOURCES['stop_words']


def clean_text(text):
    """Normalise a review text into a space-separated string of lemmas.

    Steps: remove every character that is not an ASCII letter or
    whitespace, lower-case the result, split on whitespace, drop English
    stop words, and lemmatize the remaining tokens.

    Args:
        text: The raw review text.

    Returns:
        The cleaned tokens joined by single spaces. An empty string when
        `text` is not a str or when no token survives the cleaning.
    """
    if not isinstance(text, str):
        return ""

    lemmatizer, stop_words = _clean_text_resources()
    letters_only = _NON_LETTER_RE.sub('', text).lower()

    return " ".join(
        lemmatizer.lemmatize(tok)
        for tok in letters_only.split()
        if tok not in stop_words
    )

In [None]:

if __name__ == "__main__":

    # Stage 1: read the raw reviews ('text' and 'stars' columns).
    df = load_data(INPUT_FILE)

    # Stage 2: turn the star rating into a class label; ratings that cannot
    # be mapped come back as missing values and are dropped below.
    print("Converting star ratings to sentiment labels...")
    df['sentiment'] = df['stars'].apply(create_multiclass_label)

    # .copy() makes the filtered frame independent, so the column
    # assignments that follow cannot trigger SettingWithCopyWarning.
    df = df.dropna(subset=['text', 'sentiment']).copy()
    df['sentiment'] = df['sentiment'].astype(int)

    # Stage 3: normalise the review text.
    print("Starting text preprocessing...")
    df['cleaned_text'] = df['text'].apply(clean_text)
    print("Text preprocessing complete.")

    # A review can be left with no tokens after cleaning. An empty CSV field
    # is read back as NaN by pandas' default settings, so such rows are
    # removed here rather than written out.
    has_text = df['cleaned_text'] != ""
    n_empty = int((~has_text).sum())
    if n_empty:
        print(f"Dropping {n_empty} reviews with no text left after cleaning.")
        df = df[has_text].copy()

    # Stage 4: keep only the model inputs and persist them.
    final_df = df[['cleaned_text', 'sentiment']]

    final_df.to_csv(OUTPUT_FILE, index=False)

    print("\n--- Success ---")
    print(f"Clean data saved to {OUTPUT_FILE}")
    print("\nHead of the new file:")
    print(final_df.head())