In [5]:
import pandas as pd
import numpy as np
import os

# Make sure the destination folder for processed data is present
os.makedirs("../data/processed", exist_ok=True)

# Source file and the mapping from its column headers to snake_case names
raw_csv_path = "../data/raw/Womens Clothing E-Commerce Reviews.csv"
column_names = {'Review Text': 'review_text', 'Rating': 'rating'}

# Read the raw file, keep only the two mapped columns, rename them,
# and discard rows that have no review text
df = (
    pd.read_csv(raw_csv_path)[list(column_names)]
    .rename(columns=column_names)
    .dropna(subset=['review_text'])
)

# Map rating to sentiment
def map_sentiment(rating):
    """Convert a numeric rating into a sentiment label.

    Args:
        rating: Numeric rating value to classify.

    Returns:
        'positive' when ``rating >= 4``, 'negative' when ``rating <= 2``,
        and ``np.nan`` otherwise. The "otherwise" case covers ratings
        strictly between 2 and 4 as well as a NaN rating, for which both
        comparisons are false.
    """
    # Guard clause for the positive range
    if rating >= 4:
        return 'positive'
    # Everything that is not clearly negative is left unlabeled (NaN)
    return 'negative' if rating <= 2 else np.nan

# Label every review, then discard rows whose rating produced no label
df['sentiment'] = df['rating'].map(map_sentiment)
df = df.dropna(subset=['sentiment'])

# Optional: check how many reviews fall under each label
label_counts = df['sentiment'].value_counts()
print("Sentiment distribution:")
print(label_counts)

# Write the cleaned dataset to disk without the index column
output_path = "../data/processed/clean_reviews.csv"
df.to_csv(output_path, index=False)
print(f"Clean data saved as: {output_path}")


Sentiment distribution:
sentiment
positive    17448
negative     2370
Name: count, dtype: int64
Clean data saved as: ../data/processed/clean_reviews.csv
