In [1]:
# 🧭 05-eda-reviews.ipynb

# -------------------------------
# 🛠️ Setup & Imports
# -------------------------------
import sys
from pathlib import Path
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from textwrap import wrap

# Add utility path
sys.path.append(str((Path().resolve().parent.parent / "src" / "utils").resolve()))
from load_data import load_csv, save_csv

# -------------------------------
# 📥 Load Raw Data
# -------------------------------
# load_csv is the project helper imported from the load_data utility module;
# how it resolves the file name to a location is defined there, not here.
df = load_csv("reviews.csv")
# A bare `df.head()` is only rendered when it is the last expression of a
# notebook cell; print it so the preview is actually shown.
print(df.head())

# -------------------------------
# 🔍 Initial Exploration
# -------------------------------
# df.info() writes its summary to stdout itself; the calls below only return
# objects, so they must be printed explicitly to be visible mid-cell.
df.info()
print(df.describe())
print(df['rating'].value_counts())

# Null checks: number of missing values per column
print(df.isna().sum())

# -------------------------------
# 🧹 Data Cleaning
# -------------------------------

# Ensure types. errors='coerce' turns unparseable values into NaN / NaT so
# that the dropna below removes those rows.
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
df['review_date'] = pd.to_datetime(df['review_date'], errors='coerce')

# Clean text fields. astype(str) alone would turn a missing review into the
# literal string "nan" (which then counts as 3 characters and shows up as a
# word); .where(...) restores NaN wherever the original value was missing.
raw_review_text = df['review_text']
df['review_text'] = (
    raw_review_text.astype(str).str.strip().where(raw_review_text.notna())
)

# Drop incomplete records (review_text is deliberately not required here)
df = df.dropna(subset=['customer_id', 'product_id', 'rating', 'review_date'])

# -------------------------------
# 🧠 Feature Engineering
# -------------------------------

# Review length, measured in characters of the cleaned review text
review_text = df['review_text']
df['review_length'] = review_text.str.len()


def _label_sentiment(rating):
    """Map a numeric rating to a coarse sentiment label.

    Ratings >= 4 are 'positive', ratings <= 2 are 'negative', and every
    other value (including NaN, for which both comparisons are false) is
    'neutral'.
    """
    if rating >= 4:
        return 'positive'
    if rating <= 2:
        return 'negative'
    return 'neutral'


# Review sentiment (basic proxy derived from the rating alone, not the text)
df['sentiment'] = df['rating'].apply(_label_sentiment)

# Join with product metadata
products_df = load_csv("products_clean.csv")
# Keep a single metadata row per product_id before joining: a left merge
# emits one output row per matching right-hand row, so a duplicated
# product_id would silently duplicate reviews and skew every count below.
product_meta = (
    products_df[['product_id', 'category', 'brand']]
    .drop_duplicates(subset='product_id')
)
# how='left' keeps every review; unmatched products get NaN category/brand.
df = df.merge(product_meta, on='product_id', how='left')

# -------------------------------
# 📊 Exploratory Data Analysis
# -------------------------------

# Rating distribution: one bar per distinct rating value
rating_ax = sns.countplot(data=df, x='rating')
rating_ax.set_title("Review Ratings Distribution")
plt.show()

# Sentiment breakdown
# Passing `palette` without `hue` is deprecated as of seaborn 0.13, so the
# x variable is also assigned to hue. dodge=False keeps each bar full-width
# and centred on its category instead of being offset per hue level.
sentiment_ax = sns.countplot(
    x='sentiment',
    hue='sentiment',
    data=df,
    palette='Set2',
    dodge=False,
)
# The hue legend would only repeat the x-axis labels; remove it if one was
# drawn (whether it is drawn depends on the seaborn version).
sentiment_legend = sentiment_ax.get_legend()
if sentiment_legend is not None:
    sentiment_legend.remove()
plt.title("Sentiment Distribution")
plt.show()

# Average rating by category, sorted from highest to lowest mean.
# Rows whose category is NaN are excluded by groupby's default behaviour.
mean_by_category = df.groupby('category')['rating'].mean()
cat_rating = mean_by_category.sort_values(ascending=False)
cat_rating.plot.barh(figsize=(8, 6), title='Avg Rating by Category')
plt.xlabel("Avg Rating")
plt.show()

# Review length histogram (40 bins over the per-review character counts)
review_lengths = df['review_length']
length_ax = sns.histplot(review_lengths, bins=40)
length_ax.set_title("Review Length Distribution")
length_ax.set_xlabel("Characters")
plt.show()

# Word cloud for positive reviews
positive_reviews = df.loc[df['sentiment'] == 'positive', 'review_text'].dropna()
# astype(str) guards the join against any non-string entries left in the column.
text = ' '.join(positive_reviews.astype(str))
try:
    # WordCloud.generate raises ValueError when the text contains no usable
    # words (e.g. there are no positive reviews).
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
except ValueError as err:
    # Skip only this plot so the rest of the notebook (including the final
    # save step) still runs.
    print(f"Skipping positive review word cloud: {err}")
else:
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title("Positive Review Word Cloud")
    plt.show()

# -------------------------------
# 💾 Save Cleaned File
# -------------------------------
# Persist the cleaned, feature-enriched reviews via the project's save_csv
# helper; the destination directory is decided by that helper, not here.
cleaned_output_name = "reviews_clean.csv"
save_csv(df, cleaned_output_name)


ModuleNotFoundError: No module named 'wordcloud'