# 🧹 Data Cleaning — Amazon Sales Dataset

This notebook identifies data quality issues, exports bad rows to a separate CSV, fixes the main dataset, and re-runs validation.

**Issues Found:**
| Issue | Count | Fix |
|-------|------:|-----|
| Empty `currency` | 7,795 | Fill with `"INR"` |
| Empty `Amount` | 7,795 | Fill with `0.0` (cancelled orders) |
| Empty `ship-country` | 33 | Fill with `"IN"` |
| Duplicate `Order ID` | 15,443 | Multi-item orders → remove unique constraint |
| Unknown `Status` values | 295 | Add missing statuses to valid list |

## 1. Load Data

In [None]:
import pandas as pd
import importlib

# Input dataset, and the file that flagged rows are exported to.
CSV_PATH = "data/amazon_sales.csv"
BAD_ROWS_PATH = "data/bad_rows.csv"

# low_memory=False has pandas parse the file in a single pass rather than in
# internal chunks, which avoids mixed-dtype inference within a column.
df = pd.read_csv(CSV_PATH, low_memory=False)

row_count, column_count = df.shape
print(f"Rows: {row_count:,}  |  Columns: {column_count}")
df.head()

## 2. Inspect Data Quality Issues

In [None]:
# Columns inspected for missing values.
key_cols = [
    "Order ID",
    "Date",
    "Status",
    "Fulfilment",
    "currency",
    "Qty",
    "Amount",
    "ship-country",
]

# Missing-value total per key column; only columns with at least one are shown.
null_counts = df[key_cols].isna().sum()
print("=== NULL COUNTS ===")
print(null_counts.loc[null_counts > 0])
print()

# Frequency table (NaN counted as its own bucket) for each categorical column.
for header, column in (
    ("=== All Status Values ===", "Status"),
    ("=== Currency Values ===", "currency"),
    ("=== Ship-Country Values ===", "ship-country"),
):
    print(header)
    print(df[column].value_counts(dropna=False))
    print()

# Count rows whose Order ID already appeared on an earlier row.
print("=== Duplicate Order IDs ===")
dup_count = df["Order ID"].duplicated().sum()
print(f"{dup_count:,} duplicate Order ID rows (multi-item orders)")

## 3. Identify & Export Bad Rows

In [None]:
# One entry per data-quality check:
#   (tag appended to the "issue" column, console label, boolean row mask).
# Labels are pre-padded so the printed counts line up in one column.
issue_checks = [
    ("null_currency; ", "Null currency:     ", df["currency"].isnull()),
    ("null_amount; ", "Null Amount:       ", df["Amount"].isnull()),
    ("null_ship_country; ", "Null ship-country: ", df["ship-country"].isnull()),
    ("null_order_id; ", "Null Order ID:     ", df["Order ID"].isnull()),
    ("negative_qty; ", "Negative Qty:      ", df["Qty"] < 0),
]

# A row is "bad" as soon as any single check flags it.
bad_mask = issue_checks[0][2]
for _tag, _label, check_mask in issue_checks[1:]:
    bad_mask = bad_mask | check_mask

# Annotate a copy so df itself is left untouched by this step.
bad_rows = df.loc[bad_mask].copy()
bad_rows["issue"] = ""
for tag, _label, check_mask in issue_checks:
    # The masks carry df's index; .loc aligns them onto the bad_rows subset,
    # so a row failing several checks accumulates several tags.
    bad_rows.loc[check_mask, "issue"] += tag

print(f"Total bad rows: {len(bad_rows):,}")
print()
print("Issue breakdown:")
for _tag, label, check_mask in issue_checks:
    print(f"  {label}{check_mask.sum():,}")
print()
bad_rows.head(10)

In [None]:
# Persist the flagged rows to their own CSV; the DataFrame index is not written.
bad_rows.to_csv(BAD_ROWS_PATH, index=False)

exported_count = len(bad_rows)
print(f"‚úÖ Exported {exported_count:,} bad rows to: {BAD_ROWS_PATH}")

## 4. Fix Data Issues

In [None]:
# Replacement value for missing entries in each column, applied in this order:
#   Fix 1: currency     -> "INR"
#   Fix 2: Amount       -> 0.0 (cancelled orders)
#   Fix 3: ship-country -> "IN"
fill_defaults = {
    "currency": "INR",
    "Amount": 0.0,
    "ship-country": "IN",
}

for column, default in fill_defaults.items():
    # Report the null count before and after so each fix is visibly confirmed.
    before = df[column].isnull().sum()
    df[column] = df[column].fillna(default)
    after = df[column].isnull().sum()
    print(f"‚úÖ {column}: {before} nulls ‚Üí {after} nulls")

## 5. Verify Fixes

In [None]:
# Recount missing values in the key columns and keep only the non-zero ones.
null_counts_after = df[key_cols].isna().sum()
remaining = null_counts_after.loc[null_counts_after > 0]

print("=== REMAINING NULLS ===")
print("‚úÖ No nulls remaining in key columns!" if remaining.empty else remaining)

print()
print("=== VALUE CHECKS ===")
# Labels are pre-padded so the reported values line up in one column.
for label, value in (
    ("Unique currencies:     ", df["currency"].unique()),
    ("Unique ship-countries: ", df["ship-country"].unique()),
    ("Negative Qty count:    ", (df["Qty"] < 0).sum()),
    ("Negative Amount count: ", (df["Amount"] < 0).sum()),
):
    print(f"{label}{value}")

## 6. Save Cleaned Data

In [None]:
# Write the cleaned frame back to CSV_PATH, replacing the original file.
# The DataFrame index is not written.
df.to_csv(CSV_PATH, index=False)

saved_rows, saved_columns = df.shape
print(f"‚úÖ Cleaned data saved to: {CSV_PATH}")
print(f"   Rows: {saved_rows:,}  |  Columns: {saved_columns}")

## 7. Re-run Validation Pipeline

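> ⚠️ **Important:** The cell below force-reloads `src.ge_validation` and `src.pydantic_validation`, but `importlib.reload` does not reload the modules those two import. If you edited any other source modules, restart the kernel before running this cell so the latest code is loaded.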

In [None]:
# Reload both validation modules so edits made since their first import are
# picked up without restarting the kernel.
import src.ge_validation as ge_mod
import src.pydantic_validation as py_mod

for module in (ge_mod, py_mod):
    importlib.reload(module)

# Import the entry points after the reload so the names bind to the fresh code.
from src.ge_validation import run_ge_validation
from src.pydantic_validation import run_pydantic_validation

# Validate what is actually on disk, not the in-memory frame.
df_clean = pd.read_csv(CSV_PATH, low_memory=False)

banner = "=" * 60
print("\n" + banner)
print("   RE-RUNNING VALIDATION ON CLEANED DATA")
print(banner)

ge_summary = run_ge_validation(df_clean)
pydantic_summary = run_pydantic_validation(df_clean)

# Each summary exposes its verdict under "overall_success".
ge_ok = ge_summary["overall_success"]
pydantic_ok = pydantic_summary["overall_success"]
all_ok = ge_ok and pydantic_ok

ge_mark = "‚úÖ" if ge_ok else "‚ùå"
pydantic_mark = "‚úÖ" if pydantic_ok else "‚ùå"
overall_text = "‚úÖ ALL PASSED" if all_ok else "‚ùå ISSUES FOUND"

print("\n" + banner)
print("   FINAL RESULT")
print(banner)
print(f"   GE Validation      : {ge_mark}")
print(f"   Pydantic Validation : {pydantic_mark}")
print(f"   Overall             : {overall_text}")
print(banner)