# E-commerce Data Cleaning and Preprocessing

This notebook cleans and prepares the e-commerce sales dataset for analysis. We'll perform the following steps:
1. Data loading and initial inspection
2. Handling missing values
3. Removing duplicates
4. Handling outliers
5. Formatting date columns
6. Data validation
7. Saving the cleaned dataset

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import sys
sys.path.append('..')

from src.data_processing import DataProcessor
from src.config import RAW_DATA_DIR, PROCESSED_DATA_DIR

%matplotlib inline
plt.style.use('seaborn')

In [None]:
# Read the raw sales file from the configured raw-data directory
raw_data_path = RAW_DATA_DIR / 'sales_data.csv'
raw_data = pd.read_csv(raw_data_path)

# Report (rows, columns), then preview the first rows as the cell output
print(f"Dataset shape: {raw_data.shape}")
raw_data.head()

## Initial Data Analysis

In [None]:
# Data info and statistics
# info() prints its summary itself, so it is called as a plain statement
print("\nDataset Info:")
raw_data.info()

# describe() is the cell's last expression, so the notebook renders its result
print("\nBasic Statistics:")
raw_data.describe()

In [None]:
# Wrap the raw frame in the project's DataProcessor; the cleaning cells below
# all go through this one instance
processor = DataProcessor(raw_data)

# Per-column missing-value figures (printed as percentages); only columns
# with a value above zero are listed
missing_values = processor.check_missing_values()
print("Missing values percentage:")
for col, pct in missing_values.items():
    if not (pct > 0):
        continue  # nothing to report for this column
    print(f"{col}: {pct:.2f}%")

## Handle Missing Values

In [None]:
# Column -> rule for handling missing values. Key order is kept deliberately
# in case the processor applies the rules sequentially.
imputation_strategy = dict(
    price='median',
    quantity='median',
    customer_id='drop',
    product_id='drop',
    category='mode',
)

# Run the strategy through the processor and keep the frame it returns
cleaned_data = processor.handle_missing_values(imputation_strategy)

# Null count per column after the step, to confirm what is left
print("\nMissing values after imputation:")
remaining_nulls = cleaned_data.isnull().sum()
print(remaining_nulls)

## Remove Duplicates

In [None]:
# Flag rows that repeat an earlier row across all columns, then count them
duplicate_mask = cleaned_data.duplicated()
duplicate_count = duplicate_mask.sum()
print(f"Number of duplicate rows: {duplicate_count}")

# De-duplicate through the processor and report the resulting shape
cleaned_data = processor.handle_duplicates()
print(f"Shape after removing duplicates: {cleaned_data.shape}")

## Handle Outliers

In [None]:
# Visualize numerical distributions
numerical_cols = ['price', 'quantity']

# One boxplot panel per column. The grid is sized from the list so that adding
# or removing a column cannot go out of step with the number of axes.
# squeeze=False always returns an array of axes (even for a single column);
# ravel() flattens it so `axes` stays a 1-D sequence.
n_panels = len(numerical_cols)
fig, axes = plt.subplots(1, n_panels, figsize=(7.5 * n_panels, 5), squeeze=False)
axes = axes.ravel()

for ax, col in zip(axes, numerical_cols):
    sns.boxplot(data=cleaned_data, y=col, ax=ax)
    ax.set_title(f'Distribution of {col}')

plt.tight_layout()
plt.show()

In [None]:
# Outlier settings passed to the processor: method 'iqr' with threshold 1.5,
# applied to the two numeric columns
outlier_params = {
    'columns': ['price', 'quantity'],
    'method': 'iqr',
    'threshold': 1.5,
}
cleaned_data = processor.handle_outliers(**outlier_params)

# Shape after the step shows how the row count was affected, if at all
print(f"Shape after handling outliers: {cleaned_data.shape}")

## Format Dates

In [None]:
# Columns the processor should format as dates
date_columns = ['order_date']
cleaned_data = processor.format_dates(date_columns)

# Show the resulting dtype so the conversion can be checked by eye
print("\nDate column info:")
order_date_dtype = cleaned_data['order_date'].dtype
print(order_date_dtype)

## Validate Data

In [None]:
# Run the processor's validation checks; the result maps check name -> outcome
validation_results = processor.validate_data()

print("Validation Results:")
for check, result in validation_results.items():
    # NOTE(review): a truthy result is reported as "Failed", i.e. this assumes
    # validate_data() returns a problem flag/count per check — confirm against
    # DataProcessor; if it returns True for a passing check, this is inverted.
    status = 'Failed' if result else 'Passed'
    print(f"{check}: {status}")

## Save Cleaned Dataset

In [None]:
# Save processed data
output_path = PROCESSED_DATA_DIR / 'cleaned_sales_data.csv'

# to_csv does not create missing directories, so on a fresh checkout the write
# would fail; make sure the target folder exists first.
# NOTE(review): assumes PROCESSED_DATA_DIR is a pathlib.Path (it is joined
# with `/`) — confirm in src.config.
output_path.parent.mkdir(parents=True, exist_ok=True)

# index=False leaves the DataFrame index out of the written file
cleaned_data.to_csv(output_path, index=False)
print(f"Cleaned data saved to: {output_path}")