# Customer Sales Dashboard - EDA and Modeling (First Draft)

This Jupyter notebook serves as the first draft for the Streamlit customer sales dashboard app. It includes:
- Data loading and generation (sample data)
- Exploratory Data Analysis (EDA)
- Model training and prediction results

Run cells sequentially to explore.

In [None]:
# Cell 1: Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import plotly.express as px
import plotly.graph_objects as go

# Set style for plots
# Select matplotlib's 'default' style sheet first, then seaborn's "husl" palette.
# NOTE(review): the order looks deliberate (style sheet before palette) -- confirm
# before rearranging these two calls.
plt.style.use('default')
sns.set_palette("husl")

In [1]:
# Cell 2a: Load CSV Dataset
file_path = './data/test_dataset.csv'


def load_csv_data(file_path):
    """Read a CSV file into a DataFrame, reporting the outcome on stdout.

    This is a best-effort loader: any failure is printed instead of raised.

    Args:
        file_path: Location of the CSV file, handed straight to ``pd.read_csv``.

    Returns:
        The loaded ``pandas.DataFrame``, or ``None`` when loading failed.
    """
    loaded = None
    try:
        loaded = pd.read_csv(file_path)
        print(f"CSV data loaded successfully from {file_path}")
    except Exception as e:
        # Report the problem and signal failure with None rather than raising.
        print(f"Error loading CSV data: {e}")
        return None
    return loaded

In [None]:
# Cell 2: Load Sample Data Function
def load_sample_data(n_rows=1000, seed=42):
    """
    Generate synthetic sales data for demonstration.

    All random draws come from a private ``np.random.RandomState(seed)``, so
    the result is reproducible without reseeding NumPy's global generator
    (which would silently make later ``np.random`` calls elsewhere in the
    notebook deterministic). With the default seed the draws are the same as
    seeding the global generator with 42.

    Args:
        n_rows: Number of rows to generate (default 1000). Must be >= 0.
        seed: Seed for the private random generator (default 42).

    Returns:
        DataFrame sorted by Date with columns:
        Date, Customer_ID, Product, Category, Sales, Quantity, Region,
        Total_Sales (= Sales * Quantity).

    Raises:
        ValueError: If ``n_rows`` is negative.
    """
    if n_rows < 0:
        raise ValueError(f"n_rows must be non-negative, got {n_rows}")

    rng = np.random.RandomState(seed)
    start_date = datetime(2020, 1, 1)
    # One scalar draw per row (not a single vectorised draw) keeps the random
    # stream in the same order as the original implementation.
    dates = [
        start_date + timedelta(days=int(rng.randint(0, 1460)))
        for _ in range(n_rows)
    ]

    categories = ['Electronics', 'Clothing', 'Books', 'Home & Garden']
    regions = ['North', 'South', 'East', 'West']
    products = [f'Product_{i}' for i in range(1, 101)]

    # Dict order fixes both the column order and the order of the random draws.
    data = {
        'Date': pd.to_datetime(dates),
        'Customer_ID': rng.randint(1, 501, n_rows),   # ids 1..500
        'Product': rng.choice(products, n_rows),
        'Category': rng.choice(categories, n_rows),
        'Sales': rng.uniform(10, 1000, n_rows).round(2),
        'Quantity': rng.randint(1, 10, n_rows),       # upper bound exclusive: 1..9
        'Region': rng.choice(regions, n_rows),
    }

    df = pd.DataFrame(data)
    df['Total_Sales'] = df['Sales'] * df['Quantity']
    # The original row labels are kept; only the row order changes.
    df.sort_values('Date', inplace=True)

    return df

In [None]:
# Cell 3: Load the data
# Build the synthetic sales table; the later cells read it through the
# notebook-level name `df`.
df = load_sample_data()
print(f"Data shape: {df.shape}")
# Kept as the last expression of the cell so the first rows are shown as the
# cell output.
df.head()

## Exploratory Data Analysis (EDA)

In [None]:
# Cell 4: Data Overview
# Collect the headline figures first, then emit them as one report.
overview_lines = [
    "Data Overview",
    f"Total Records: {len(df)}",
    f"Total Sales: ${df['Total_Sales'].sum():,.2f}",
    f"Unique Customers: {df['Customer_ID'].nunique()}",
    f"Unique Products: {df['Product'].nunique()}",
]
print("\n".join(overview_lines))

# Preview of the first rows
display(df.head())

In [None]:
# Cell 5: Summary Statistics
print("Summary Statistics")
# describe() summarises the DataFrame's columns; render the table below the heading.
summary_stats = df.describe()
display(summary_stats)

In [None]:
# Cell 6: Sales over time (Monthly Trend)
fig, ax = plt.subplots(figsize=(10, 5))
# Resample with the month-end offset object rather than the 'M' alias string:
# pandas 2.2 deprecated 'M' in favour of 'ME', while older releases do not know
# 'ME'. The offset object gives the same month-end bins on both sides of that
# rename.
monthly_sales = df.resample(pd.offsets.MonthEnd(), on='Date')['Total_Sales'].sum()
monthly_sales.plot(ax=ax, title='Monthly Sales Trend')
# Label through the Axes object, consistent with the other plotting cells.
ax.set_xlabel('Date')
ax.set_ylabel('Total Sales')
plt.show()

In [None]:
# Cell 7: Sales by Category
# Aggregate first: total sales per category, largest first.
sales_per_category = df.groupby('Category')['Total_Sales'].sum()
category_sales = sales_per_category.sort_values(ascending=False)

# Then draw one bar per category on a fresh figure.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=category_sales.index, y=category_sales.values, ax=ax)
ax.set(title='Sales by Category', ylabel='Total Sales')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Cell 8: Sales by Region
# Aggregate first: total sales per region, largest first.
sales_per_region = df.groupby('Region')['Total_Sales'].sum()
region_sales = sales_per_region.sort_values(ascending=False)

# Then draw one bar per region on a fresh figure.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=region_sales.index, y=region_sales.values, ax=ax)
ax.set(title='Sales by Region', ylabel='Total Sales')
plt.show()

In [None]:
# Cell 9: Top 10 Customers by Total Spend
# Total spend per customer, keeping only the ten largest.
spend_per_customer = df.groupby('Customer_ID')['Total_Sales'].sum()
customer_spend = spend_per_customer.sort_values(ascending=False).head(10)

fig, ax = plt.subplots(figsize=(10, 5))
customer_spend.plot.bar(ax=ax)
ax.set(title='Top 10 Customers by Total Spend', ylabel='Total Spend')
plt.xticks(rotation=45)
plt.show()

## Model Prediction Results

In [None]:
# Cell 10: Train Model Function
def train_model(df):
    """
    Train a simple Random Forest model to predict Total_Sales.

    Features: Quantity, Month, Day and Year (the last three derived from
    Date), plus one-hot columns for Category and Region. The one-hot encoding
    uses ``drop_first=True``, so the first level of each categorical has no
    column of its own and acts as the reference level.

    The input DataFrame is left untouched: the derived date columns are added
    to a new frame rather than written back into the caller's ``df``.

    Args:
        df: DataFrame with a datetime 'Date' column and 'Quantity',
            'Category', 'Region' and 'Total_Sales' columns.

    Returns:
        Tuple ``(model, X_test, y_test, y_pred, mae, r2, feature_cols)``:
        the fitted regressor, the held-out 20% split, its predictions, the
        MAE and R² on that split, and the ordered list of feature names the
        model was trained on.
    """
    # Feature engineering on a new frame; assign() does not modify `df`.
    date_parts = df['Date'].dt
    features_df = df.assign(
        Month=date_parts.month,
        Day=date_parts.day,
        Year=date_parts.year,
    )

    # Encode categoricals
    df_encoded = pd.get_dummies(features_df, columns=['Category', 'Region'], drop_first=True)

    # Pick the dummy columns by prefix so an unrelated column that merely
    # contains 'Category_' or 'Region_' somewhere in its name is not pulled in.
    dummy_cols = [
        col for col in df_encoded.columns
        if str(col).startswith(('Category_', 'Region_'))
    ]
    feature_cols = ['Quantity', 'Month', 'Day', 'Year'] + dummy_cols
    X = df_encoded[feature_cols]
    y = df_encoded['Total_Sales']

    # Split and train (fixed seeds keep the split and the forest reproducible)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Evaluate on the held-out split
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    return model, X_test, y_test, y_pred, mae, r2, feature_cols

# Train the model and keep every returned piece under a notebook-level name,
# since the following cells read them directly.
(
    model,
    X_test,
    y_test,
    y_pred,
    mae,
    r2,
    feature_cols,
) = train_model(df)

In [None]:
# Cell 11: Model Performance
# Both metrics come from the train_model() call above; one per output line.
print(
    f"Mean Absolute Error (MAE): {mae:.2f}",
    f"R² Score: {r2:.4f}",
    sep="\n",
)

In [None]:
# Cell 12: Actual vs Predicted Sales
# End points of the y = x reference line, spanning the observed actual values.
actual_min, actual_max = y_test.min(), y_test.max()
perfect_line = go.Scatter(
    x=[actual_min, actual_max],
    y=[actual_min, actual_max],
    mode='lines',
    name='Perfect Prediction',
    line={'color': 'red', 'dash': 'dash'},
)

fig = px.scatter(
    x=y_test,
    y=y_pred,
    labels={'x': 'Actual Sales', 'y': 'Predicted Sales'},
    title='Actual vs Predicted Total Sales',
)
fig.add_trace(perfect_line)
fig.show()

In [None]:
# Cell 13: Feature Importance
# Pair each feature name with the importance the fitted model reports for it.
importance_table = pd.DataFrame({
    'Feature': feature_cols,
    'Importance': model.feature_importances_
})
importances = importance_table.sort_values('Importance', ascending=False)

# Plot only the ten highest-ranked features as horizontal bars.
top_features = importances.head(10)
fig = px.bar(
    top_features,
    x='Importance',
    y='Feature',
    orientation='h',
    title='Top 10 Feature Importances',
)
fig.show()

In [None]:
# Cell 14: Interactive Prediction (Manual Input Example)
# For notebook, we'll use fixed values; in Streamlit, this would be sliders
quantity = 5
category = 'Electronics'
region = 'North'
month = 6
day = 15
year = 2022

# Valid choices for the two categorical inputs
categories = ['Electronics', 'Clothing', 'Books', 'Home & Garden']
regions = ['North', 'South', 'East', 'West']

# Fail loudly on a typo instead of silently scoring the reference level.
if category not in categories:
    raise ValueError(f"Unknown category: {category!r}")
if region not in regions:
    raise ValueError(f"Unknown region: {region!r}")

# Numeric features
input_row = {'Quantity': quantity, 'Month': month, 'Day': day, 'Year': year}

# One-hot features: derive them from the columns the model was trained on
# (feature_cols), not from the order of the lists above.
# get_dummies(drop_first=True) drops the first level in sorted order
# ('Books' and 'East' for this data). Treating 'Electronics' / 'North' as the
# dropped levels would leave their indicator columns at 0, so the input would
# be scored as Books / East.
for col in feature_cols:
    if col.startswith('Category_'):
        input_row[col] = int(col == f'Category_{category}')
    elif col.startswith('Region_'):
        input_row[col] = int(col == f'Region_{region}')

# Match the training column order exactly; any feature not set above is 0.
input_df = pd.DataFrame([input_row]).reindex(columns=feature_cols, fill_value=0)

pred = model.predict(input_df)[0]
print(f"Predicted Total Sales for input: ${pred:.2f}")

## Next Steps
- Convert this notebook logic to Streamlit components for interactivity.
- Add more advanced visualizations or models as needed.
- Integrate real data loading instead of sample generation (the `load_csv_data` helper in Cell 2a is defined but not yet used).