# RetailVista: EDA, Preprocessing, and Model Training
**Author: Leslie Fernando**

This notebook covers exploratory data analysis, data cleaning, feature engineering, model training, and evaluation for supermarket sales prediction.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import joblib

## 1. Load Data
*Notebook by Leslie Fernando*

Place your dataset (e.g., `supermarket_sales.csv`) in the `data/` folder one level above this notebook; the loader below reads `../data/supermarket_sales.csv`.

In [None]:
# Load data: read the supermarket sales CSV into `df`
import os
data_path = '../data/supermarket_sales.csv'
if os.path.exists(data_path):
    df = pd.read_csv(data_path)  # Update filename if needed
else:
    # Fail fast with an actionable message when the CSV is not where we expect it
    raise FileNotFoundError('Dataset not found! Please place supermarket_sales.csv in the data/ folder.')
df.head()

## 2. Data Preprocessing
- Handle missing values
- Encode categorical variables
- Feature engineering (e.g., extract date features)
- Check for outliers and data consistency
- Visualize distributions and correlations

## 3. Model Selection, Training, and Tuning
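Try multiple models (Random Forest, XGBoost) and tune hyperparameters for best results.

**Note:** the tuning cells use `X_train` and `y_train` from the *Feature Selection & Train-Test Split* section further down, the evaluation cell uses `best_rf` and `best_xgb` from the tuning cells, and the save cell uses `best_model` from the evaluation cell. Run the cells in that dependency order.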

## 4. Model Evaluation and Selection
Evaluate models using RMSE, MAE, and R². Select the best model for saving and deployment.

## 5. Save the Best Model and Deploy
Save the best performing model to the `model/` folder for use in the Streamlit app.

In [None]:
# Save the best model for deployment
# NOTE: `best_model` is assigned in the "Evaluate both models and select the
# best one" cell, so that cell must have been executed before this one.
# `joblib` is already imported in the notebook's import cell.
import os

# joblib.dump does not create missing parent folders; make sure model/ exists.
os.makedirs('../model', exist_ok=True)
joblib.dump(best_model, '../model/sales_predictor.joblib')
print('Best model saved to ../model/sales_predictor.joblib')

In [None]:
# Evaluate both models and select the best one
def print_metrics(y_true, y_pred, model_name):
    """Print RMSE, MAE and R2 for one set of predictions, followed by a divider.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values to compare against ``y_true``.
        model_name: Label shown in the header line of the report.

    Returns:
        None. The report is written to standard output.
    """
    # (label, metric function) pairs, evaluated and printed in this order
    metric_table = (
        ('RMSE:', lambda truth, pred: np.sqrt(mean_squared_error(truth, pred))),
        ('MAE:', mean_absolute_error),
        ('R2:', r2_score),
    )
    print(f'{model_name} Results:')
    for label, metric in metric_table:
        print(label, metric(y_true, y_pred))
    print('-' * 30)
# Predict on the test split with both tuned estimators and report their metrics
rf_pred = best_rf.predict(X_test)
xgb_pred = best_xgb.predict(X_test)
print_metrics(y_test, rf_pred, 'Random Forest (Tuned)')
print_metrics(y_test, xgb_pred, 'XGBoost (Tuned)')
# Choose the best model based on RMSE.
# Random Forest is picked only when its RMSE is strictly lower; ties go to XGBoost.
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
xgb_rmse = np.sqrt(mean_squared_error(y_test, xgb_pred))
if rf_rmse < xgb_rmse:
    best_model = best_rf
else:
    best_model = best_xgb

In [None]:
# Hyperparameter tuning example (XGBoost)
# GridSearchCV is imported here so this cell does not depend on the Random
# Forest tuning cell (which also imports it) having been run first.
# NOTE: X_train / y_train come from the train-test split cell.
from sklearn.model_selection import GridSearchCV

# Candidate settings: 2 x 3 x 3 = 18 combinations
xgb_params = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2]
}
xgb_model = xgb.XGBRegressor(random_state=42)
# 3-fold cross-validation; the score is negated RMSE, so higher is better
xgb_grid = GridSearchCV(xgb_model, xgb_params, cv=3, scoring='neg_root_mean_squared_error')
xgb_grid.fit(X_train, y_train)
print('Best XGB Params:', xgb_grid.best_params_)
# Keep the estimator for the best parameter combination for the evaluation cell
best_xgb = xgb_grid.best_estimator_

In [None]:
# Hyperparameter tuning example (Random Forest)
from sklearn.model_selection import GridSearchCV
# Candidate settings: 2 x 3 x 2 = 12 combinations
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
)
rf = RandomForestRegressor(random_state=42)
# 3-fold cross-validation scored by negated RMSE
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=3,
)
grid_search.fit(X_train, y_train)
# Keep the estimator for the best parameter combination for the evaluation cell
best_rf = grid_search.best_estimator_
print('Best RF Params:', grid_search.best_params_)

### Outlier Detection and Correlation Analysis
Check for outliers and analyze feature correlations to improve model performance.

In [None]:
# Outlier visualization and correlation heatmap
numeric_cols = df.select_dtypes(include=['number']).columns
numeric_df = df[numeric_cols]

# One box per numeric column, all drawn on a single shared axis
fig, ax = plt.subplots(figsize=(12, 6))
numeric_df.boxplot(ax=ax, rot=45)
ax.set_title('Boxplot of Numeric Features')
plt.show()

# Pairwise correlations between the numeric columns, annotated with their values
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Visualize missing values and basic statistics
# (matplotlib.pyplot and seaborn are already imported as `plt` / `sns` in the
# notebook's import cell, so they are not re-imported here.)
fig, ax = plt.subplots(figsize=(10, 4))
# Each cell of the heatmap is one value of `df`; missing entries show up as the
# contrasting colour.
sns.heatmap(df.isnull(), cbar=False, cmap='viridis', ax=ax)
ax.set_title('Missing Values Heatmap')
plt.show()
# Last expression in the cell, so the summary table is rendered as the output
df.describe()

### Check Data Columns and Types
It's important to check the column names and types to ensure compatibility with the rest of the notebook.

In [None]:
# Display columns and data types
# df.info() prints its summary directly, so it can appear before the final expression.
df.info()
# Last expression in the cell, so the first rows are rendered as the cell output.
df.head()

In [None]:
# Example preprocessing
# Drop every row that has at least one missing value
df = df.dropna()
# Convert date column if present, then derive calendar features from it.
# Later keyword arguments of assign() see the columns set by earlier ones,
# so DayOfWeek and Month are computed from the already-parsed Date.
if 'Date' in df.columns:
    df = df.assign(
        Date=lambda frame: pd.to_datetime(frame['Date']),
        DayOfWeek=lambda frame: frame['Date'].dt.dayofweek,
        Month=lambda frame: frame['Date'].dt.month,
    )
# Encode categoricals: each remaining object column is replaced by integer category codes
categorical_cols = [name for name in df.select_dtypes(include='object').columns if name != 'Date']
for name in categorical_cols:
    df[name] = df[name].astype('category').cat.codes
df.head()

## 3. Feature Selection & Train-Test Split
Update the target and features as per your dataset.

In [None]:
# Example: Predict 'Total' sales
target = 'Total'  # Change as needed
# Exclude the target and any datetime columns from the features.
# The preprocessing cell converts 'Date' to a datetime column and keeps it in
# `df`; the regressors trained below only accept numeric input, so leaving it
# in X makes .fit() raise. Its information is already carried by the derived
# DayOfWeek and Month columns.
datetime_cols = set(df.select_dtypes(include=['datetime', 'datetimetz']).columns)
features = [col for col in df.columns if col != target and col not in datetime_cols]
X = df[features]
y = df[target]
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## 4. Model Training (Random Forest & XGBoost)

In [None]:
# Fit both baseline regressors on the training split (fit() returns the estimator)
# Random Forest
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
# XGBoost
xgb_model = xgb.XGBRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
# Predictions on the test split, used by the evaluation cell
rf_pred = rf.predict(X_test)
xgb_pred = xgb_model.predict(X_test)

## 5. Evaluation
Compare models using RMSE, MAE, and R².

In [None]:
def print_metrics(y_true, y_pred, model_name):
    """Print RMSE, MAE and R2 for one set of predictions, followed by a divider.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values to compare against ``y_true``.
        model_name: Label shown in the header line of the report.

    Returns:
        None. The report is written to standard output.
    """
    # (label, metric function) pairs, evaluated and printed in this order
    metric_table = (
        ('RMSE:', lambda truth, pred: np.sqrt(mean_squared_error(truth, pred))),
        ('MAE:', mean_absolute_error),
        ('R2:', r2_score),
    )
    print(f'{model_name} Results:')
    for label, metric in metric_table:
        print(label, metric(y_true, y_pred))
    print('-' * 30)
# Report the test-split metrics for each baseline model in turn
for model_label, predictions in (('Random Forest', rf_pred), ('XGBoost', xgb_pred)):
    print_metrics(y_test, predictions, model_label)

## 6. Save Best Model
Save a trained model to the `model/` folder. The cell below saves the XGBoost model as an example; swap in whichever model performed best.

In [None]:
# Save XGBoost model as example
# NOTE: this writes to the same path as the "Save the best model for
# deployment" cell, so whichever of the two cells runs last determines the
# contents of the file.
import os

# joblib.dump does not create missing parent folders; make sure model/ exists.
os.makedirs('../model', exist_ok=True)
joblib.dump(xgb_model, '../model/sales_predictor.joblib')
print('Model saved to ../model/sales_predictor.joblib')