# IRON KAGGLE

- **shop_ID** (column `Store_ID`): Unique identifier for each shop.
- **day_of_the_week** (column `Day_of_week`): Integer from 1 to 7 encoding the day of the week.
- **date** (column `Date`): Day, month, and year of the data point.
- **number_of_customers** (column `Nb_customers_on_day`): Number of customers who visited the shop on that day.
- **open** (column `Open`): Binary variable; 0 means the shop was closed, 1 means it was open.
- **promotion** (column `Promotion`): Binary variable; 0 means no promotion was running, 1 means a promotion was running.
- **state_holiday** (column `State_holiday`): Categorical variable; 0 means no state holiday, while 'a', 'b', and 'c' each denote a different state holiday.
- **school_holiday** (column `School_holiday`): Binary variable; 0 means no school holiday, 1 means there was a school holiday.
- **sales** (column `Sales`): The target variable that the models in this notebook predict.

## Lib Loading

In [121]:
import os
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, root_mean_squared_error

import datetime

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression


## Data Loading

In [122]:
data = pd.read_csv('data/sales.csv')
sales_df = data.copy()

## Data Exploration

In [None]:
# Quick structural overview of the raw sales data.
display(sales_df.shape)               # (rows, columns)
display(sales_df.head())              # first rows
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would additionally render "None".
sales_df.info()
display(sales_df.isna().sum())        # missing values per column
display(sales_df.duplicated().sum())  # number of fully duplicated rows

## Data Cleaning

In [124]:
# Keep only the rows where the shop was open (Open != 0).
# The explicit .copy() turns the filtered result into an independent frame, so
# the column assignments in the feature-engineering cells below do not raise
# pandas' SettingWithCopyWarning.
sales_df = sales_df.loc[sales_df['Open'] != 0].copy()

## Feature Engineering

### Dates

In [125]:
# Parse the dates of the working frame itself. The original parsed the raw
# `data` frame (including the closed-shop rows that were already dropped) and
# relied on index alignment to assign the result back.
sales_df['Date'] = pd.to_datetime(sales_df['Date'])

# Split the date into calendar components usable as numeric features.
date_parts = sales_df['Date'].dt
sales_df['Year'] = date_parts.year
sales_df['Month'] = date_parts.month
sales_df['Day'] = date_parts.day

### Categories

In [None]:
# Preview the first five rows of the frame after the date columns were added.
display(sales_df.head(n=5))

### Additional Features

In [127]:
# Weekend flag: 1 when Day_of_week is 6 or 7, otherwise 0.
# Vectorised comparison instead of a row-by-row apply().
sales_df['Is_weekend'] = (sales_df['Day_of_week'] >= 6).astype(int)

# Average sales per customer for each row.
# NOTE: this column is computed from the target `Sales`; it is useful for
# exploration but must be excluded from the model features, otherwise the
# target leaks into the inputs.
sales_df['Sales_per_customer'] = sales_df['Sales'] / sales_df['Nb_customers_on_day']

# Interaction feature: 1 only when a promotion runs during a school holiday.
# No Promotion x State_holiday interaction is built here: State_holiday is
# categorical (0 / 'a' / 'b' / 'c'), so it cannot simply be multiplied.
sales_df['Promo_School_Holiday'] = sales_df['Promotion'] * sales_df['School_holiday']

In [128]:
# Per-store summary statistics, one row per Store_ID. Named aggregation gives
# the flat "Store_<column>_<statistic>" column names directly.
# NOTE(review): the statistics are computed over every row of sales_df, i.e.
# before any train/test split - confirm this is intended for the Sales-based
# columns.
store_stats = (
    sales_df.groupby('Store_ID')
    .agg(
        Store_Sales_mean=('Sales', 'mean'),
        Store_Sales_std=('Sales', 'std'),
        Store_Sales_median=('Sales', 'median'),
        Store_Nb_customers_on_day_mean=('Nb_customers_on_day', 'mean'),
        Store_Nb_customers_on_day_std=('Nb_customers_on_day', 'std'),
        Store_Nb_customers_on_day_median=('Nb_customers_on_day', 'median'),
        Store_Promotion_mean=('Promotion', 'mean'),            # promotion frequency
        Store_School_holiday_mean=('School_holiday', 'mean'),  # school holiday frequency
    )
    .reset_index()
)

# Rank the stores into four equally sized groups by their mean sales.
quartile_labels = ['Q1', 'Q2', 'Q3', 'Q4']
store_stats['Store_sales_quartile'] = pd.qcut(
    store_stats['Store_Sales_mean'], q=4, labels=quartile_labels
)

# Attach the store-level columns to every daily row of the matching store.
sales_df = sales_df.merge(store_stats, on='Store_ID', how='inner')

In [129]:
# State_holiday mixes the number 0 with the letters 'a'/'b'/'c'; depending on
# how the CSV was parsed, "no holiday" can show up both as int 0 and as
# str '0'. Casting to str first guarantees a single category per value.
# NOTE(review): a missing value would become the category 'nan' - confirm the
# column has no NaN (see the isna() summary in the exploration cell).
sales_df['State_holiday'] = sales_df['State_holiday'].astype(str)

# One-hot encode the state holiday and the store sales quartile columns.
# drop_first=True drops the first category of each column as the baseline.
sales_df = pd.get_dummies(
    sales_df,
    columns=['State_holiday', 'Store_sales_quartile'],
    drop_first=True,
)

### DF Check

In [None]:
# Sanity check: first five rows after all feature-engineering steps.
display(sales_df.head(n=5))

## Correlation Matrix

In [None]:
# sales_df still contains the datetime `Date` column at this point;
# numeric_only=True restricts the correlation to numeric/boolean columns so
# the call does not depend on how pandas handles datetime values.
corr_matrix = sales_df.corr(numeric_only=True)

# Keep only the correlations with the target column.
corr_matrix_target = corr_matrix[['Sales']]

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix_target, annot=True, cmap='coolwarm', fmt='.2f')
plt.show()

## Data Split

### Define feature & target

In [132]:
# Columns that must not be fed to the models:
#   Sales               - the target itself
#   Sales_per_customer  - computed as Sales / Nb_customers_on_day, so keeping
#                         it would leak the target into the features
#   Date                - raw datetime, already expanded into Year/Month/Day
#   True_index,
#   Store_ID            - dropped as before (presumably row/store identifiers)
#   Open                - closed-shop rows were removed during cleaning, so
#                         this column no longer carries information
# NOTE(review): the Store_Sales_* statistics are aggregated from Sales over
# all rows (before the train/test split) - consider computing them on the
# training rows only.
excluded_columns = [
    'Sales',
    'Sales_per_customer',
    'Date',
    'True_index',
    'Store_ID',
    'Open',
]
features = sales_df.drop(columns=excluded_columns)
target = sales_df['Sales']

### Split the training and test

In [133]:
# TODO Try different sample sizes
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
)

## Scaling

In [134]:
# TODO Try different scalers

# The scaler learns its parameters from the training rows only and the same
# transformation is then applied to the test rows.
normalizer = MinMaxScaler()
X_train_norm = normalizer.fit_transform(X_train)
X_test_norm = normalizer.transform(X_test)

In [None]:
# The scaler returns a plain array; re-attach the feature names so the
# preview shows which scaled column is which instead of 0..n-1.
display(pd.DataFrame(X_train_norm, columns=X_train.columns))

## Training

### Simple models

In [136]:
# k-nearest-neighbours baseline with k = 3.
n_neighbors = 3
# Hyper-parameters this model does not use stay None; the evaluation cell
# still writes them to the results log.
max_depth = None
n_estimators = None
max_samples = None
max_leafs_nodes = None
max_features = None
# This model wraps no base estimator. Reset the variable (as the ensemble
# cells do) so the evaluation cell neither fails on an undefined name nor
# labels this run with an estimator left over from a previously run cell.
estimator = None

model = KNeighborsRegressor(n_neighbors=n_neighbors, n_jobs=-1)

In [137]:
# k-nearest-neighbours baseline with k = 10.
n_neighbors = 10
# Hyper-parameters this model does not use stay None; the evaluation cell
# still writes them to the results log.
max_depth = None
n_estimators = None
max_samples = None
max_leafs_nodes = None
max_features = None
# This model wraps no base estimator. Reset the variable (as the ensemble
# cells do) so the evaluation cell neither fails on an undefined name nor
# labels this run with an estimator left over from a previously run cell.
estimator = None

model = KNeighborsRegressor(n_neighbors=n_neighbors, n_jobs=-1)

In [138]:
# Ordinary least-squares baseline; it uses none of the logged
# hyper-parameters, so all of them stay None.
n_neighbors = None
max_depth = None
n_estimators = None
max_samples = None
max_leafs_nodes = None
max_features = None
# This model wraps no base estimator. Reset the variable (as the ensemble
# cells do) so the evaluation cell neither fails on an undefined name nor
# labels this run with an estimator left over from a previously run cell.
estimator = None

model = LinearRegression()

In [139]:
# Single decision tree limited to a depth of 10.
max_depth = 10
# Hyper-parameters this model does not use stay None; the evaluation cell
# still writes them to the results log.
n_neighbors = None
n_estimators = None
max_samples = None
max_leafs_nodes = None
max_features = None
# This model wraps no base estimator. Reset the variable (as the ensemble
# cells do) so the evaluation cell neither fails on an undefined name nor
# labels this run with an estimator left over from a previously run cell.
estimator = None

model = DecisionTreeRegressor(max_depth=max_depth)

In [140]:
# Single decision tree with a depth limit of 1000.
max_depth = 1000
# Hyper-parameters this model does not use stay None; the evaluation cell
# still writes them to the results log.
n_neighbors = None
n_estimators = None
max_samples = None
max_leafs_nodes = None
max_features = None
# This model wraps no base estimator. Reset the variable (as the ensemble
# cells do) so the evaluation cell neither fails on an undefined name nor
# labels this run with an estimator left over from a previously run cell.
estimator = None

model = DecisionTreeRegressor(max_depth=max_depth)

In [141]:
# Single decision tree with a depth limit of 1000.
# NOTE(review): this configuration is identical to the preceding
# depth-1000 tree cell - presumably one of them was meant to vary another
# hyper-parameter (e.g. max_leaf_nodes or max_features); confirm or remove.
max_depth = 1000
# Hyper-parameters this model does not use stay None; the evaluation cell
# still writes them to the results log.
n_neighbors = None
n_estimators = None
max_samples = None
max_leafs_nodes = None
max_features = None
# This model wraps no base estimator. Reset the variable (as the ensemble
# cells do) so the evaluation cell neither fails on an undefined name nor
# labels this run with an estimator left over from a previously run cell.
estimator = None

model = DecisionTreeRegressor(max_depth=max_depth)

### Ensemble

In [142]:
# Bagging ensemble of linear regressions: 100 members, each trained on a
# sample of 1000 training rows.
n_estimators, max_samples = 100, 1000
# Hyper-parameters unused by this model stay None (they are still logged).
n_neighbors = max_depth = max_leafs_nodes = max_features = None
estimator = LinearRegression()

model = BaggingRegressor(
    estimator=estimator,
    n_estimators=n_estimators,
    max_samples=max_samples,
)

In [143]:
# Bagging ensemble of 10-nearest-neighbour regressors: 100 members, each
# trained on a sample of 1000 training rows.
n_neighbors = 10
n_estimators, max_samples = 100, 1000
# Hyper-parameters unused by this model stay None (they are still logged).
max_depth = max_leafs_nodes = max_features = None
estimator = KNeighborsRegressor(n_neighbors=n_neighbors, n_jobs=-1)

model = BaggingRegressor(
    estimator=estimator,
    n_estimators=n_estimators,
    max_samples=max_samples,
)

In [144]:
# Bagging ensemble of decision trees (depth limit 1000): 100 members, each
# trained on a sample of 1000 training rows.
max_depth = 1000
n_estimators, max_samples = 100, 1000
# Hyper-parameters unused by this model stay None (they are still logged).
n_neighbors = max_leafs_nodes = max_features = None
estimator = DecisionTreeRegressor(max_depth=max_depth)

model = BaggingRegressor(
    estimator=estimator,
    n_estimators=n_estimators,
    max_samples=max_samples,
)

In [145]:
# Same tree ensemble as the bagging variant, but with bootstrap=False the
# 1000 rows of each member are drawn without replacement.
max_depth = 1000
n_estimators, max_samples = 100, 1000
# Hyper-parameters unused by this model stay None (they are still logged).
n_neighbors = max_leafs_nodes = max_features = None
estimator = DecisionTreeRegressor(max_depth=max_depth)

model = BaggingRegressor(
    estimator=estimator,
    n_estimators=n_estimators,
    max_samples=max_samples,
    bootstrap=False,
)

In [146]:
# Random forest: 100 trees with a depth limit of 1000, trained on all cores.
max_depth, n_estimators = 1000, 100
# Hyper-parameters unused by this model stay None (they are still logged);
# the forest builds its own trees, so there is no separate base estimator.
n_neighbors = max_samples = max_leafs_nodes = max_features = None
estimator = None

model = RandomForestRegressor(
    n_estimators=n_estimators,
    max_depth=max_depth,
    n_jobs=-1,
)

In [147]:
# AdaBoost over decision trees with a depth limit of 100, 100 boosting rounds.
max_depth = 100
n_estimators = 100
# Hyper-parameters this model does not use stay None; the evaluation cell
# still writes them to the results log.
n_neighbors = None
max_samples = None
max_leafs_nodes = None
max_features = None
estimator = DecisionTreeRegressor(max_depth=max_depth)

# The base estimator has to be passed explicitly. Without it AdaBoostRegressor
# silently falls back to its default DecisionTreeRegressor(max_depth=3), while
# the results log would still report the estimator and max_depth set above.
model = AdaBoostRegressor(estimator=estimator, n_estimators=n_estimators)

In [148]:
# AdaBoost over 10-nearest-neighbour regressors, 100 boosting rounds.
n_neighbors = 10
n_estimators = 100
# Hyper-parameters this model does not use stay None; the evaluation cell
# still writes them to the results log.
max_depth = None
max_samples = None
max_leafs_nodes = None
max_features = None
estimator = KNeighborsRegressor(n_neighbors=n_neighbors, n_jobs=-1)

# The base estimator has to be passed explicitly. Without it AdaBoostRegressor
# silently falls back to its default DecisionTreeRegressor(max_depth=3), while
# the results log would still report the k-NN estimator defined above.
model = AdaBoostRegressor(estimator=estimator, n_estimators=n_estimators)

In [149]:
# AdaBoost over linear regressions, 100 boosting rounds.
n_estimators = 100
# Hyper-parameters this model does not use stay None; the evaluation cell
# still writes them to the results log.
n_neighbors = None
max_depth = None
max_samples = None
max_leafs_nodes = None
max_features = None
estimator = LinearRegression()

# The base estimator has to be passed explicitly. Without it AdaBoostRegressor
# silently falls back to its default DecisionTreeRegressor(max_depth=3), while
# the results log would still report the linear estimator defined above.
model = AdaBoostRegressor(estimator=estimator, n_estimators=n_estimators)

In [150]:
# Gradient boosting: 100 stages of trees with a depth limit of 100.
# verbose=1 prints training progress.
max_depth, n_estimators = 100, 100
# Hyper-parameters unused by this model stay None (they are still logged);
# there is no separate base estimator.
n_neighbors = max_samples = max_leafs_nodes = max_features = None
estimator = None

model = GradientBoostingRegressor(
    max_depth=max_depth,
    n_estimators=n_estimators,
    verbose=1,
)

In [151]:
# Gradient boosting: 100 stages of trees with a depth limit of 20.
# verbose=1 prints training progress.
max_depth, n_estimators = 20, 100
# Hyper-parameters unused by this model stay None (they are still logged);
# there is no separate base estimator.
n_neighbors = max_samples = max_leafs_nodes = max_features = None
estimator = None

model = GradientBoostingRegressor(
    max_depth=max_depth,
    n_estimators=n_estimators,
    verbose=1,
)

### Active

In [152]:
# Currently selected model: random forest with 100 trees, depth limit 100 and
# no limit on the number of leaves (max_leaf_nodes=None), trained on all cores.
max_depth, n_estimators = 100, 100
# Hyper-parameters unused by this model stay None (they are still logged);
# the forest builds its own trees, so there is no separate base estimator.
n_neighbors = max_samples = max_leafs_nodes = max_features = None
estimator = None

model = RandomForestRegressor(
    n_estimators=n_estimators,
    max_depth=max_depth,
    max_leaf_nodes=max_leafs_nodes,
    n_jobs=-1,
)

### Fine Tuning

In [155]:
# Search space for the active RandomForestRegressor.
# RandomForestRegressor exposes the tree settings as its own parameters, so
# the keys carry no "estimator__" prefix; with the prefix GridSearchCV rejects
# the grid with "Invalid parameter 'estimator'". The prefixed form is only
# valid for wrappers that take an `estimator` argument (e.g. BaggingRegressor
# around a decision tree).
grid = {
    "n_estimators": [50, 100, 200, 500],
    "max_leaf_nodes": [250, 500, 1000, None],
    "max_depth": [10, 30, 50],
}

In [None]:
# Values written to the results log for this run.
# NOTE: they describe the model being searched, not the outcome of the search;
# the combination actually selected is available as model.best_params_ after
# fitting.
n_neighbors = None
max_depth = 100
n_estimators = 100
max_samples = None
max_leafs_nodes = None
max_features = None

# If this cell is re-run, `model` is already a GridSearchCV; unwrap it first
# so a search is never nested inside another search.
base_model = model.estimator if isinstance(model, GridSearchCV) else model
# Record the searched model as the wrapped estimator so the results label
# names it instead of "NoneType".
estimator = base_model

# Exhaustive search over `grid` with 5-fold cross-validation on all cores.
model = GridSearchCV(estimator=base_model, param_grid=grid, cv=5, n_jobs=-1, verbose=2)

## Evaluation

In [None]:
# Train whichever model was configured last on the scaled training data.
model.fit(X=X_train_norm, y=y_train)

In [None]:
# Names used to label this run in the results log.
model_name = type(model).__name__
regressor_name = model_name  # kept for cells that may still read this name

# `estimator` is only assigned by some of the model cells. Treat "never
# defined" and None alike, instead of raising NameError or labelling the run
# "<model> > NoneType".
try:
    base_estimator = estimator
except NameError:
    base_estimator = None
estimator_name = type(base_estimator).__name__ if base_estimator is not None else None

print(f"Regressor: {regressor_name}, Estimator: {estimator_name}")

pred = model.predict(X_test_norm)

# scikit-learn metrics expect (y_true, y_pred).
mae = mean_absolute_error(y_test, pred)
rmse = root_mean_squared_error(y_test, pred)
# R2 from the predictions already computed; model.score() would run
# predict() on the whole test set a second time to return the same value.
r2 = r2_score(y_test, pred)

# One-row DataFrame with the configuration and metrics of the current model.
current_results = pd.DataFrame([{
    'Model': f'{model_name} > {estimator_name}' if estimator_name else model_name,
    'n_neighbors': n_neighbors,
    'max_depth': max_depth,
    'n_estimators': n_estimators,
    'max_samples': max_samples,
    'max_leafs_nodes': max_leafs_nodes,
    'max_features': max_features,
    'MAE': mae,
    'RMSE': rmse,
    'R2_score': r2,
}])

display(current_results)

file_name = 'results.csv'
# Append to the log; the header is written only when the file does not exist
# yet (append mode creates the file in that case).
log_exists = os.path.isfile(file_name)
current_results.to_csv(file_name, mode='a', header=not log_exists, index=False)

## Tuning