# IRON KAGGLE

- **shop_ID** (column `Store_ID`): Unique identifier for each shop.
- **day_of_the_week** (column `Day_of_week`): Encoded from 1 to 7, representing the day of the week.
- **date** (column `Date`): Day, month, and year of the data point.
- **number_of_customers** (column `Nb_customers_on_day`): Number of customers who visited the shop on that day.
- **open** (column `Open`): Binary variable; 0 means the shop was closed, 1 means it was open.
- **promotion** (column `Promotion`): Binary variable; 0 means no promotion, 1 means a promotion was running.
- **state_holiday** (column `State_holiday`): Encoded as 0, 'a', 'b', or 'c'; 0 means no state holiday, while 'a', 'b', and 'c' denote different state holidays.
- **school_holiday** (column `School_holiday`): Binary variable; 0 means no school holiday, 1 means there was a school holiday.

## Library Loading

In [242]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, root_mean_squared_error

import datetime

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression


In [243]:
def compute_store_stats(df):
    """Build one row of summary statistics per store.

    ``df`` is grouped by ``Store_ID``. The result holds the mean, standard
    deviation and median of ``Sales`` and ``Nb_customers_on_day``, the mean
    of ``Promotion`` and ``School_holiday`` (i.e. their frequency), and a
    ``Store_sales_quartile`` label (``Q1``..``Q4``) obtained by splitting
    the per-store mean sales into four quantile bins.

    Returns a new DataFrame with ``Store_ID`` as a regular column and every
    statistic named ``Store_<source column>_<aggregation>``.
    """
    # Output column name -> (source column, aggregation), in output order.
    plan = {}
    for source in ('Sales', 'Nb_customers_on_day'):
        for stat in ('mean', 'std', 'median'):
            plan[f'Store_{source}_{stat}'] = (source, stat)
    for source in ('Promotion', 'School_holiday'):
        plan[f'Store_{source}_mean'] = (source, 'mean')

    per_store = df.groupby('Store_ID').agg(**plan).reset_index()

    # Infinite aggregates are treated as missing.
    per_store = per_store.replace([np.inf, -np.inf], np.nan)

    # Rank stores into quartiles of their average sales.
    quartile_labels = ['Q1', 'Q2', 'Q3', 'Q4']
    per_store['Store_sales_quartile'] = pd.qcut(
        per_store['Store_Sales_mean'], q=4, labels=quartile_labels
    )
    return per_store

def transform_data(df, store_stats=None):
    """Return a feature-engineered copy of ``df``; the input is not modified.

    Steps, in order:

    1. Parse ``Date`` and derive ``Year``, ``Month`` and ``Day``.
    2. ``Is_weekend``: 1 when ``Day_of_week`` >= 6, else 0 (NaN column if
       ``Day_of_week`` is absent).
    3. ``Sales_per_customer``: ``Sales / Nb_customers_on_day`` with infinite
       results (division by zero) replaced by NaN (NaN column if either
       input column is absent).
    4. ``Promo_School_Holiday``: ``Promotion * School_holiday`` (NaN column
       if either input column is absent).
    5. Left-merge ``store_stats`` on ``Store_ID`` when it is provided.
    6. One-hot encode ``State_holiday`` and ``Store_sales_quartile`` when
       present, dropping the first level of each.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Date``; the other columns are optional as described.
    store_stats : pandas.DataFrame, optional
        Per-store statistics keyed by ``Store_ID``.

    Returns
    -------
    pandas.DataFrame
    """
    df = df.copy()

    # NOTE: rows with Open == 0 are deliberately kept; no filtering on
    # 'Open' happens here.

    df['Date'] = pd.to_datetime(df['Date'])
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day

    if 'Day_of_week' in df.columns:
        # Vectorized comparison; missing values compare False and map to 0.
        df['Is_weekend'] = (df['Day_of_week'] >= 6).astype(int)
    else:
        df['Is_weekend'] = np.nan

    if 'Sales' in df.columns and 'Nb_customers_on_day' in df.columns:
        ratio = df['Sales'] / df['Nb_customers_on_day']
        # Assign the cleaned Series back instead of calling
        # replace(inplace=True) on df[col]: that is chained assignment and
        # silently does nothing under pandas Copy-on-Write.
        df['Sales_per_customer'] = ratio.replace([np.inf, -np.inf], np.nan)
    else:
        df['Sales_per_customer'] = np.nan

    if 'Promotion' in df.columns and 'School_holiday' in df.columns:
        df['Promo_School_Holiday'] = df['Promotion'] * df['School_holiday']
    else:
        df['Promo_School_Holiday'] = np.nan

    if store_stats is not None:
        df = df.merge(store_stats, on='Store_ID', how='left')

    columns_to_encode = [
        col for col in ('State_holiday', 'Store_sales_quartile')
        if col in df.columns
    ]
    if columns_to_encode:
        df = pd.get_dummies(df, columns=columns_to_encode, drop_first=True)

    return df

## Data Loading

In [244]:
# Labelled training data; `sales_df` is the working copy used downstream.
data = pd.read_csv('data/sales.csv')
sales_df = data.copy()

# Hold-out set: the feature file and the solutions file are joined on
# 'True_index' so each test row carries its true 'Sales'.
solutions = pd.read_csv('data/ironkaggle_solutions.csv')
test_data_no_target = pd.read_csv('data/ironkaggle_notarget.csv')
test_data = pd.merge(test_data_no_target, solutions, on='True_index')

In [None]:
# Per-store aggregates of the training data, then the engineered training
# frame with those aggregates merged in.
store_stats = compute_store_stats(sales_df)
transformed_sales_data = transform_data(sales_df, store_stats)

In [None]:
# Transform the test data with the store statistics learned from the
# TRAINING data. Recomputing them from `test_data` would aggregate the test
# set's own 'Sales' (the merged-in solutions) and leak the target into the
# Store_* features and the sales-quartile dummies.
# Stores absent from the training data get NaN statistics (left merge).
store_stats_test = store_stats  # name kept for any cell that still references it

transformed_test_data = transform_data(test_data, store_stats=store_stats)

## Data Split

### Define feature & target

In [247]:
# Continue on copies so the transformed frames themselves stay untouched.
sales_df, test_data = transformed_sales_data.copy(), transformed_test_data.copy()

In [None]:
# Visual sanity check of the engineered training and test frames.
display(sales_df)
display(test_data)

In [249]:
# Columns excluded from the model inputs. 'Sales_per_customer' is
# Sales / Nb_customers_on_day, i.e. computed from the target itself, so
# keeping it as a feature would leak 'Sales' into the model; it is also
# unavailable for data that has no 'Sales' column.
non_feature_columns = [
    'Sales', 'Date', 'True_index', 'Store_ID', 'Open', 'Sales_per_customer',
]

features = sales_df.drop(columns=non_feature_columns)
target = sales_df['Sales']

# Keep the true test sales aside before removing them from the test inputs.
test_target = test_data[['Sales']]
test_data = test_data.drop(columns=non_feature_columns)

### Split the training data

In [250]:
# TODO: try different hold-out sizes
# 80/20 split with a fixed seed so runs are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
)

## Scaling

In [251]:
# TODO: try a different scaler
# Fit the scaler on the training split only, then apply it to both splits.
normalizer = MinMaxScaler().fit(X_train)

X_train_norm, X_test_norm = (
    normalizer.transform(split) for split in (X_train, X_test)
)

In [252]:

# Scale the test features with the scaler fitted on the TRAINING split so
# they are expressed in the space the model is trained in. No scaler is
# fitted on the test set: nothing would use it.
#
# Align the columns first: one-hot encoding can produce a different set or
# order of dummy columns for the test frame. Dummy columns missing from the
# test frame are filled with 0 (category not present); columns unknown to
# the scaler are dropped.
test_aligned = test_data.reindex(
    columns=normalizer.feature_names_in_, fill_value=0
)

test_norm = normalizer.transform(test_aligned)

## Training

### Simple models

In [253]:
# K-nearest-neighbours regressor, k=3. Hyperparameters that do not apply
# are set to None because the results row logs all of them.
n_neighbors = 3
max_depth = None
n_estimators = None
max_samples = None
max_leafs_nodes = None
max_features = None
# Plain model: clear any base estimator left behind by an ensemble cell so
# the results label does not report a stale one.
estimator = None

model = KNeighborsRegressor(n_neighbors=n_neighbors, n_jobs=-1)

In [254]:
# K-nearest-neighbours regressor, k=10. Hyperparameters that do not apply
# are set to None because the results row logs all of them.
n_neighbors = 10
max_depth = None
n_estimators = None
max_samples = None
max_leafs_nodes = None
max_features = None
# Plain model: clear any base estimator left behind by an ensemble cell so
# the results label does not report a stale one.
estimator = None

model = KNeighborsRegressor(n_neighbors=n_neighbors, n_jobs=-1)

In [255]:
# Ordinary linear regression. No tunable hyperparameters are logged.
n_neighbors = None
max_depth = None
n_estimators = None
max_samples = None
max_leafs_nodes = None
max_features = None
# Plain model: clear any base estimator left behind by an ensemble cell so
# the results label does not report a stale one.
estimator = None

model = LinearRegression()

In [256]:
# Single decision tree limited to depth 10.
n_neighbors = None
max_depth = 10
n_estimators = None
max_samples = None
max_leafs_nodes = None
max_features = None
# Plain model: clear any base estimator left behind by an ensemble cell so
# the results label does not report a stale one.
estimator = None

model = DecisionTreeRegressor(max_depth=max_depth)

In [257]:
# Single decision tree with a depth cap of 1000.
n_neighbors = None
max_depth = 1000
n_estimators = None
max_samples = None
max_leafs_nodes = None
max_features = None
# Plain model: clear any base estimator left behind by an ensemble cell so
# the results label does not report a stale one.
estimator = None

model = DecisionTreeRegressor(max_depth=max_depth)

In [258]:
# Single decision tree with a depth cap of 1000 (same configuration as the
# previous cell).
n_neighbors = None
max_depth = 1000
n_estimators = None
max_samples = None
max_leafs_nodes = None
max_features = None
# Plain model: clear any base estimator left behind by an ensemble cell so
# the results label does not report a stale one.
estimator = None

model = DecisionTreeRegressor(max_depth=max_depth)

### Ensemble

In [259]:
# Bagging of 100 linear regressions with max_samples=1000.
n_estimators, max_samples = 100, 1000
n_neighbors = max_depth = max_leafs_nodes = max_features = None

estimator = LinearRegression()
model = BaggingRegressor(
    estimator=estimator,
    n_estimators=n_estimators,
    max_samples=max_samples,
)

In [260]:
# Bagging of 100 KNN regressors (k=10) with max_samples=1000.
n_neighbors, n_estimators, max_samples = 10, 100, 1000
max_depth = max_leafs_nodes = max_features = None

estimator = KNeighborsRegressor(n_neighbors=n_neighbors, n_jobs=-1)
model = BaggingRegressor(
    estimator=estimator,
    n_estimators=n_estimators,
    max_samples=max_samples,
)

In [261]:
# Bagging of 100 decision trees (depth cap 1000) with max_samples=1000.
max_depth, n_estimators, max_samples = 1000, 100, 1000
n_neighbors = max_leafs_nodes = max_features = None

estimator = DecisionTreeRegressor(max_depth=max_depth)
model = BaggingRegressor(
    estimator=estimator,
    n_estimators=n_estimators,
    max_samples=max_samples,
)

In [262]:
# Same tree ensemble as the bagging cell, but sampling without replacement
# (bootstrap=False).
max_depth, n_estimators, max_samples = 1000, 100, 1000
n_neighbors = max_leafs_nodes = max_features = None

estimator = DecisionTreeRegressor(max_depth=max_depth)
model = BaggingRegressor(
    estimator=estimator,
    n_estimators=n_estimators,
    max_samples=max_samples,
    bootstrap=False,
)

In [263]:
# Random forest: 100 trees, depth cap 1000, all cores.
max_depth, n_estimators = 1000, 100
n_neighbors = max_samples = max_leafs_nodes = max_features = None
estimator = None  # no separate base estimator to report

model = RandomForestRegressor(
    n_estimators=n_estimators,
    max_depth=max_depth,
    n_jobs=-1,
)

In [264]:
# AdaBoost over decision trees (depth cap 100), 100 boosting rounds.
n_neighbors = None
max_depth = 100
n_estimators = 100
max_samples = None
max_leafs_nodes = None
max_features = None
estimator = DecisionTreeRegressor(max_depth=max_depth)

# Pass the base estimator explicitly. Without `estimator=`, AdaBoostRegressor
# uses its built-in default base learner and the `estimator` defined above is
# ignored, even though the results row names it.
model = AdaBoostRegressor(estimator=estimator, n_estimators=n_estimators)

In [265]:
# AdaBoost over KNN regressors (k=10), 100 boosting rounds.
n_neighbors = 10
max_depth = None
n_estimators = 100
max_samples = None
max_leafs_nodes = None
max_features = None
estimator = KNeighborsRegressor(n_neighbors=n_neighbors, n_jobs=-1)

# Pass the base estimator explicitly. Without `estimator=`, AdaBoostRegressor
# uses its built-in default base learner and the `estimator` defined above is
# ignored, even though the results row names it.
model = AdaBoostRegressor(estimator=estimator, n_estimators=n_estimators)

In [266]:
# AdaBoost over linear regressions, 100 boosting rounds.
n_neighbors = None
max_depth = None
n_estimators = 100
max_samples = None
max_leafs_nodes = None
max_features = None
estimator = LinearRegression()

# Pass the base estimator explicitly. Without `estimator=`, AdaBoostRegressor
# uses its built-in default base learner and the `estimator` defined above is
# ignored, even though the results row names it.
model = AdaBoostRegressor(estimator=estimator, n_estimators=n_estimators)

In [267]:
# Gradient boosting: 100 stages, tree depth cap 100, progress printed.
max_depth, n_estimators = 100, 100
n_neighbors = max_samples = max_leafs_nodes = max_features = None
estimator = None  # no separate base estimator to report

model = GradientBoostingRegressor(
    max_depth=max_depth,
    n_estimators=n_estimators,
    verbose=1,
)

In [268]:
# Gradient boosting: 100 stages, tree depth cap 20, progress printed.
max_depth, n_estimators = 20, 100
n_neighbors = max_samples = max_leafs_nodes = max_features = None
estimator = None  # no separate base estimator to report

model = GradientBoostingRegressor(
    max_depth=max_depth,
    n_estimators=n_estimators,
    verbose=1,
)

### Active

In [269]:
# Currently selected model: random forest, 50 trees, depth cap 400.
max_depth, n_estimators = 400, 50
n_neighbors = max_samples = max_leafs_nodes = max_features = None
estimator = None  # no separate base estimator to report

model = RandomForestRegressor(
    n_estimators=n_estimators,
    max_depth=max_depth,
    n_jobs=-1,
)

### Fine Tuning

In [270]:
# Candidate values explored by the (currently disabled) grid search.
param_grid = dict(
    n_estimators=[50, 200],
    max_depth=[10, 400],
)

In [271]:
#model = GridSearchCV(estimator = RandomForestRegressor(), param_grid = param_grid, cv=5, n_jobs=-1, verbose=4) 


## Evaluation

In [None]:
# Train whichever `model` was defined last on the scaled training split.
model.fit(X_train_norm, y_train)

In [273]:
#best_model = model.best_estimator_

In [None]:
# Disabled snippet, kept inside a bare string literal so it never executes.
# It would write `model.cv_results_` to grid_search_results.csv, appending
# without a header when the file already exists.
""" # Convert the cv_results_ to a DataFrame
results_df = pd.DataFrame(model.cv_results_)

# Define the file name
file_name = 'grid_search_results.csv'

# Check if the file already exists
if os.path.isfile(file_name):
    # If the file exists, append the new results without writing the header
    results_df.to_csv(file_name, mode='a', header=False, index=False)
else:
    # If the file does not exist, create it and write the header
    results_df.to_csv(file_name, mode='w', header=True, index=False) """

In [275]:
# Get the regressor name and the estimator name programmatically
estimator_name = type(estimator).__name__
model_name = type(model).__name__

pred = model.predict(test_norm)

test_data_no_target['Predicted_Sales'] = pred
result = test_data_no_target[['True_index', 'Predicted_Sales']]

mae = mean_absolute_error(pred, test_target)
rmse = root_mean_squared_error(pred, test_target)
r2 = model.score(test_norm, test_target)


In [None]:
# Create a DataFrame with the current model results
current_results = pd.DataFrame([{
    'Model': f'{model_name} > {estimator_name}' if 'estimator_name' in locals() else model_name,
    'n_neighbors': n_neighbors,
    'max_depth': max_depth,
    'n_estimators': n_estimators,
    'max_samples': max_samples,
    'max_leafs_nodes': max_leafs_nodes,
    'max_features' : max_features,
    'MAE': mae,
    'RMSE': rmse,
    'R2_score': r2,
}])

display(current_results)


## Export

In [277]:

file_name = 'results.csv'
# Append this run to the log. The header row is written only when the file
# does not exist yet; append mode creates the file in that case.
current_results.to_csv(
    file_name,
    mode='a',
    header=not os.path.isfile(file_name),
    index=False,
)

In [278]:
# Overwrite the prediction file on every run. Appending with header=True
# would add a second header line plus a duplicate set of rows each time the
# cell is executed.
result.to_csv('ironkaggle_final_batman_robin.csv', mode='w', header=True, index=False)