In [30]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

import xgboost as xgb

from helper import DateToOrdinal

In [2]:
# Training data is read from ../<data_folder>/<filename>, resolved against the
# current working directory.
data_folder = "data"
filename = "train.csv"
train_csv_path = os.path.join("..", data_folder, filename)
# No date parsing and no custom index: "date" stays an ordinary column.
df = pd.read_csv(train_csv_path)

In [3]:
# Name of the target column.
y_column = "orders"

# Feature columns that are passed through an imputer only. "date", "warehouse"
# and "holiday_name" are not listed here because the column transformer gives
# each of them its own dedicated step.
X_necessary_columns_remainder = [
    "holiday",
    "shops_closed",
    "winter_school_holidays",
    "school_holidays",
]

In [4]:
# "warehouse" is one-hot encoded into a dense array; categories that were not
# present at fit time are ignored rather than raising an error.
warehouse_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
preproc_warehouse = make_pipeline(warehouse_encoder)

In [5]:
# "holiday_name": missing values are first replaced by the constant string "0",
# then the column is one-hot encoded (dense output, unknown categories ignored).
holiday_name_imputer = SimpleImputer(fill_value="0", strategy="constant")
holiday_name_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
preproc_holiday_name = make_pipeline(holiday_name_imputer, holiday_name_encoder)

In [6]:
# Remaining flag columns: fill gaps with each column's most frequent value.
most_frequent_imputer = SimpleImputer(strategy="most_frequent")
preproc_remainder = make_pipeline(most_frequent_imputer)

In [7]:
# (transformer, columns) pairs, in the order their outputs are concatenated.
column_steps = [
    (DateToOrdinal(), ["date"]),
    (preproc_warehouse, ["warehouse"]),
    (preproc_holiday_name, ["holiday_name"]),
    (preproc_remainder, X_necessary_columns_remainder),
]

# Any column not named in a step above is dropped from the output.
preproc_pipeline = make_column_transformer(*column_steps, remainder="drop")

In [8]:
# Separate the target series from the feature columns; `df` itself is left
# unchanged because `drop` returns a new frame.
y = df[y_column]
X = df.drop(columns=[y_column])

In [9]:
# Fit every step of `preproc_pipeline` on the full feature frame and transform
# it in the same call; the resulting matrix is what the cross-validation cells
# below are scored on.
# NOTE(review): fitting happens before the CV folds are split, so the encoders
# and imputers see the validation rows of every fold. Passing
# make_pipeline(preproc_pipeline, model) together with the raw X to
# cross_val_score would avoid that — TODO confirm DateToOrdinal supports it.
X_preprocessed = preproc_pipeline.fit_transform(X)

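scikit-learn's scoring API follows a "higher is better" convention, so error metrics are exposed in negated form: the scorer is named "neg_mean_absolute_percentage_error" rather than "mean_absolute_percentage_error". Using the negated scorer keeps model evaluation and hyperparameter tuning consistent, because those tools always try to maximize the score. In the cells below, the absolute value of the mean score is multiplied by 100 to report a positive MAPE in percent.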

### Baseline Score

In [18]:
# Baseline: ordinary linear regression under 5-fold cross-validation.
baseline_cv_scores = cross_val_score(
    LinearRegression(),
    X_preprocessed,
    y,
    cv=5,
    scoring="neg_mean_absolute_percentage_error",
)
# The scorer yields negated MAPE fractions; take the magnitude of the fold
# average and scale it to a positive percentage.
abs(baseline_cv_scores.mean()) * 100

26.013162485586427

### Trying more advanced models

In [27]:
# Candidate regressors, keyed by the label shown in the comparison table.
# The two tree ensembles are stochastic, so they are given a fixed
# random_state (42, the same seed used for the XGBoost model) to make the
# scores reproducible across kernel restarts.
models = {
    "Ridge": Ridge(alpha=0.1),
    "Lasso": Lasso(alpha=0.1),
    "GradientBoostingRegressor": GradientBoostingRegressor(random_state=42),
    "RandomForestRegressor": RandomForestRegressor(random_state=42),
    "SVR": SVR(kernel="rbf"),
}

# One MAPE (in %) per model, appended in the iteration order of `models`, so
# the list lines up with models.keys() when the results table is built.
# NOTE(review): the saved output of this cell shows a warning raised from
# scikit-learn's coordinate-descent solver — presumably Lasso not converging;
# verify, and consider scaling the features or raising max_iter.
results = []
for model in models.values():
    # 5-fold CV; the scorer returns negated MAPE fractions per fold.
    fold_scores = cross_val_score(
        model,
        X_preprocessed,
        y,
        cv=5,
        scoring="neg_mean_absolute_percentage_error",
    )

    # Convert the fold average to a positive percentage.
    cv_scores = abs(fold_scores.mean()) * 100
    results.append(cv_scores)

  model = cd_fast.enet_coordinate_descent(


In [28]:
# Pair each score with its model label, then list the models from the lowest
# MAPE to the highest.
score_table = pd.DataFrame({"Score": results, "Model": models.keys()})
score_table.sort_values("Score")

Unnamed: 0,Score,Model
3,23.343229,RandomForestRegressor
2,24.244283,GradientBoostingRegressor
1,25.967939,Lasso
0,26.007326,Ridge
4,54.290172,SVR


### Trying XGBoost

In [31]:
# XGBoost regressor through its scikit-learn wrapper, trained with the
# squared-error objective. `random_state` is the wrapper's seeding parameter
# and is used here in preference to the native `seed` keyword.
xgb_regressor = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)

# Single-step pipeline around the regressor; this is the estimator that gets
# cross-validated.
xgb_pipe = make_pipeline(xgb_regressor)

In [32]:
# 5-fold CV of the XGBoost pipeline, reported as a positive MAPE percentage
# (the scorer itself returns negated fractions).
xgb_cv_scores = cross_val_score(
    xgb_pipe,
    X_preprocessed,
    y,
    cv=5,
    scoring="neg_mean_absolute_percentage_error",
)
abs(xgb_cv_scores.mean()) * 100

23.290975526375217