In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

import xgboost as xgb

from helper import DateToOrdinal

In [2]:
data_folder = "data"
filename = "train.csv"
df = pd.read_csv(os.path.join("..", data_folder, filename))#, parse_dates=True, index_col="date")

In [3]:
y_column = "orders"
# X_necessary_columns = ["date", "warehouse", "holiday_name", "holiday", "shops_closed", "winter_school_holidays", "school_holidays"]
X_necessary_columns_remainder = ["holiday", "shops_closed", "winter_school_holidays", "school_holidays"]

In [4]:
preproc_warehouse = make_pipeline(
    OneHotEncoder(handle_unknown="ignore", sparse_output=False))

In [5]:
preproc_holiday_name = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="0"),
    OneHotEncoder(handle_unknown="ignore", sparse_output=False))

In [6]:
preproc_remainder = make_pipeline(
    SimpleImputer(strategy="most_frequent")
    )

In [7]:
preproc_pipeline = make_column_transformer(
    (DateToOrdinal(), ["date"]),
    (preproc_warehouse, ["warehouse"]),
    (preproc_holiday_name, ["holiday_name"]),
    (preproc_remainder, X_necessary_columns_remainder),
    remainder="drop"
)

In [8]:
X = df.drop(columns=y_column)
y = df[y_column]

In [9]:
X_preprocessed = preproc_pipeline.fit_transform(X)

Using "neg_mean_absolute_percentage_error" instead of "mean_absolute_percentage_error" aligns with the scoring conventions of machine learning libraries like scikit-learn, which optimize for higher scores, ensuring consistency and proper functionality in model evaluation and hyperparameter tuning processes.

### Baseline Score

In [10]:
# I convert it in a positive MAPE % score.
abs(cross_val_score(LinearRegression(), X_preprocessed, y, cv=5, scoring="neg_mean_absolute_percentage_error").mean()) * 100

26.013162485586427

### Trying more advanced models

In [11]:
models = {"Ridge": Ridge(alpha=0.1), "Lasso": Lasso(alpha=0.1), "GradientBoostingRegressor": GradientBoostingRegressor(), "RandomForestRegressor": RandomForestRegressor(), "SVR": SVR(kernel="rbf")}
results = []

# Loop through the models' values
for model in models.values():

  # Perform cross-validation
  cv_scores = abs(cross_val_score(model, X_preprocessed, y, cv=5, scoring="neg_mean_absolute_percentage_error").mean()) * 100

  # Append the results
  results.append(cv_scores)

  model = cd_fast.enet_coordinate_descent(


In [12]:
pd.DataFrame({"Score": results,
              "Model":models.keys()}).sort_values(by="Score", ascending=True)

Unnamed: 0,Score,Model
3,23.218827,RandomForestRegressor
2,24.244486,GradientBoostingRegressor
1,25.967939,Lasso
0,26.007326,Ridge
4,54.290172,SVR


### Trying XGBoost

In [13]:
xgb_regressor = xgb.XGBRegressor(objective='reg:squarederror', seed=42)
xgb_pipe = make_pipeline(xgb_regressor)

In [14]:
abs(cross_val_score(xgb_pipe, X_preprocessed, y, cv=5, scoring="neg_mean_absolute_percentage_error").mean()) * 100

23.290975526375217

### Seperate all warehouses

In [15]:
warehouses = df["warehouse"].unique().tolist()
for warehouse in warehouses:
    warehouse_data = df[df["warehouse"] == warehouse]

    warehouse_data_X = warehouse_data.drop(columns=y_column)
    warehouse_data_y = warehouse_data[y_column]

    warehouse_data_X_preprocessed = preproc_pipeline.fit_transform(warehouse_data_X)

    final_score = abs(cross_val_score(xgb_pipe, warehouse_data_X_preprocessed, warehouse_data_y, cv=5, scoring="neg_mean_absolute_percentage_error").mean()) * 100

    print(warehouse, final_score)
    print()


Prague_1 10.904648180825454

Brno_1 9.823585028086912

Prague_2 10.009347523034073

Prague_3 10.12853783119255

Munich_1 42.11815666898875

Frankfurt_1 12.084135614105671

Budapest_1 8.097135245234547



### Trying to drop all columns but Orders and date + warehouse

In [20]:
hard_drop_df = df[["orders", "date", "warehouse"]]
date_encoder = DateToOrdinal()
hard_drop_df["date"] = date_encoder.fit_transform(hard_drop_df["date"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hard_drop_df["date"] = date_encoder.fit_transform(hard_drop_df["date"])


Unnamed: 0,orders,date,warehouse
0,6895.0,0,Prague_1
1,6584.0,1,Prague_1
2,7030.0,2,Prague_1
3,6550.0,3,Prague_1
4,6910.0,4,Prague_1
...,...,...,...
7335,6733.0,1187,Budapest_1
7336,6492.0,1188,Budapest_1
7337,6661.0,1189,Budapest_1
7338,6843.0,1190,Budapest_1


In [21]:
for warehouse in warehouses:
    warehouse_data = hard_drop_df[hard_drop_df["warehouse"] == warehouse]

    warehouse_data_X = warehouse_data.drop(columns=[y_column, "warehouse"])
    warehouse_data_y = warehouse_data[y_column]

    final_score = abs(cross_val_score(xgb_pipe, warehouse_data_X, warehouse_data_y, cv=5, scoring="neg_mean_absolute_percentage_error").mean()) * 100

    print(warehouse, final_score)
    print()

Prague_1 11.169047729247325

Brno_1 10.001992053566486

Prague_2 10.268778518754171

Prague_3 10.411751055475424

Munich_1 42.16773900534548

Frankfurt_1 12.12800640538928

Budapest_1 8.239607002521963



### The above scores look like the features about holiday etc. are just not relevant.