In [97]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
from helper import DateToOrdinal

from sklearn.pipeline import make_pipeline

from sklearn.model_selection import cross_val_score

import xgboost as xgb

from sklearn.model_selection import GridSearchCV

In [98]:
data_folder = "data"
filename = "train.csv"
df = pd.read_csv(os.path.join("..", data_folder, filename))
df['date'] = pd.to_datetime(df['date'])
test_cols_plus_y = ["orders", "warehouse", "date", "holiday_name", "holiday", "shops_closed", "winter_school_holidays", "school_holidays", "id"]
df = df[test_cols_plus_y]

In [99]:
# Drop categorical columns
df = df.drop(columns=["holiday_name", "id"])

In [100]:
# Helpful functions
def create_lagged_features(df_l, value, lags):
    for lag in range(1, lags + 1):
        df_l[f't-{lag}'] = df_l[value].shift(lag)
    df_l.dropna(inplace=True)
    return df_l

def add_cyclic_sin_cos_features(df_c, datecolumn = "date"):
    # Create sin and cos features for day of year
    df_c['dayofyear_sin'] = np.sin(2 * np.pi * df_c[datecolumn].dt.dayofyear/365.25)
    df_c['dayofyear_cos'] = np.cos(2 * np.pi * df_c[datecolumn].dt.dayofyear/365.25)

    # Create sin and cos features for day of week
    df_c['dayofweek_sin'] = np.sin(2 * np.pi * df_c[datecolumn].dt.dayofweek/7)
    df_c['dayofweek_cos'] = np.cos(2 * np.pi * df_c[datecolumn].dt.dayofweek/7)

    return df_c

In [101]:
def cross_val_xgb(data):

    result_list = []

    try:
        data = data.set_index("date")
    except:
        pass

    xgb_regressor = xgb.XGBRegressor(objective='reg:squarederror', seed=42)
    xgb_pipe = make_pipeline(xgb_regressor)

    warehouses = data["warehouse"].unique().tolist()

    for warehouse in warehouses:
        warehouse_data = data[data["warehouse"] == warehouse]

        warehouse_data_X = warehouse_data.drop(columns=["orders", "warehouse"])
        warehouse_data_y = warehouse_data["orders"]

        final_score = abs(cross_val_score(xgb_pipe, warehouse_data_X, warehouse_data_y, cv=5, scoring="neg_mean_absolute_percentage_error").mean())

        result_list.append(final_score)
        #print(warehouse, final_score)

    print(np.mean(result_list))

In [102]:
# just the raw data
cross_val_xgb(df.copy())

0.1625589391568169


In [103]:
subset_df = df[["orders", "date", "warehouse"]]
subset_df["daterange"] = list(range(len(df)))
cross_val_xgb(subset_df.copy())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_df["daterange"] = list(range(len(df)))


0.14908633671881769


In [104]:
df_lagged = create_lagged_features(df.copy(), "orders", lags=14)

In [105]:
# Added only lagged features
cross_val_xgb(df_lagged.copy())

0.09187430341703094


In [106]:
df_lagged["daterange"] = list(range(len(df_lagged)))
# Added lagged + timesteps
cross_val_xgb(df_lagged.copy())

0.09359299352999409


In [107]:
# Added only cyclic features
df_sin_cos = add_cyclic_sin_cos_features(df.copy(), datecolumn = "date")
cross_val_xgb(df_sin_cos.copy())

0.16391978845647534


In [108]:
# all together
final_df = add_cyclic_sin_cos_features(df.copy(), datecolumn = "date")
final_df = create_lagged_features(final_df.copy(), "orders", lags=14)
final_df["daterange"] = list(range(len(final_df)))
cross_val_xgb(final_df.copy())

0.08275420684864583


In [109]:
final_df.shape

(7326, 26)

## Feature Selection

In [110]:
# Assume 'df' is your DataFrame
df_without_cat = df.drop(columns=["warehouse", "date"])
# Calculate variance for each feature
variances = df_without_cat.var()

# Sort features by variance
sorted_variances = variances.sort_values()

# Print features with lowest variance
print(sorted_variances.head(10))

# Optionally, set a threshold and remove low-variance features
threshold = 0.01  # adjust as needed
low_variance_features = sorted_variances[sorted_variances < threshold].index
df_filtered = df.drop(columns=low_variance_features)

school_holidays           7.035237e-03
shops_closed              1.078186e-02
holiday                   2.650912e-02
winter_school_holidays    2.907835e-02
orders                    4.764093e+06
dtype: float64


In [111]:
# all together with filtered df
final_df = add_cyclic_sin_cos_features(df_filtered.copy(), datecolumn = "date")
final_df = create_lagged_features(final_df.copy(), "orders", lags=14)
final_df["daterange"] = list(range(len(final_df)))
cross_val_xgb(final_df.copy())

0.08413312334555127


In [112]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

def randomized_hyper_tuning(model, X, y):
    # Define the parameter distribution
    param_dist = {
        'xgbregressor__learning_rate': uniform(0.009, 0.1),
        'xgbregressor__n_estimators': randint(100, 600),
        'xgbregressor__max_depth': randint(3, 8),
        'xgbregressor__min_child_weight': randint(2, 6),
        'xgbregressor__gamma': uniform(0, 0.2),
    }

    # Set up the RandomizedSearchCV
    random_search = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=100,  # Number of parameter settings that are sampled
        cv=5,
        scoring='neg_mean_absolute_percentage_error',
        verbose=1,
        n_jobs=-1
    )
    random_search.fit(X, y)
    print(random_search.best_score_)
    print(random_search.best_params_)

    return random_search.best_estimator_

In [113]:
def cross_val_xgb_with_rand(data):

    result_list = []

    try:
        data = data.set_index("date")
    except:
        pass

    xgb_regressor = xgb.XGBRegressor(objective='reg:squarederror', seed=42)
    xgb_pipe = make_pipeline(xgb_regressor)

    warehouses = data["warehouse"].unique().tolist()

    for warehouse in warehouses:
        warehouse_data = data[data["warehouse"] == warehouse]

        warehouse_data_X = warehouse_data.drop(columns=["orders", "warehouse"])
        warehouse_data_y = warehouse_data["orders"]

        the_model = randomized_hyper_tuning(xgb_pipe, warehouse_data_X, warehouse_data_y)

        final_score = abs(cross_val_score(the_model, warehouse_data_X, warehouse_data_y, cv=5, scoring="neg_mean_absolute_percentage_error").mean())

        result_list.append(final_score)
        #print(warehouse, final_score)

    print(np.mean(result_list))

In [114]:
def cross_val_xgb_with_prophet(data):

    result_list = []

    try:
        data = data.set_index("date")
    except:
        pass

    xgb_regressor = xgb.XGBRegressor(objective='reg:squarederror', seed=42)
    xgb_pipe = make_pipeline(xgb_regressor)

    warehouses = data["warehouse"].unique().tolist()

    for warehouse in warehouses:
        warehouse_data = data[data["warehouse"] == warehouse]

        warehouse_data_X = warehouse_data.drop(columns=["orders", "warehouse"])
        warehouse_data_y = warehouse_data["orders"]

        the_model = randomized_hyper_tuning(xgb_pipe, warehouse_data_X, warehouse_data_y)

        final_score = abs(cross_val_score(the_model, warehouse_data_X, warehouse_data_y, cv=5, scoring="neg_mean_absolute_percentage_error").mean())

        result_list.append(final_score)
        #print(warehouse, final_score)

    print(np.mean(result_list))