In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
from helper import DateToOrdinal

from sklearn.pipeline import make_pipeline

from sklearn.model_selection import cross_val_score

import xgboost as xgb

from sklearn.model_selection import GridSearchCV

In [2]:
data_folder = "data"
filename = "train.csv"
df = pd.read_csv(os.path.join("..", data_folder, filename))#, parse_dates=True, index_col="date")

def create_lagged_features(df, value, lags):
    for lag in range(1, lags + 1):
        df[f't-{lag}'] = df[value].shift(lag)
    df.dropna(inplace=True)
    return df

df = df[["orders", "warehouse"]]

df_lagged = create_lagged_features(df, "orders", lags=14)


In [3]:
xgb_regressor = xgb.XGBRegressor(objective='reg:squarederror', seed=42)
xgb_pipe = make_pipeline(xgb_regressor)

params = {
    'xgbregressor__gamma': 0,
    'xgbregressor__learning_rate': 0.01,
    'xgbregressor__max_depth': 5,
    'xgbregressor__min_child_weight': 5,
    'xgbregressor__n_estimators': 500
}

xgb_pipe.set_params(**params)

In [4]:
warehouse_Prague_1 = df_lagged[df_lagged["warehouse"] == "Prague_1"]

Prague_1_data_X = df_lagged.drop(columns=["orders", "warehouse"])
Prague_1_data_y = df_lagged["orders"]

In [5]:
# Define the parameter grid
param_grid = {
    'xgbregressor__learning_rate': [0.01, 0.05, 0.1],
    'xgbregressor__n_estimators': [100, 200, 500],
    'xgbregressor__max_depth': [3, 5, 7],
    'xgbregressor__min_child_weight': [1, 3, 5],
    'xgbregressor__gamma': [0, 0.1, 0.2],

}

# Set up the GridSearchCV
grid_search = GridSearchCV(xgb_pipe, param_grid, cv=5, scoring='neg_mean_absolute_percentage_error', verbose=1, n_jobs=-1)

# Assuming X_train and y_train are your training data
grid_search.fit(Prague_1_data_X, Prague_1_data_y)

# Output the best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best score: ", abs(grid_search.best_score_))


Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best parameters found:  {'xgbregressor__colsample_bytree': 1.0, 'xgbregressor__reg_alpha': 0.01, 'xgbregressor__reg_lambda': 1.5, 'xgbregressor__subsample': 0.8}
Best score:  0.07130893852399386


In [7]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

# Define the parameter distribution
param_dist = {
    'xgbregressor__learning_rate': uniform(0.01, 0.1),  # Uniform distribution between 0.01 and 0.11
    'xgbregressor__n_estimators': randint(100, 500),  # Random integers between 100 and 500
    'xgbregressor__max_depth': randint(3, 8),  # Random integers between 3 and 8
    'xgbregressor__min_child_weight': randint(1, 6),  # Random integers between 1 and 6
    'xgbregressor__gamma': uniform(0, 0.2),  # Uniform distribution between 0 and 0.2
}

# Set up the RandomizedSearchCV
random_search = RandomizedSearchCV(
    xgb_pipe,
    param_distributions=param_dist,
    n_iter=100,  # Number of parameter settings that are sampled
    cv=5,
    scoring='neg_mean_absolute_percentage_error',
    verbose=1,
    n_jobs=-1
    #random_state=42  # For reproducibility
)

# Assuming Prague_1_data_X and Prague_1_data_y are your training data
random_search.fit(Prague_1_data_X, Prague_1_data_y)

# Output the best parameters and best score
print("Best parameters found: ", random_search.best_params_)
print("Best score: ", abs(random_search.best_score_))


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters found:  {'xgbregressor__gamma': 0.03583253687022563, 'xgbregressor__learning_rate': 0.015697148708578003, 'xgbregressor__max_depth': 7, 'xgbregressor__min_child_weight': 2, 'xgbregressor__n_estimators': 334}
Best score:  0.06911402998425314
