In [None]:
import pandas as pd
import altair as alt
import matplotlib.pyplot as plt 
import numpy as np



import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_absolute_percentage_error
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

import xgboost as xgb
from xgboost import XGBRegressor
from ydata_profiling import ProfileReport


from scalecast.Forecaster import Forecaster
alt.data_transformers.disable_max_rows()


DataTransformerRegistry.enable('default')

In [2]:
import optuna

## Import Data

In [3]:
train = pd.read_csv('train.csv')

In [6]:
data_profile = ProfileReport(train)

In [4]:
def feature_eng(df):
    df = df.copy()
    df['date'] = pd.to_datetime(df['date'])

    # Extract date features
    df['day_of_year'] = df['date'].dt.dayofyear
    df['day_week'] = df['date'].dt.dayofweek
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year

    # Encode categorical features
    le = LabelEncoder()
    df['country'] = le.fit_transform(df['country'])
    df['store'] = le.fit_transform(df['store'])
    df['product'] = le.fit_transform(df['product'])

    # Drop missing values and irrelevant columns
    df = df.dropna()
    df = df.set_index('date')
    df = df.drop(columns=['id'])  # Dropping columns that are not useful for modeling
    # Separate features and target
    X = df.drop(columns=['num_sold'])
    y = df['num_sold']

    # Scale features
    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

    return df, X,  y

# Feature engineering
df, X, y = feature_eng(train)

# Splitting the data
SPLIT = 0.85
split_index = int(SPLIT * len(X))

X_train = X[:split_index]
y_train = y[:split_index]
X_test = X[split_index:]
y_test = y[split_index:]

# Verify shapes
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)


X_train shape: (188070, 7)
y_train shape: (188070,)
X_test shape: (33189, 7)
y_test shape: (33189,)


In [11]:
param_grid = {"max_depth":    [4, 5, 6, 12, 24],
              "n_estimators": [100, 500, 600, 700],
              "learning_rate": [0.1, 0.01, 0.015], 
              "tree_method":["hist"], 
              "device":["cuda"]
              }
regressor=xgb.XGBRegressor(eval_metric='rmsle')
# try out every combination of the above values
search = GridSearchCV(regressor, 
                      param_grid, 
                      cv=5
                      ).fit(X_train, 
                            y_train)

regressor=xgb.XGBRegressor(learning_rate = search.best_params_["learning_rate"],
                           n_estimators  = search.best_params_["n_estimators"],
                           max_depth     = search.best_params_["max_depth"],
                           eval_metric='rmsle')

regressor.fit(X_train, y_train)


print("The best hyperparameters are ",search.best_params_)

KeyboardInterrupt: 

In [36]:
predictions = regressor.predict(X_test)

In [37]:
RMSLE = np.sqrt( mean_absolute_percentage_error(y_test, predictions) )
print("The score is %.5f" % RMSLE )

The score is 0.31850


In [19]:
def objective(trial):
    print(f"🔄 Starting Trial {trial.number + 1} out of {20}")  # Replace 20 with your actual n_trials

    # Define hyperparameters
    param = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 200),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "max_depth": trial.suggest_int("max_depth", 3, 18),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.1, 10.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.1, 10.0),
        "device": "cuda",  # Ensure GPU usage
        "tree_method": "hist"  # Recommended for GPU acceleration in XGBoost 2.0+
    }

    # Train the model
    model = xgb.XGBRegressor(**param)
    model.fit(X_train, y_train)

    # Predict & compute MAPE
    preds = model.predict(X_test)
    mape = mean_absolute_percentage_error(y_test, preds)

    print(f"✅ Completed Trial {trial.number + 1} - MAPE: {mape:.4f}")

    return mape

# Run Optuna study
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)  # ✅ Use explicit n_trials here


[I 2025-01-29 17:21:22,847] A new study created in memory with name: no-name-5ef57cc6-b563-4dc9-9cfb-45573669b877


🔄 Starting Trial 1 out of 20


[I 2025-01-29 17:21:27,068] Trial 0 finished with value: 0.4082032866181778 and parameters: {'n_estimators': 105, 'learning_rate': 0.034467629164132346, 'max_depth': 17, 'subsample': 0.8596836402133979, 'colsample_bytree': 0.7726746075850204, 'reg_alpha': 9.937625254355318, 'reg_lambda': 9.798143877732276}. Best is trial 0 with value: 0.4082032866181778.


✅ Completed Trial 1 - MAPE: 0.4082
🔄 Starting Trial 2 out of 20


[I 2025-01-29 17:21:29,464] Trial 1 finished with value: 0.0841290418744912 and parameters: {'n_estimators': 116, 'learning_rate': 0.0631294233952089, 'max_depth': 11, 'subsample': 0.9334339680144552, 'colsample_bytree': 0.8826002336801324, 'reg_alpha': 2.266637737382796, 'reg_lambda': 8.279001593909049}. Best is trial 1 with value: 0.0841290418744912.


✅ Completed Trial 2 - MAPE: 0.0841
🔄 Starting Trial 3 out of 20


[I 2025-01-29 17:21:31,519] Trial 2 finished with value: 0.6871599560594778 and parameters: {'n_estimators': 68, 'learning_rate': 0.10516951456080932, 'max_depth': 15, 'subsample': 0.9678571832706913, 'colsample_bytree': 0.6563147862836144, 'reg_alpha': 3.6252767752982904, 'reg_lambda': 6.3639799735533105}. Best is trial 1 with value: 0.0841290418744912.


✅ Completed Trial 3 - MAPE: 0.6872
🔄 Starting Trial 4 out of 20


[I 2025-01-29 17:21:34,928] Trial 3 finished with value: 0.23833466400167072 and parameters: {'n_estimators': 82, 'learning_rate': 0.19998375741966581, 'max_depth': 15, 'subsample': 0.7833898049134594, 'colsample_bytree': 0.6940134327392335, 'reg_alpha': 3.648969537042912, 'reg_lambda': 8.204228530266064}. Best is trial 1 with value: 0.0841290418744912.


✅ Completed Trial 4 - MAPE: 0.2383
🔄 Starting Trial 5 out of 20


[I 2025-01-29 17:21:36,261] Trial 4 finished with value: 0.10987926137882316 and parameters: {'n_estimators': 53, 'learning_rate': 0.08974027851137567, 'max_depth': 11, 'subsample': 0.9774975954165681, 'colsample_bytree': 0.9543872762503521, 'reg_alpha': 4.714620210876589, 'reg_lambda': 0.9803809485605001}. Best is trial 1 with value: 0.0841290418744912.


✅ Completed Trial 5 - MAPE: 0.1099
🔄 Starting Trial 6 out of 20


[I 2025-01-29 17:21:39,081] Trial 5 finished with value: 0.22343427784708228 and parameters: {'n_estimators': 120, 'learning_rate': 0.1526530369676776, 'max_depth': 12, 'subsample': 0.5726988550614781, 'colsample_bytree': 0.6436627865544184, 'reg_alpha': 4.704590912179203, 'reg_lambda': 3.7351571803809103}. Best is trial 1 with value: 0.0841290418744912.


✅ Completed Trial 6 - MAPE: 0.2234
🔄 Starting Trial 7 out of 20


In [26]:
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 10),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 10),
        "tree_method": "gpu_hist",  # GPU acceleration
        "predictor": "gpu_predictor",
    }

    model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42, **params)

    evals = [(X_test, y_test)]
    
    model.fit(
        X_train, y_train, 
        early_stopping_rounds=20,  # Stops training if no improvement
        verbose=False
    )
    
    y_pred = model.predict(X_val)
    return mean_absolute_percentage_error(y_val, y_pred)

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)

best_params = study.best_params
print("Best Parameters:", best_params)


[I 2025-01-29 16:03:27,105] A new study created in memory with name: no-name-d4676eec-2693-4456-9a70-b15157d88ba9
[W 2025-01-29 16:03:27,108] Trial 0 failed with parameters: {'n_estimators': 256, 'learning_rate': 0.17142437944689945, 'max_depth': 8, 'subsample': 0.8200549705393854, 'colsample_bytree': 0.5537958292120306, 'reg_alpha': 8.199926501817226, 'reg_lambda': 0.6956843754967157} because of the following error: TypeError("XGBModel.fit() got an unexpected keyword argument 'evals'").
Traceback (most recent call last):
  File "c:\Users\Leo\.pyenv\pyenv-win-venv\envs\venv_kaggle\lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\Leo\AppData\Local\Temp\ipykernel_21908\1649746809.py", line 18, in objective
    model.fit(
  File "c:\Users\Leo\.pyenv\pyenv-win-venv\envs\venv_kaggle\lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
TypeError: XGBModel.fit() got an unexpected keyword argu

TypeError: XGBModel.fit() got an unexpected keyword argument 'evals'

In [None]:
# Train with optimized parameters
best_xgb = xgb.XGBRegressor(objective='reg:squarederror', random_state=42, **best_params)
best_xgb.fit(X_train, y_train)

# Predict on test data
y_pred = best_xgb.predict(X_test)

# Compute MAPE
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"Optimized Test MAPE: {mape:.4f}")