In [1]:
import os

# Get the directory containing the current notebook
notebook_dir = os.path.dirname(os.path.abspath("__file__"))

# Change working directory to notebook folder
os.chdir(notebook_dir)

# Verify
print("Current working directory:", os.getcwd())

Current working directory: /home/j/jl1416/sta521/521PredictionProject-1


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import TargetEncoder  # sklearn's TargetEncoder
from xgboost import XGBClassifier
import optuna

# -----------------------------
# Load cleaned data
# -----------------------------
train = pd.read_csv('data/data_cleaned/train_clean.csv')
holdout = pd.read_csv('data/data_cleaned/holdout_cleaned.csv')

y_train = train['damage_grade']
X_train = train.drop(columns=['damage_grade'])

y_holdout = holdout['damage_grade']
X_holdout = holdout.drop(columns=['damage_grade'])

# Features to target‐encode
geo_target = ['geo__geo_level_2_id', 'geo__geo_level_3_id']

In [None]:
# -----------------------------
# Define Optuna objective
# -----------------------------
def objective(trial):
    # Hyperparameters for XGBoost
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'eval_metric': 'mlogloss',
        'random_state': 42
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = []

    for train_idx, val_idx in skf.split(X_train, y_train):
        X_tr = X_train.iloc[train_idx].copy()
        X_val = X_train.iloc[val_idx].copy()
        y_tr = y_train.iloc[train_idx]
        y_val = y_train.iloc[val_idx]

        # Use sklearn TargetEncoder
        te = TargetEncoder(cv=5, shuffle=True, random_state=42, target_type = "multiclass")
        # Note: fit_transform does *cross‑fitting* internally in sklearn version.
        X_tr_enc = te.fit_transform(X_tr[geo_target], y_tr)
        X_val_enc = te.transform(X_val[geo_target])

        # Replace original geo columns with encoded ones
        X_tr_enc = pd.DataFrame(X_tr_enc, index=X_tr.index, columns=te.get_feature_names_out(geo_target))
        X_val_enc = pd.DataFrame(X_val_enc, index=X_val.index, columns=te.get_feature_names_out(geo_target))

        X_tr_full = X_tr.copy()
        X_val_full = X_val.copy()
        X_tr_full.drop(columns=geo_target, inplace=True)
        X_val_full.drop(columns=geo_target, inplace=True)
        # concat encoded
        X_tr_full = pd.concat([X_tr_full, X_tr_enc], axis=1)
        X_val_full = pd.concat([X_val_full, X_val_enc], axis=1)

        # Train model
        model = XGBClassifier(**params,
                              n_jobs=1,)
        model.fit(X_tr_full, y_tr)

        acc = model.score(X_val_full, y_val)
        cv_scores.append(acc)

    # Return mean accuracy so Optuna maximizes it
    return np.mean(cv_scores)

# -----------------------------
# Run Optuna
# -----------------------------
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=25, show_progress_bar=True,)

print("Best params:", study.best_params)
print("Best CV accuracy:", study.best_value)

[I 2025-11-21 17:05:07,097] A new study created in memory with name: no-name-1cd0303a-8686-4326-9ed8-0dc4fc371fbc


  0%|          | 0/25 [00:00<?, ?it/s]

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-21 17:06:03,305] Trial 0 finished with value: 0.741488888888889 and parameters: {'n_estimators': 216, 'max_depth': 3, 'learning_rate': 0.1797943611667039, 'subsample': 0.8048666776835782, 'colsample_bytree': 0.9902810568124285, 'gamma': 0.34000616371133163, 'reg_alpha': 1.1344582316681995, 'reg_lambda': 4.12774455480776}. Best is trial 0 with value: 0.741488888888889.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-21 17:06:47,870] Trial 1 finished with value: 0.7365388888888889 and parameters: {'n_estimators': 134, 'max_depth': 4, 'learning_rate': 0.04951598814960773, 'subsample': 0.9290817758125692, 'colsample_bytree': 0.7845664612615483, 'gamma': 4.962738981367995, 'reg_alpha': 0.17066069734840483, 'reg_lambda': 1.1528284919611491}. Best is trial 0 with value: 0.741488888888889.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-21 17:09:59,401] Trial 2 finished with value: 0.74075 and parameters: {'n_estimators': 676, 'max_depth': 6, 'learning_rate': 0.03018040904665458, 'subsample': 0.7099090350228995, 'colsample_bytree': 0.9817855240079914, 'gamma': 2.801687776208746, 'reg_alpha': 3.2851286521106315, 'reg_lambda': 4.14971966844554}. Best is trial 0 with value: 0.741488888888889.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-21 17:12:25,255] Trial 3 finished with value: 0.7402222222222222 and parameters: {'n_estimators': 299, 'max_depth': 9, 'learning_rate': 0.013526876851116984, 'subsample': 0.9676059443312598, 'colsample_bytree': 0.5667502105363655, 'gamma': 2.938400927994007, 'reg_alpha': 2.105033331258939, 'reg_lambda': 1.7756885543279533}. Best is trial 0 with value: 0.741488888888889.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-21 17:15:26,369] Trial 4 finished with value: 0.7421055555555556 and parameters: {'n_estimators': 918, 'max_depth': 9, 'learning_rate': 0.14567180141790112, 'subsample': 0.6256512093576918, 'colsample_bytree': 0.6548213826728209, 'gamma': 2.91437319172021, 'reg_alpha': 0.24198323718076953, 'reg_lambda': 0.2363192584019158}. Best is trial 4 with value: 0.7421055555555556.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-21 17:19:03,941] Trial 5 finished with value: 0.7423055555555556 and parameters: {'n_estimators': 926, 'max_depth': 3, 'learning_rate': 0.09364571036648253, 'subsample': 0.5974787285362662, 'colsample_bytree': 0.8125475766053756, 'gamma': 1.2270973794881073, 'reg_alpha': 4.423728705480192, 'reg_lambda': 3.0712933499055683}. Best is trial 5 with value: 0.7423055555555556.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-21 17:21:54,427] Trial 6 finished with value: 0.7438388888888889 and parameters: {'n_estimators': 639, 'max_depth': 5, 'learning_rate': 0.11701572806686, 'subsample': 0.9285065361431866, 'colsample_bytree': 0.5463955042432984, 'gamma': 0.31120567409514965, 'reg_alpha': 0.4058117143945189, 'reg_lambda': 1.0905233715772606}. Best is trial 6 with value: 0.7438388888888889.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-21 17:26:34,417] Trial 7 finished with value: 0.7444499999999998 and parameters: {'n_estimators': 570, 'max_depth': 9, 'learning_rate': 0.0253672399626943, 'subsample': 0.6323996848125915, 'colsample_bytree': 0.8278547250898303, 'gamma': 0.4433802124243946, 'reg_alpha': 3.5086057887998283, 'reg_lambda': 1.419519543696535}. Best is trial 7 with value: 0.7444499999999998.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-21 17:28:22,310] Trial 8 finished with value: 0.7413444444444445 and parameters: {'n_estimators': 560, 'max_depth': 9, 'learning_rate': 0.19179919805808332, 'subsample': 0.9616421665289978, 'colsample_bytree': 0.8535094811021986, 'gamma': 2.581786965395829, 'reg_alpha': 3.9266369551053337, 'reg_lambda': 0.7717459523903653}. Best is trial 7 with value: 0.7444499999999998.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-21 17:30:07,888] Trial 9 finished with value: 0.7404999999999999 and parameters: {'n_estimators': 310, 'max_depth': 8, 'learning_rate': 0.03886103707207365, 'subsample': 0.9280643608409453, 'colsample_bytree': 0.6873839529054662, 'gamma': 4.103708559918785, 'reg_alpha': 1.9010225440296296, 'reg_lambda': 3.049212771664991}. Best is trial 7 with value: 0.7444499999999998.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-21 17:33:30,301] Trial 10 finished with value: 0.739338888888889 and parameters: {'n_estimators': 451, 'max_depth': 7, 'learning_rate': 0.010017769856236959, 'subsample': 0.535915597836628, 'colsample_bytree': 0.9009540838984895, 'gamma': 1.392583658142459, 'reg_alpha': 3.093317221154658, 'reg_lambda': 2.327575037027795}. Best is trial 7 with value: 0.7444499999999998.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-21 17:37:18,027] Trial 11 finished with value: 0.7411777777777777 and parameters: {'n_estimators': 717, 'max_depth': 5, 'learning_rate': 0.022962236240665568, 'subsample': 0.8050875529334027, 'colsample_bytree': 0.5269211058513754, 'gamma': 0.3390991180989234, 'reg_alpha': 4.984857538301879, 'reg_lambda': 1.4790433060124986}. Best is trial 7 with value: 0.7444499999999998.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-21 17:40:24,746] Trial 12 finished with value: 0.7444944444444445 and parameters: {'n_estimators': 735, 'max_depth': 6, 'learning_rate': 0.0824041797598461, 'subsample': 0.7147325598905802, 'colsample_bytree': 0.6793681391774947, 'gamma': 1.3011253659765392, 'reg_alpha': 1.38880845467131, 'reg_lambda': 0.11039550695243339}. Best is trial 12 with value: 0.7444944444444445.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-21 17:43:51,723] Trial 13 finished with value: 0.7440222222222223 and parameters: {'n_estimators': 788, 'max_depth': 10, 'learning_rate': 0.0734632090088986, 'subsample': 0.6798377445243565, 'colsample_bytree': 0.7056994369619912, 'gamma': 1.4831778749356919, 'reg_alpha': 1.3758401615498277, 'reg_lambda': 0.03894141417620722}. Best is trial 12 with value: 0.7444944444444445.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-21 17:46:55,119] Trial 14 finished with value: 0.7411888888888889 and parameters: {'n_estimators': 450, 'max_depth': 7, 'learning_rate': 0.018609030483122947, 'subsample': 0.780169063602023, 'colsample_bytree': 0.6210791998540476, 'gamma': 1.883410456912016, 'reg_alpha': 2.8315585993595125, 'reg_lambda': 0.5422744374676416}. Best is trial 12 with value: 0.7444944444444445.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-21 17:51:23,112] Trial 15 finished with value: 0.7386166666666667 and parameters: {'n_estimators': 830, 'max_depth': 6, 'learning_rate': 0.2638804360317435, 'subsample': 0.5090592064579826, 'colsample_bytree': 0.7538503861377494, 'gamma': 0.9486110925130917, 'reg_alpha': 3.7999920319435936, 'reg_lambda': 1.9136569693291738}. Best is trial 12 with value: 0.7444944444444445.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-21 17:53:44,540] Trial 16 finished with value: 0.7428666666666667 and parameters: {'n_estimators': 527, 'max_depth': 8, 'learning_rate': 0.06417723640270849, 'subsample': 0.6430523753913525, 'colsample_bytree': 0.853313759485391, 'gamma': 2.1526153289614998, 'reg_alpha': 2.1974382356888857, 'reg_lambda': 2.6412487828798925}. Best is trial 12 with value: 0.7444944444444445.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-21 18:00:03,953] Trial 17 finished with value: 0.7448055555555555 and parameters: {'n_estimators': 991, 'max_depth': 10, 'learning_rate': 0.04021566907442413, 'subsample': 0.7222001663769937, 'colsample_bytree': 0.7325414855630985, 'gamma': 0.7519810413502865, 'reg_alpha': 1.304179581833659, 'reg_lambda': 4.93220580005576}. Best is trial 17 with value: 0.7448055555555555.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-21 18:05:04,727] Trial 18 finished with value: 0.7451722222222223 and parameters: {'n_estimators': 841, 'max_depth': 10, 'learning_rate': 0.039885177267947027, 'subsample': 0.7210046014157027, 'colsample_bytree': 0.61136650823022, 'gamma': 0.8957128746953156, 'reg_alpha': 1.016789949660241, 'reg_lambda': 4.812323005334775}. Best is trial 18 with value: 0.7451722222222223.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-21 18:10:38,499] Trial 19 finished with value: 0.7450666666666668 and parameters: {'n_estimators': 988, 'max_depth': 10, 'learning_rate': 0.038921554461787265, 'subsample': 0.7567532128843691, 'colsample_bytree': 0.6076427993249945, 'gamma': 0.8278179519671592, 'reg_alpha': 0.8914778523524318, 'reg_lambda': 4.978865169177517}. Best is trial 18 with value: 0.7451722222222223.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-21 18:13:49,267] Trial 20 finished with value: 0.7414444444444445 and parameters: {'n_estimators': 852, 'max_depth': 10, 'learning_rate': 0.05142371031142157, 'subsample': 0.8418695666014224, 'colsample_bytree': 0.5983415186982628, 'gamma': 3.5834626533636476, 'reg_alpha': 0.801911072129917, 'reg_lambda': 4.8042375371595085}. Best is trial 18 with value: 0.7451722222222223.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-21 18:19:29,665] Trial 21 finished with value: 0.74495 and parameters: {'n_estimators': 999, 'max_depth': 10, 'learning_rate': 0.0373355336305295, 'subsample': 0.7507902048064113, 'colsample_bytree': 0.6239914375838714, 'gamma': 0.8622140772570133, 'reg_alpha': 0.8954745086272433, 'reg_lambda': 4.834752296116963}. Best is trial 18 with value: 0.7451722222222223.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-21 18:23:50,188] Trial 22 finished with value: 0.7431277777777778 and parameters: {'n_estimators': 977, 'max_depth': 10, 'learning_rate': 0.030431384336951545, 'subsample': 0.8525250349626231, 'colsample_bytree': 0.6121378615691558, 'gamma': 2.048485379514596, 'reg_alpha': 0.6850300144175397, 'reg_lambda': 4.241373331292286}. Best is trial 18 with value: 0.7451722222222223.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-21 18:30:06,055] Trial 23 finished with value: 0.7443888888888889 and parameters: {'n_estimators': 899, 'max_depth': 8, 'learning_rate': 0.01843040760076561, 'subsample': 0.7580526473906376, 'colsample_bytree': 0.5002508793698, 'gamma': 0.053848559715546473, 'reg_alpha': 1.7481508446411889, 'reg_lambda': 3.6386748124193686}. Best is trial 18 with value: 0.7451722222222223.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-21 18:34:49,474] Trial 24 finished with value: 0.7453555555555555 and parameters: {'n_estimators': 990, 'max_depth': 10, 'learning_rate': 0.04321824739429615, 'subsample': 0.875086850408032, 'colsample_bytree': 0.6474727673680003, 'gamma': 0.8126279635172744, 'reg_alpha': 0.8280296681292646, 'reg_lambda': 4.603013353075994}. Best is trial 24 with value: 0.7453555555555555.
Best params: {'n_estimators': 990, 'max_depth': 10, 'learning_rate': 0.04321824739429615, 'subsample': 0.875086850408032, 'colsample_bytree': 0.6474727673680003, 'gamma': 0.8126279635172744, 'reg_alpha': 0.8280296681292646, 'reg_lambda': 4.603013353075994}
Best CV accuracy: 0.7453555555555555


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
# -----------------------------
# Train final model on full train set with target encoding
# -----------------------------
te_final = TargetEncoder(cv=5, shuffle=True, random_state=42)
X_train_enc = te_final.fit_transform(X_train[geo_target], y_train)
X_holdout_enc = te_final.transform(X_holdout[geo_target])

X_train_enc = pd.DataFrame(X_train_enc, index=X_train.index, columns=te_final.get_feature_names_out(geo_target))
X_holdout_enc = pd.DataFrame(X_holdout_enc, index=X_holdout.index, columns=te_final.get_feature_names_out(geo_target))

X_train_full = X_train.copy()
X_holdout_full = X_holdout.copy()
X_train_full.drop(columns=geo_target, inplace=True)
X_holdout_full.drop(columns=geo_target, inplace=True)

X_train_full = pd.concat([X_train_full, X_train_enc], axis=1)
X_holdout_full = pd.concat([X_holdout_full, X_holdout_enc], axis=1)

final_model = XGBClassifier(**study.best_params, eval_metric='mlogloss', random_state=42, n_jobs=1)
final_model.fit(X_train_full, y_train)

# -----------------------------
# Evaluate on holdout
# -----------------------------
holdout_acc = final_model.score(X_holdout_full, y_holdout)
print("Holdout accuracy:", holdout_acc)




Holdout accuracy: 0.74925


In [12]:
import pickle

with open("artifacts/final_model_xgb.pkl", "wb") as f:
    pickle.dump(final_model, f)