In [1]:
import os

# Resolve the directory this notebook/script should run from.
# NOTE: the previous version passed the string literal "__file__" to abspath,
# which names a file called "__file__" in the current directory, so the result
# was always just the current working directory. Notebook kernels typically do
# not define __file__; it is available when this code runs as an exported script.
_source_path = globals().get("__file__")
if _source_path:
    notebook_dir = os.path.dirname(os.path.abspath(_source_path))
else:
    # No source path available: keep the directory the kernel was started in.
    notebook_dir = os.getcwd()

# Change working directory so relative paths resolve against notebook_dir
os.chdir(notebook_dir)

# Verify
print("Current working directory:", os.getcwd())

Current working directory: /home/j/jl1416/sta521/521PredictionProject-1


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import TargetEncoder  
from xgboost import XGBClassifier
import optuna
from sklearn.metrics import cohen_kappa_score

# -----------------------------
# Read the cleaned train / holdout tables
# -----------------------------
train = pd.read_csv('data/cleaned/train_cleaned.csv')
holdout = pd.read_csv('data/cleaned/test_cleaned.csv')

# Separate the label column ('damage_grade') from the predictors in each table
X_train, y_train = train.drop(columns=['damage_grade']), train['damage_grade']
X_holdout, y_holdout = holdout.drop(columns=['damage_grade']), holdout['damage_grade']

# Features to target-encode
geo_target = ['geo__geo_level_2_id', 'geo__geo_level_3_id']

In [None]:
# -----------------------------
# Define Optuna objective
# -----------------------------
def objective(trial):
    """Score one Optuna trial by mean 5-fold quadratic-weighted kappa (QWK).

    For each stratified fold of ``X_train`` / ``y_train``, the columns listed
    in ``geo_target`` are target-encoded with an encoder fitted on that fold's
    training rows, an ``XGBClassifier`` is trained with the hyperparameters
    sampled from ``trial``, and its validation predictions are scored with
    quadratic-weighted Cohen's kappa.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the XGBoost hyperparameters.

    Returns
    -------
    float
        Mean of the per-fold QWK scores (higher is better).
    """
    # Sampled search space plus the fixed model settings
    xgb_kwargs = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 1500),
        max_depth=trial.suggest_int('max_depth', 3, 15),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        subsample=trial.suggest_float('subsample', 0.5, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
        gamma=trial.suggest_float('gamma', 0, 5),
        reg_alpha=trial.suggest_float('reg_alpha', 0, 6),
        reg_lambda=trial.suggest_float('reg_lambda', 0, 6),
        eval_metric='mlogloss',
        random_state=42,
    )

    def swap_in_encoded(frame, encoded_values, encoded_names):
        """Return ``frame`` with the raw geo columns replaced by their encoded columns."""
        encoded_frame = pd.DataFrame(encoded_values, index=frame.index, columns=encoded_names)
        return pd.concat([frame.drop(columns=geo_target), encoded_frame], axis=1)

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_kappas = []

    for fit_rows, check_rows in folds.split(X_train, y_train):
        X_fit, y_fit = X_train.iloc[fit_rows], y_train.iloc[fit_rows]
        X_check, y_check = X_train.iloc[check_rows], y_train.iloc[check_rows]

        # The encoder only ever sees this fold's training rows;
        # sklearn's fit_transform additionally cross-fits internally.
        encoder = TargetEncoder(cv=5, shuffle=True, random_state=42, target_type="multiclass")
        fit_geo = encoder.fit_transform(X_fit[geo_target], y_fit)
        check_geo = encoder.transform(X_check[geo_target])
        encoded_names = encoder.get_feature_names_out(geo_target)

        fit_features = swap_in_encoded(X_fit, fit_geo, encoded_names)
        check_features = swap_in_encoded(X_check, check_geo, encoded_names)

        # Fit on the encoded training rows, then score the held-out rows
        booster = XGBClassifier(**xgb_kwargs, n_jobs=1)
        booster.fit(fit_features, y_fit)
        check_pred = booster.predict(check_features)

        fold_kappas.append(cohen_kappa_score(check_pred, y_check, weights="quadratic"))

    # Mean QWK across folds; the study is configured to maximize this value
    return np.mean(fold_kappas)

# -----------------------------
# Run Optuna
# -----------------------------
# Seed the sampler explicitly: without a seed each run proposes different
# hyperparameters, so the reported "best" values cannot be reproduced after a
# kernel restart. TPESampler is Optuna's default algorithm, so only the
# randomness is pinned. Reproducibility also relies on n_jobs=1 below.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=2, show_progress_bar=True, n_jobs=1)

print("Best params:", study.best_params)
print("Best CV QWK:", study.best_value)

[I 2025-11-22 19:23:07,508] A new study created in memory with name: no-name-18491ec8-29c7-417f-a2e7-738046dbebcf


  0%|          | 0/2 [00:00<?, ?it/s]

[I 2025-11-22 19:26:15,832] Trial 0 finished with value: 0.6073134670286588 and parameters: {'n_estimators': 865, 'max_depth': 8, 'learning_rate': 0.12514680474594667, 'subsample': 0.5553499007101381, 'colsample_bytree': 0.6466978272237709, 'gamma': 2.3627881423301567, 'reg_alpha': 3.952795379573212, 'reg_lambda': 0.3322304800896623}. Best is trial 0 with value: 0.6073134670286588.
[I 2025-11-22 19:29:50,189] Trial 1 finished with value: 0.600769088341466 and parameters: {'n_estimators': 480, 'max_depth': 11, 'learning_rate': 0.011165635344335475, 'subsample': 0.7067576444179209, 'colsample_bytree': 0.8703049837122702, 'gamma': 3.592196524918455, 'reg_alpha': 3.675466070293548, 'reg_lambda': 5.945418269681629}. Best is trial 0 with value: 0.6073134670286588.
Best params: {'n_estimators': 865, 'max_depth': 8, 'learning_rate': 0.12514680474594667, 'subsample': 0.5553499007101381, 'colsample_bytree': 0.6466978272237709, 'gamma': 2.3627881423301567, 'reg_alpha': 3.952795379573212, 'reg_lam

In [7]:
# -----------------------------
# Train final model on full train set with target encoding
# -----------------------------
# target_type is pinned to "multiclass" so this encoder is configured exactly
# like the one used inside the Optuna objective; relying on the default
# auto-detection could silently yield a different set of encoded columns than
# the ones the hyperparameters were tuned on.
te_final = TargetEncoder(cv=5, shuffle=True, random_state=42, target_type="multiclass")
X_train_enc = te_final.fit_transform(X_train[geo_target], y_train)
X_holdout_enc = te_final.transform(X_holdout[geo_target])

# Wrap the encoded arrays as DataFrames aligned to the original row indexes
encoded_cols = te_final.get_feature_names_out(geo_target)
X_train_enc = pd.DataFrame(X_train_enc, index=X_train.index, columns=encoded_cols)
X_holdout_enc = pd.DataFrame(X_holdout_enc, index=X_holdout.index, columns=encoded_cols)

# Replace the raw geo columns with their encoded versions.
# drop() returns a new frame, so X_train / X_holdout stay untouched and this
# cell can be re-run safely.
X_train_full = pd.concat([X_train.drop(columns=geo_target), X_train_enc], axis=1)
X_holdout_full = pd.concat([X_holdout.drop(columns=geo_target), X_holdout_enc], axis=1)

# Refit on the full training set with the best hyperparameters from the study
final_model = XGBClassifier(**study.best_params, eval_metric='mlogloss', random_state=42, n_jobs=1)
final_model.fit(X_train_full, y_train)

# -----------------------------
# Evaluate on holdout
# -----------------------------
y_pred = final_model.predict(X_holdout_full)
# NOTE: despite its name, holdout_acc holds quadratic-weighted kappa, not accuracy
holdout_acc = cohen_kappa_score(y_pred, y_holdout, weights="quadratic")
print("Holdout QWK:", holdout_acc)


Holdout QWK: 0.6167245460135715


In [None]:
import os
import pickle

# Persist the fitted XGBoost model.
# The output directory is created first: open(..., "wb") does not create
# missing parent directories and would raise FileNotFoundError on a fresh checkout.
# NOTE(review): only the model is saved here; predicting on new raw data also
# needs the fitted target encoder (te_final) — consider persisting it as well.
model_path = "artifacts/final_model_xgb.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)

with open(model_path, "wb") as f:
    pickle.dump(final_model, f)