In [1]:
import os

# Pick the directory to run from so that relative paths in this notebook
# are resolved against it.
#
# `__file__` must be referenced as a name, not as the string "__file__":
# abspath("__file__") merely resolves to "<cwd>/__file__", whose dirname is
# the current working directory, turning the chdir below into a silent no-op.
# Interactive kernels typically do not define `__file__`, so in that case we
# fall back explicitly to the current working directory (same result as
# before); when this code is run as a script, the script's folder is used.
try:
    notebook_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    notebook_dir = os.getcwd()

# Change working directory to the resolved folder
os.chdir(notebook_dir)

# Verify
print("Current working directory:", os.getcwd())

Current working directory: /home/j/jl1416/sta521/521PredictionProject-1


In [2]:
import pandas as pd
from sklearn.preprocessing import TargetEncoder  
from xgboost import XGBClassifier
from sklearn.metrics import cohen_kappa_score,  accuracy_score

# -----------------------------
# Read the cleaned train / holdout splits
# -----------------------------
train = pd.read_csv('data/cleaned/train_cleaned.csv')
holdout = pd.read_csv('data/cleaned/test_cleaned.csv')

# Stack both splits row-wise under a fresh 0..n-1 index.
data = pd.concat([train, holdout], axis=0, ignore_index=True)

# Separate the label column from the feature columns.
y = data['damage_grade']
X = data.drop(columns=['damage_grade'])

# Features to target-encode
geo_target = ['geo__geo_level_2_id', 'geo__geo_level_3_id']

In [3]:
import pickle

# Reload the previously trained XGBoost estimator so that its parameters
# can be read back out.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load artifacts that were produced by this project.
with open("artifacts/train_model_xgb.pkl", "rb") as model_file:
    model = pickle.load(model_file)

# Full parameter dict of the loaded estimator, shown for reference.
best_params = model.get_params()
print(best_params)

{'objective': 'multi:softprob', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': 0.5025969909985456, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': 'mlogloss', 'feature_types': None, 'feature_weights': None, 'gamma': 0.41127960629317606, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.05663399296149698, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': 9, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': 1258, 'n_jobs': 1, 'num_parallel_tree': None, 'random_state': 42, 'reg_alpha': 1.0712376905472907, 'reg_lambda': 0.7143681038483615, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': 0.8519852631186717, 'tree_method': None, 'validate_parameters': None, 'verbosity'

In [4]:
# -----------------------------
# Fit the final encoder and model on all rows
# -----------------------------
te_final = TargetEncoder(cv=5, shuffle=True, random_state=42)

# The encoder has to be fitted before its output feature names are requested,
# so the encoded values are computed first and wrapped afterwards.
geo_encoded = te_final.fit_transform(X[geo_target], y)
X_enc = pd.DataFrame(
    geo_encoded,
    index=X.index,
    columns=te_final.get_feature_names_out(geo_target),
)

# Swap the raw geo ID columns for their encoded counterparts (X is left untouched).
X_full = pd.concat([X.drop(columns=geo_target), X_enc], axis=1)

final_model = XGBClassifier(**best_params)
final_model.fit(X_full, y)

# -----------------------------
# Score the fitted model on the same rows it was trained on
# -----------------------------
y_pred = final_model.predict(X_full)

train_qwk = cohen_kappa_score(y_pred, y, weights="quadratic")
train_acc = accuracy_score(y_pred, y)

print("Train QWK:", train_qwk)
print("Train Accuracy:", train_acc)

Train QWK: 0.7466922478800936
Train Accuracy: 0.829575


In [5]:
# Persist the fitted target encoder and the final model as pickle artifacts.
for artifact_path, artifact in (
    ("artifacts/final_target_encoder.pkl", te_final),
    ("artifacts/final_model_xgb.pkl", final_model),
):
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)