In [1]:
import os

# Get the directory containing the current notebook
notebook_dir = os.path.dirname(os.path.abspath("__file__"))

# Change working directory to notebook folder
os.chdir(notebook_dir)

# Verify
print("Current working directory:", os.getcwd())


Current working directory: /home/j/jl1416/sta521/521PredictionProject-1


In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
import optuna
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score


# ============================================================
# 2. Load data
# ============================================================
df = pd.read_csv("data/nepal_dat.csv")

cat_cols = [
    'land_surface_condition',
    'foundation_type',
    'roof_type',
    'ground_floor_type',
    'other_floor_type',
    'position',
    'plan_configuration',
    'legal_ownership_status'
]

# Convert categorical → integer codes
for c in cat_cols:
    df[c] = df[c].astype('category').cat.codes

# ============================================================
# 3. Fix label issue (shift min to 0)
# ============================================================
y_raw = df.iloc[:, -1]
print("Original unique labels:", sorted(y_raw.unique()))

df.iloc[:, -1] = y_raw - y_raw.min()
y = df.iloc[:, -1].astype(int)
print("Fixed labels:", sorted(y.unique()))

X = df.iloc[:, :-1]

# ============================================================
# 4. Train/test split
# ============================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ============================================================
# 5. Optuna objective — 3-fold CV, single-threaded for testing
# ============================================================
def objective(trial):

    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "eta": trial.suggest_float("eta", 0.01, 0.3),
        "max_depth": trial.suggest_int("max_depth", 3, 8),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "lambda": trial.suggest_float("lambda", 1e-4, 10.0),
        "alpha": trial.suggest_float("alpha", 1e-4, 10.0),
        "min_child_weight": trial.suggest_float("min_child_weight", 1e-2, 10.0),
    }

    model = xgb.XGBClassifier(
        **params,
        n_estimators=300,         # fewer trees for faster testing
        tree_method="hist",
        n_jobs=1,                 # use 1 CPU to avoid cluster contention
        use_label_encoder=False
    )

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    cv_acc = cross_val_score(
        model, X_train, y_train,
        cv=cv, scoring="accuracy"
    ).mean()

    return cv_acc

# ============================================================
# 6. Run Optuna with trial logging
# ============================================================
def logging_callback(study, trial):
    print(f"Trial {trial.number}: CV Accuracy = {trial.value:.4f}, Params = {trial.params}")

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10, show_progress_bar=True, callbacks=[logging_callback])

print("\n========================")
print("Best trial:")
print("CV Accuracy:", study.best_value)
print("Best params:", study.best_params)
print("========================\n")

best_params = study.best_params

# ============================================================
# 7. Train final model on full training data
# ============================================================
final_model = xgb.XGBClassifier(
    **best_params,
    n_estimators=300,
    tree_method="hist",
    n_jobs=1,
    use_label_encoder=False
)
final_model.fit(X_train, y_train)

# ============================================================
# 8. Evaluate on held-out test set
# ============================================================
y_pred = final_model.predict(X_test)
test_acc = accuracy_score(y_test, y_pred)
print(f"Final Test Accuracy: {test_acc:.4f}")




[I 2025-11-20 12:41:42,505] A new study created in memory with name: no-name-56cf77ca-0f8c-43b6-b459-2b0b790c662d


Original unique labels: [np.int64(1), np.int64(2), np.int64(3)]
Fixed labels: [np.int64(0), np.int64(1), np.int64(2)]


  0%|          | 0/10 [00:00<?, ?it/s]

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-20 12:42:35,033] Trial 0 finished with value: 0.73668125 and parameters: {'eta': 0.16948006655570705, 'max_depth': 7, 'subsample': 0.9298761891465612, 'colsample_bytree': 0.8508328540335086, 'lambda': 4.601724855849726, 'alpha': 2.105291752500521, 'min_child_weight': 0.18327638378081637}. Best is trial 0 with value: 0.73668125.
Trial 0: CV Accuracy = 0.7367, Params = {'eta': 0.16948006655570705, 'max_depth': 7, 'subsample': 0.9298761891465612, 'colsample_bytree': 0.8508328540335086, 'lambda': 4.601724855849726, 'alpha': 2.105291752500521, 'min_child_weight': 0.18327638378081637}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-20 12:43:41,724] Trial 1 finished with value: 0.73663125 and parameters: {'eta': 0.27916151030457614, 'max_depth': 8, 'subsample': 0.7835800561880242, 'colsample_bytree': 0.9989389848733399, 'lambda': 4.500413890075214, 'alpha': 7.075351754755481, 'min_child_weight': 1.8438990127762869}. Best is trial 0 with value: 0.73668125.
Trial 1: CV Accuracy = 0.7366, Params = {'eta': 0.27916151030457614, 'max_depth': 8, 'subsample': 0.7835800561880242, 'colsample_bytree': 0.9989389848733399, 'lambda': 4.500413890075214, 'alpha': 7.075351754755481, 'min_child_weight': 1.8438990127762869}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-20 12:44:22,899] Trial 2 finished with value: 0.7167250000000001 and parameters: {'eta': 0.09898481180623188, 'max_depth': 5, 'subsample': 0.9029495216618082, 'colsample_bytree': 0.8997651349837517, 'lambda': 0.8846561529581534, 'alpha': 2.9014982223752317, 'min_child_weight': 1.5408787690937198}. Best is trial 0 with value: 0.73668125.
Trial 2: CV Accuracy = 0.7167, Params = {'eta': 0.09898481180623188, 'max_depth': 5, 'subsample': 0.9029495216618082, 'colsample_bytree': 0.8997651349837517, 'lambda': 0.8846561529581534, 'alpha': 2.9014982223752317, 'min_child_weight': 1.5408787690937198}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-20 12:45:22,334] Trial 3 finished with value: 0.7051000000000001 and parameters: {'eta': 0.022374358318878802, 'max_depth': 8, 'subsample': 0.9380170129604253, 'colsample_bytree': 0.5577973672616098, 'lambda': 3.0422804687532774, 'alpha': 9.407457768764939, 'min_child_weight': 8.453398054595645}. Best is trial 0 with value: 0.73668125.
Trial 3: CV Accuracy = 0.7051, Params = {'eta': 0.022374358318878802, 'max_depth': 8, 'subsample': 0.9380170129604253, 'colsample_bytree': 0.5577973672616098, 'lambda': 3.0422804687532774, 'alpha': 9.407457768764939, 'min_child_weight': 8.453398054595645}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-20 12:46:18,690] Trial 4 finished with value: 0.7294625 and parameters: {'eta': 0.1644967987097594, 'max_depth': 7, 'subsample': 0.5655741045626926, 'colsample_bytree': 0.583960876699295, 'lambda': 2.969069499188906, 'alpha': 8.799534171469828, 'min_child_weight': 6.30713135037035}. Best is trial 0 with value: 0.73668125.
Trial 4: CV Accuracy = 0.7295, Params = {'eta': 0.1644967987097594, 'max_depth': 7, 'subsample': 0.5655741045626926, 'colsample_bytree': 0.583960876699295, 'lambda': 2.969069499188906, 'alpha': 8.799534171469828, 'min_child_weight': 6.30713135037035}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-20 12:47:09,571] Trial 5 finished with value: 0.736325 and parameters: {'eta': 0.1788195141526628, 'max_depth': 7, 'subsample': 0.733866283897083, 'colsample_bytree': 0.5903945265155677, 'lambda': 1.6783424905827002, 'alpha': 1.0077032660581027, 'min_child_weight': 2.4022922447704156}. Best is trial 0 with value: 0.73668125.
Trial 5: CV Accuracy = 0.7363, Params = {'eta': 0.1788195141526628, 'max_depth': 7, 'subsample': 0.733866283897083, 'colsample_bytree': 0.5903945265155677, 'lambda': 1.6783424905827002, 'alpha': 1.0077032660581027, 'min_child_weight': 2.4022922447704156}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-20 12:47:58,246] Trial 6 finished with value: 0.7165375 and parameters: {'eta': 0.08012234817927963, 'max_depth': 6, 'subsample': 0.6837375987944434, 'colsample_bytree': 0.6458610170849806, 'lambda': 9.912962237683256, 'alpha': 7.540396380882968, 'min_child_weight': 8.733004989245659}. Best is trial 0 with value: 0.73668125.
Trial 6: CV Accuracy = 0.7165, Params = {'eta': 0.08012234817927963, 'max_depth': 6, 'subsample': 0.6837375987944434, 'colsample_bytree': 0.6458610170849806, 'lambda': 9.912962237683256, 'alpha': 7.540396380882968, 'min_child_weight': 8.733004989245659}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-20 12:48:50,983] Trial 7 finished with value: 0.72680625 and parameters: {'eta': 0.08812413152064309, 'max_depth': 7, 'subsample': 0.9261440060198507, 'colsample_bytree': 0.9206253101810523, 'lambda': 0.6104079672026451, 'alpha': 8.685591141364432, 'min_child_weight': 4.840281664936753}. Best is trial 0 with value: 0.73668125.
Trial 7: CV Accuracy = 0.7268, Params = {'eta': 0.08812413152064309, 'max_depth': 7, 'subsample': 0.9261440060198507, 'colsample_bytree': 0.9206253101810523, 'lambda': 0.6104079672026451, 'alpha': 8.685591141364432, 'min_child_weight': 4.840281664936753}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-20 12:49:28,853] Trial 8 finished with value: 0.706475 and parameters: {'eta': 0.10444813321484717, 'max_depth': 4, 'subsample': 0.8063915976799777, 'colsample_bytree': 0.8854280232449196, 'lambda': 5.91345416281043, 'alpha': 5.809701925052328, 'min_child_weight': 2.3576184617158593}. Best is trial 0 with value: 0.73668125.
Trial 8: CV Accuracy = 0.7065, Params = {'eta': 0.10444813321484717, 'max_depth': 4, 'subsample': 0.8063915976799777, 'colsample_bytree': 0.8854280232449196, 'lambda': 5.91345416281043, 'alpha': 5.809701925052328, 'min_child_weight': 2.3576184617158593}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-20 12:50:03,403] Trial 9 finished with value: 0.6937874999999999 and parameters: {'eta': 0.10142415547986305, 'max_depth': 3, 'subsample': 0.6529017116507811, 'colsample_bytree': 0.594091120362034, 'lambda': 5.332401272524328, 'alpha': 3.1924113796251334, 'min_child_weight': 8.557655034255982}. Best is trial 0 with value: 0.73668125.
Trial 9: CV Accuracy = 0.6938, Params = {'eta': 0.10142415547986305, 'max_depth': 3, 'subsample': 0.6529017116507811, 'colsample_bytree': 0.594091120362034, 'lambda': 5.332401272524328, 'alpha': 3.1924113796251334, 'min_child_weight': 8.557655034255982}

Best trial:
CV Accuracy: 0.73668125
Best params: {'eta': 0.16948006655570705, 'max_depth': 7, 'subsample': 0.9298761891465612, 'colsample_bytree': 0.8508328540335086, 'lambda': 4.601724855849726, 'alpha': 2.105291752500521, 'min_child_weight': 0.18327638378081637}



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Final Test Accuracy: 0.7393


In [3]:
print(sorted(df.iloc[:, -1].unique()))


[np.int64(0), np.int64(1), np.int64(2)]


In [4]:
import numpy as np

# Assuming y_train is a NumPy array
unique_values = np.unique(y_train)
unique_values

array([0, 1, 2])