In [5]:
import os

# Get the directory containing the current notebook
notebook_dir = os.path.dirname(os.path.abspath("__file__"))

# Change working directory to notebook folder
os.chdir(notebook_dir)

# Verify
print("Current working directory:", os.getcwd())


Current working directory: /home/j/jl1416/sta521/521PredictionProject-1


In [6]:
import numpy as np
import pandas as pd
import xgboost as xgb
import optuna
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score


# ============================================================
# 2. Load data
# ============================================================
df = pd.read_csv("nepal_dat.csv")

cat_cols = [
    'land_surface_condition',
    'foundation_type',
    'roof_type',
    'ground_floor_type',
    'other_floor_type',
    'position',
    'plan_configuration',
    'legal_ownership_status'
]

# Convert categorical → integer codes
for c in cat_cols:
    df[c] = df[c].astype('category').cat.codes

# ============================================================
# 3. Fix label issue (shift min to 0)
# ============================================================
y_raw = df.iloc[:, -1]
print("Original unique labels:", sorted(y_raw.unique()))

df.iloc[:, -1] = y_raw - y_raw.min()
y = df.iloc[:, -1].astype(int)
print("Fixed labels:", sorted(y.unique()))

X = df.iloc[:, :-1]

# ============================================================
# 4. Train/test split
# ============================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ============================================================
# 5. Optuna objective — 3-fold CV, single-threaded for testing
# ============================================================
def objective(trial):

    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "eta": trial.suggest_float("eta", 0.01, 0.3),
        "max_depth": trial.suggest_int("max_depth", 3, 8),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "lambda": trial.suggest_float("lambda", 1e-4, 10.0),
        "alpha": trial.suggest_float("alpha", 1e-4, 10.0),
        "min_child_weight": trial.suggest_float("min_child_weight", 1e-2, 10.0),
    }

    model = xgb.XGBClassifier(
        **params,
        n_estimators=100,         # fewer trees for faster testing
        tree_method="hist",
        n_jobs=1,                 # use 1 CPU to avoid cluster contention
        use_label_encoder=False
    )

    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    cv_acc = cross_val_score(
        model, X_train, y_train,
        cv=cv, scoring="accuracy"
    ).mean()

    return cv_acc

# ============================================================
# 6. Run Optuna with trial logging
# ============================================================
def logging_callback(study, trial):
    print(f"Trial {trial.number}: CV Accuracy = {trial.value:.4f}, Params = {trial.params}")

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=5, show_progress_bar=True, callbacks=[logging_callback])

print("\n========================")
print("Best trial:")
print("CV Accuracy:", study.best_value)
print("Best params:", study.best_params)
print("========================\n")

best_params = study.best_params

# ============================================================
# 7. Train final model on full training data
# ============================================================
final_model = xgb.XGBClassifier(
    **best_params,
    n_estimators=100,
    tree_method="hist",
    n_jobs=1,
    use_label_encoder=False
)
final_model.fit(X_train, y_train)

# ============================================================
# 8. Evaluate on held-out test set
# ============================================================
y_pred = final_model.predict(X_test)
test_acc = accuracy_score(y_test, y_pred)
print(f"Final Test Accuracy: {test_acc:.4f}")




[I 2025-11-20 10:28:13,163] A new study created in memory with name: no-name-b80457ef-cb06-42dc-9eb6-7c4ea1c5d539


Original unique labels: [np.int64(0), np.int64(1)]
Fixed labels: [np.int64(0), np.int64(1)]


  0%|          | 0/5 [00:00<?, ?it/s]

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-20 10:28:16,170] Trial 0 finished with value: 0.9987437487108474 and parameters: {'eta': 0.20380006294392666, 'max_depth': 6, 'subsample': 0.8323995749380602, 'colsample_bytree': 0.7793359594235184, 'lambda': 2.9825443660296482, 'alpha': 2.9945882485939728, 'min_child_weight': 3.434838955874113}. Best is trial 0 with value: 0.9987437487108474.
Trial 0: CV Accuracy = 0.9987, Params = {'eta': 0.20380006294392666, 'max_depth': 6, 'subsample': 0.8323995749380602, 'colsample_bytree': 0.7793359594235184, 'lambda': 2.9825443660296482, 'alpha': 2.9945882485939728, 'min_child_weight': 3.434838955874113}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-20 10:28:18,799] Trial 1 finished with value: 0.9981937480858084 and parameters: {'eta': 0.22366894497016698, 'max_depth': 8, 'subsample': 0.9623928343094416, 'colsample_bytree': 0.6463070985876935, 'lambda': 5.414232537861025, 'alpha': 8.587468980557295, 'min_child_weight': 7.971369572407363}. Best is trial 0 with value: 0.9987437487108474.
Trial 1: CV Accuracy = 0.9982, Params = {'eta': 0.22366894497016698, 'max_depth': 8, 'subsample': 0.9623928343094416, 'colsample_bytree': 0.6463070985876935, 'lambda': 5.414232537861025, 'alpha': 8.587468980557295, 'min_child_weight': 7.971369572407363}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-20 10:28:21,233] Trial 2 finished with value: 0.9972249991794758 and parameters: {'eta': 0.06355706535439878, 'max_depth': 4, 'subsample': 0.8093412651874884, 'colsample_bytree': 0.593496995729012, 'lambda': 5.668664984846425, 'alpha': 1.3495765462329596, 'min_child_weight': 8.84705598982478}. Best is trial 0 with value: 0.9987437487108474.
Trial 2: CV Accuracy = 0.9972, Params = {'eta': 0.06355706535439878, 'max_depth': 4, 'subsample': 0.8093412651874884, 'colsample_bytree': 0.593496995729012, 'lambda': 5.668664984846425, 'alpha': 1.3495765462329596, 'min_child_weight': 8.84705598982478}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-20 10:28:23,996] Trial 3 finished with value: 0.9983249996092466 and parameters: {'eta': 0.14654454397547997, 'max_depth': 5, 'subsample': 0.6293535284178464, 'colsample_bytree': 0.7231856585005059, 'lambda': 5.462286565052302, 'alpha': 6.288607871149431, 'min_child_weight': 0.8130435129896707}. Best is trial 0 with value: 0.9987437487108474.
Trial 3: CV Accuracy = 0.9983, Params = {'eta': 0.14654454397547997, 'max_depth': 5, 'subsample': 0.6293535284178464, 'colsample_bytree': 0.7231856585005059, 'lambda': 5.462286565052302, 'alpha': 6.288607871149431, 'min_child_weight': 0.8130435129896707}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[I 2025-11-20 10:28:26,908] Trial 4 finished with value: 0.998718749257717 and parameters: {'eta': 0.08742664355726355, 'max_depth': 7, 'subsample': 0.7681524269545084, 'colsample_bytree': 0.8716683494169302, 'lambda': 3.228741779154984, 'alpha': 5.212614436565548, 'min_child_weight': 1.7927814730665284}. Best is trial 0 with value: 0.9987437487108474.
Trial 4: CV Accuracy = 0.9987, Params = {'eta': 0.08742664355726355, 'max_depth': 7, 'subsample': 0.7681524269545084, 'colsample_bytree': 0.8716683494169302, 'lambda': 3.228741779154984, 'alpha': 5.212614436565548, 'min_child_weight': 1.7927814730665284}

Best trial:
CV Accuracy: 0.9987437487108474
Best params: {'eta': 0.20380006294392666, 'max_depth': 6, 'subsample': 0.8323995749380602, 'colsample_bytree': 0.7793359594235184, 'lambda': 2.9825443660296482, 'alpha': 2.9945882485939728, 'min_child_weight': 3.434838955874113}



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Final Test Accuracy: 0.9991


In [2]:
print(sorted(df.iloc[:, -1].unique()))


[np.int64(-1), np.int64(0)]


In [7]:
import numpy as np

# Assuming y_train is a NumPy array
unique_values = np.unique(y_train)
unique_values

array([0, 1])

In [None]:
import sklearn
print(pytz.__version__)




1.7.2
