In [18]:
import os

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, f1_score, log_loss, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Load data

In [3]:
# Location of the cleaned, labeled climate dataset. Set the CLIMATE_DATA_PATH
# environment variable to run this notebook on another machine; the original
# local path is kept as the fallback so existing runs are unaffected.
path = os.environ.get(
    "CLIMATE_DATA_PATH",
    r"C:\Users\matta\Desktop\Documents\Python\Geolocation\climate_data\working_data\clean_labeled_climate_data.pkl",
)

# NOTE: unpickling can execute arbitrary code -- only load files you trust.
df = pd.read_pickle(path)
df.shape

(470342, 125)

In [9]:
# Three-letter month prefixes used in the monthly climate column names.
month_abbrevs = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']

# Coordinates, then monthly precipitation, then monthly mean temperature.
temp_precip_cols = (
    ['longitude', 'latitude']
    + [f'{month}_precip' for month in month_abbrevs]
    + [f'{month}_meant' for month in month_abbrevs]
)

# Compact feature subset: coordinates plus a handful of summary variables.
select_cols = [
    'longitude',
    'latitude',
    'jan_tmin',
    'jul_maxt',
    'jan_dptmean',
    'jul_dptmean',
    'annual_precip',
]

# Candidate label columns available in the dataset.
target_cols = [
    'Level_1',
    'Level_2',
    'Level_3',
    'Level_4',
    'ECO_NAME',
    'climates_f',
]

# Train-test split

In [19]:
# Feature matrix: coordinates and monthly precip/temperature columns, followed
# by a few extra summary variables.
extra_feature_cols = ['jan_tmin', 'jul_maxt', 'jan_dptmean', 'jul_dptmean', 'annual_precip']
feature_cols = temp_precip_cols + extra_feature_cols
X = df[feature_cols]

# Target: the Level_1 column, mapped to integer class ids. `le` is kept so the
# ids can be translated back to the original labels later.
y = df['Level_1']
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# EPA Level 1

In [20]:
# Hold out 25% of the rows for testing, stratified on the target so both parts
# keep the same class proportions. The fixed random_state makes the split (and
# therefore every downstream metric) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.25, stratify=y, random_state=42
)

## XGBoost 

In [None]:
# Carve a validation set out of the training data for hyperparameter tuning.
# Scoring trials on X_test / y_test would leak the test set into model
# selection and make any later evaluation on it optimistically biased.
X_tune_train, X_tune_val, y_tune_train, y_tune_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)


# Define Optuna objective function
def objective(trial):
    """Train one XGBoost candidate and return its validation log loss.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters for this candidate.

    Returns
    -------
    float
        Multi-class log loss on the tuning validation split (lower is better).
    """
    params = {
        # Sampled hyperparameters
        "n_estimators": trial.suggest_int("n_estimators", 200, 800),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0, 2.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.1, 5.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        # Fixed settings shared by every trial
        "tree_method": "hist",
        "objective": "multi:softprob",
        "num_class": len(np.unique(y_train)),
        "random_state": 42,
        "eval_metric": "mlogloss",
        "n_jobs": -1
    }

    model = XGBClassifier(**params)

    # No eval_set: without early stopping it only adds per-round evaluation
    # cost and does not influence the fitted model.
    model.fit(X_tune_train, y_tune_train)

    # Predict probabilities on the held-out validation rows
    y_val_proba = model.predict_proba(X_tune_val)

    # Pass the label set explicitly so the probability columns are matched to
    # the right classes even if a rare class is absent from the validation rows.
    # The study is created with direction="minimize", so return the loss as-is.
    return log_loss(y_tune_val, y_val_proba, labels=model.classes_)

# Run Optuna study
# Seed the sampler so the sequence of suggested hyperparameters -- and hence
# the selected best trial -- is reproducible from a fresh kernel. TPE is
# Optuna's default single-objective sampler, so the search algorithm itself
# is unchanged.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30, show_progress_bar=True)

# Best trial summary
print("Best Trial:")
print(f"  Log Loss: {study.best_value:.4f}")
print("  Best Params:")
for key, value in study.best_params.items():
    print(f"    {key}: {value}")

[I 2025-10-28 19:54:55,150] A new study created in memory with name: no-name-5964f2d2-1629-4a70-a21c-570a05091a20


  0%|          | 0/30 [00:00<?, ?it/s]

In [None]:
# Retrain final model with best params
# study.best_params contains only the values sampled through trial.suggest_*,
# so the fixed settings are supplied again explicitly here.
best_params = study.best_params
best_model = XGBClassifier(
    **best_params,
    objective="multi:softprob",
    num_class=len(np.unique(y_train)),
    eval_metric="mlogloss",
    tree_method="hist",
    random_state=42,
    n_jobs=-1
)

best_model.fit(X_train, y_train)

# Evaluate final model
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)

# Report per-class metrics under the original label names rather than the
# opaque integer ids produced by the LabelEncoder. astype(str) keeps this
# working even when the original labels are numeric codes.
encoded_labels = np.arange(len(le.classes_))
class_names = le.classes_.astype(str)

print("\nClassification Report:")
print(classification_report(y_test, y_pred,
                            labels=encoded_labels,
                            target_names=class_names))

acc = accuracy_score(y_test, y_pred)
f1_macro = f1_score(y_test, y_pred, average="macro")
# Tie the probability columns to the model's own class order instead of
# letting log_loss infer the label set from y_test.
ll = log_loss(y_test, y_pred_proba, labels=best_model.classes_)

print(f"\n✅ Final Accuracy: {acc:.3f}")
print(f"✅ Final F1 (macro): {f1_macro:.3f}")
print(f"✅ Final Log Loss: {ll:.3f}")