<a href="https://colab.research.google.com/github/EWHA-AI24-Project/Weather-to-Wildfire-Prediction/blob/main/notebooks/model_2/model_tuning_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import inspect

import numpy as np
import pandas as pd
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, roc_auc_score, average_precision_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, StratifiedKFold

# Load the training data, then separate the integer-cast target column
# from the remaining feature columns.
data = pd.read_csv('Data2_train.csv')
y_train = data['FIRE_START_DAY'].astype(int)
X_train = data.drop(columns=['FIRE_START_DAY'])


In [None]:
# Custom scoring function
def custom_score(y_true, y_pred_proba, **kwargs):
    """Return the equally weighted blend of PR AUC and ROC AUC.

    Args:
        y_true: Ground-truth labels, forwarded to both metric functions.
        y_pred_proba: Predicted scores, forwarded to both metric functions.
        **kwargs: Accepted for call compatibility and otherwise ignored.

    Returns:
        0.5 * average precision + 0.5 * ROC AUC.
    """
    blend_weight = 0.5
    average_precision = average_precision_score(y_true, y_pred_proba)
    area_under_roc = roc_auc_score(y_true, y_pred_proba)
    return blend_weight * average_precision + blend_weight * area_under_roc

# Wrap custom_score in a scorer object that is fed predict_proba output.
#
# `needs_proba` was deprecated in scikit-learn 1.4 (slated for removal in 1.6)
# in favour of `response_method`. On a release that no longer declares it, the
# flag would fall into make_scorer's **kwargs and be forwarded to custom_score,
# which accepts **kwargs -- so nothing would fail, but the scorer would rank
# models on hard predict() labels instead of probabilities. Choose whichever
# spelling the installed make_scorer actually supports.
if "response_method" in inspect.signature(make_scorer).parameters:
    custom_scorer = make_scorer(custom_score, response_method="predict_proba")
else:
    # Older scikit-learn: only the legacy flag exists.
    custom_scorer = make_scorer(custom_score, needs_proba=True)

In [None]:
# === Stage 1: randomized hyperparameter search ===

# Candidate values sampled for each random-forest hyperparameter.
param_dist = dict(
    n_estimators=[120, 150, 180, 200, 220],
    max_depth=[8, 10, 15, 20],
    min_samples_split=[8, 10, 12, 15],
    min_samples_leaf=[3, 4, 5],
    max_features=['sqrt', 'log2'],
)

# Base estimator and the stratified, shuffled 5-fold splitter used by the search.
rf = RandomForestClassifier(random_state=42)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Score 90 sampled configurations with the custom scorer; any failing fit
# raises instead of being recorded as a missing score.
random_search = RandomizedSearchCV(
    rf,
    param_dist,
    n_iter=90,
    scoring=custom_scorer,
    cv=cv,
    random_state=42,
    error_score='raise',
    n_jobs=-1,
    verbose=1,
)
random_search.fit(X_train, y_train)

# Report the winning configuration.
best_params = random_search.best_params_
print("üîß Best Params from Random Search:", best_params)

# Fresh forest configured with the winning hyperparameters.
best_rf = RandomForestClassifier(random_state=42, **best_params)

# Splitter for the manual evaluation (same settings as the one used above).
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold ROC AUC and PR AUC, collected in fold order.
roc_auc_scores, pr_auc_scores = [], []
for train_idx, val_idx in cv.split(X_train, y_train):
    X_tr, y_tr = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    # Refit on the training fold and take the second predict_proba column.
    y_val_proba = best_rf.fit(X_tr, y_tr).predict_proba(X_val)[:, 1]

    roc_auc = roc_auc_score(y_val, y_val_proba)
    pr_auc = average_precision_score(y_val, y_val_proba)
    roc_auc_scores.append(roc_auc)
    pr_auc_scores.append(pr_auc)

# Average each metric over the folds, then average the two means.
mean_roc_auc = np.mean(roc_auc_scores)
mean_pr_auc = np.mean(pr_auc_scores)
mean_score = (mean_roc_auc + mean_pr_auc) / 2

# Report the results.
print("üìå Cross-Validated ROC AUC :", mean_roc_auc)
print("üìå Cross-Validated PR AUC  :", mean_pr_auc)
print("üéØ Mean Score (ROC + PR AUC)/2:", mean_score)


Fitting 5 folds for each of 90 candidates, totalling 450 fits
üîß Best Params from Random Search: {'n_estimators': 120, 'min_samples_split': 15, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 10}
üìå Cross-Validated ROC AUC : 0.8194828725603797
üìå Cross-Validated PR AUC  : 0.6590240196514553
üéØ Mean Score (ROC + PR AUC)/2: 0.7392534461059175


In [None]:
# Install scikit-optimize, pinned to the release recorded in this notebook's
# captured install log so the search is reproducible; %pip targets the
# environment of the running kernel.
%pip install -q scikit-optimize==0.10.2

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-25.1.0-py3-none-any.whl.metadata (12 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m107.8/107.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-25.1.0-py3-none-any.whl (26 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-25.1.0 scikit-optimize-0.10.2


In [None]:
# === Stage 2: Bayesian optimisation ===
from skopt import BayesSearchCV
from skopt.space import Integer, Categorical
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, roc_auc_score, average_precision_score

# Hyperparameter search space (per the original note, the ranges were set
# from the randomized-search results).
param_space = dict(
    n_estimators=Integer(100, 200),
    max_depth=Integer(8, 15),
    min_samples_split=Integer(10, 20),
    min_samples_leaf=Integer(3, 6),
    max_features=Categorical(['sqrt', 'log2']),
    bootstrap=Categorical([True, False]),
)

# Stratified, shuffled 5-fold splitter and the base estimator to tune.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rf = RandomForestClassifier(random_state=42)

# 100 optimisation iterations scored with the custom scorer; any failing
# fit raises instead of being recorded as a missing score.
bayes_search = BayesSearchCV(
    estimator=rf,
    search_spaces=param_space,
    n_iter=100,
    scoring=custom_scorer,
    cv=cv,
    random_state=42,
    error_score='raise',
    n_jobs=-1,
    verbose=1,
)

# Run the optimisation.
bayes_search.fit(X_train, y_train)

print("‚úÖ Best CV Score (Custom):", bayes_search.best_score_)
print("üîß Best Parameters:", bayes_search.best_params_)


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [None]:
from sklearn.metrics import roc_auc_score, average_precision_score
import numpy as np

# Fresh forest configured with the parameters chosen by the Bayesian search.
best_params = bayes_search.best_params_
best_rf = RandomForestClassifier(random_state=42, **best_params)

# Per-fold ROC AUC and PR AUC, collected in fold order; `cv` is the
# splitter already defined in an earlier cell.
roc_auc_scores, pr_auc_scores = [], []
for train_idx, val_idx in cv.split(X_train, y_train):
    X_tr, y_tr = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    # Refit on the training fold and take the second predict_proba column.
    y_val_proba = best_rf.fit(X_tr, y_tr).predict_proba(X_val)[:, 1]

    roc_auc = roc_auc_score(y_val, y_val_proba)
    pr_auc = average_precision_score(y_val, y_val_proba)
    roc_auc_scores.append(roc_auc)
    pr_auc_scores.append(pr_auc)

# Average each metric over the folds, then average the two means.
mean_roc_auc = np.mean(roc_auc_scores)
mean_pr_auc = np.mean(pr_auc_scores)
mean_score = (mean_roc_auc + mean_pr_auc) / 2

# Report the results.
print("üìå Cross-Validated ROC AUC :", mean_roc_auc)
print("üìå Cross-Validated PR AUC  :", mean_pr_auc)
print("üéØ Mean Score (ROC + PR AUC)/2:", mean_score)


üìå Cross-Validated ROC AUC : 0.8178557332959564
üìå Cross-Validated PR AUC  : 0.6557405739821421
üéØ Mean Score (ROC + PR AUC)/2: 0.7367981536390493
