# 0. Imports

In [None]:
# --- Notebook plotting ---
%matplotlib inline

# --- Standard library ---
from pathlib import Path

# --- Third-party libraries ---
import dagshub
import joblib
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
import seaborn as sns
import io

from imblearn.over_sampling import SMOTE
from skopt import BayesSearchCV
from skopt.plots import plot_objective, plot_histogram

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from scipy.stats import randint

# --- Project-specific imports ---
from src import dataset, preprocess
from src.config import RAW_DATA_DIR, INTERIM_DATA_DIR
from src.modeling import train, predict

# --- MLflow / DagsHub init ---
# mlflow=True asks dagshub to wire MLflow up for this repository, so the
# mlflow.* calls in later cells presumably log to the DagsHub-hosted server.
dagshub.init(
    repo_owner="joscha0610",
    repo_name="earthquake-damage-ml",
    mlflow=True,
)
from mlflow.tracking import MlflowClient

# --- Figure output directory ---
# Relative to the notebook's working directory; created up front so the
# savefig calls in the plotting cells always have a target folder.
FIG_DIR = Path("..") / "reports" / "figures"
FIG_DIR.mkdir(parents=True, exist_ok=True)


# 1. Load Data

In [None]:
# Competition (train/test/labels)
# Reads the raw competition files from RAW_DATA_DIR through the project helper
# and unpacks the result into three names used by the rest of the notebook.
# NOTE(review): that X is the training features, y the labels and test the
# competition test set is inferred from the names and the comment above —
# confirm against src/dataset.py.
X, y, test = dataset.load_competition_raw(RAW_DATA_DIR)

# 2. Preprocess data

### 2.1 One-Hot-Encoding

In [None]:
# Rebind X to whatever preprocess.one_hot_encode returns for the raw features.
# NOTE(review): the input name is reused, so re-running this cell feeds the
# already-processed X back into the helper. Whether a second pass is harmless
# depends on the helper and is not visible here — re-run the load cell first.
X = preprocess.one_hot_encode(X)

### 2.2 Train Test Split
Split the data into 80% training data and 20% validation data.

In [None]:
# Hold out 20% of the rows for validation (test_size=0.2). random_state=42 is
# passed through so the split is intended to be repeatable across runs.
# NOTE(review): whether the split is stratified by damage grade is decided
# inside preprocess.split_train_val — confirm there.
X_train, X_val, y_train, y_val = preprocess.split_train_val(X, y, test_size=0.2, random_state=42)

# 3. Create Pipeline

In [None]:
# Build the modelling pipeline via the project helper (src.modeling.train).
# Later cells address its forest step as `randomforestclassifier` (search-space
# keys and the named_steps lookup), so the helper is expected to expose a step
# under that name — confirm in src/modeling/train.py.
pipe = train.build_rf_pipeline()

# 4. Hyperparameter tuning with Grid Search

### 4.1 Define parameter grid

In [None]:
# param_grid = {'randomforestclassifier__n_estimators': [100, 250, 500, 1000],
#             'randomforestclassifier__min_samples_leaf': [1, 2, 4],
#             'randomforestclassifier__max_features': [20, 25, 30, 35, 40],
#             'randomforestclassifier__max_depth': [10, 25, 50],
#             'randomforestclassifier__min_samples_split': [20, 25, 30, 35],
#             'randomforestclassifier__criterion': ['gini', 'entropy']}

### 4.2 Tune parameters

In [None]:
# # 1. Set up Grid Search
# grid = GridSearchCV(
#     estimator=pipe,
#     param_grid=param_grid,
#     scoring=make_scorer(f1_score, average="micro"),
#     cv=3
# )

# # 2. Create MLflow experiment
# experiment_name = "rf_grid_search"
# mlflow.set_experiment(experiment_name)

# # 3. Train model with Grid Search
# mlflow.autolog(log_models=False)  # we'll log the best model manually

# with mlflow.start_run(run_name="grid_search_rf") as run:
#     # Fit search
#     grid.fit(X_train, y_train["damage_grade"].values.ravel())

#     # Extract best results
#     best_model = grid.best_estimator_
#     best_score = grid.best_score_
#     best_params = grid.best_params_

#     # Log metrics and parameters
#     mlflow.log_metric("best_cv_f1_micro", best_score)
#     mlflow.log_params(best_params)

#     # Log best model explicitly
#     mlflow.sklearn.log_model(
#         sk_model=best_model,
#         artifact_path="sklearn-model",
#         input_example=X_train.head(5),
#         registered_model_name="rf_grid_search_best"
#     )

# # 4. Output results
# print(f"Best parameters: {best_params}")
# print(f"Best CV score: {best_score:.4f}")


# 5. Hyperparameter tuning with Randomized Search

### 5.1 Define parameter distribution

In [None]:
# param_dist = {'randomforestclassifier__n_estimators': randint(50,500),
#               'randomforestclassifier__min_samples_leaf': randint(1,10)}

In [None]:
# # 1) Set up RandomizedSearchCV
# rs = RandomizedSearchCV(
#     estimator=pipe,
#     param_distributions=param_dist,
#     n_iter=30,
#     cv=3,
#     scoring=make_scorer(f1_score, average="micro"),
#     random_state=123,
#     n_jobs=-1,
#     refit=True,
#     return_train_score=False,
# )

# # 3) MLflow experiment + run
# mlflow.set_experiment("rf-randomsearch")
# mlflow.sklearn.autolog(
#     log_models=False,            # avoid duplicate model; we log manually below
#     log_model_signatures=True,
#     log_input_examples=True
# )

# with mlflow.start_run(run_name="random_search_rf"):
#     rs.fit(X_train, y_train["damage_grade"].values.ravel())

#     best_model  = rs.best_estimator_
#     best_score  = float(rs.best_score_)
#     best_params = rs.best_params_

#     # Metrics
#     mlflow.log_metric("best_cv_f1_micro", best_score)
#     val_f1 = float(rs.score(X_val, y_val["damage_grade"].values.ravel()))
#     mlflow.log_metric("val_f1_micro", val_f1)

#     # Params — use a separate namespace to avoid collisions with autolog
#     mlflow.log_params({f"best__{k}": str(v) for k, v in best_params.items()})
#     mlflow.log_dict({k: str(v) for k, v in param_dist.items()}, "random_search_space.json")

#     # Full CV table as artifact
#     if hasattr(rs, "cv_results_"):
#         pd.DataFrame(rs.cv_results_).to_csv("cv_results_random_search.csv", index=False)
#         mlflow.log_artifact("cv_results_random_search.csv")

#     # Log the best model explicitly
#     mlflow.sklearn.log_model(
#         sk_model=best_model,
#         artifact_path="sklearn-model-best",
#         input_example=X_train.head(5)
#     )

# print(f"Best parameters: {best_params}")
# print(f"Best CV score: {best_score:.4f}")
# print(f"Validation f1_micro: {val_f1:.4f}")

# 6. Sequential Model-Based Optimization (SMBO)

### 6.1 Define search space

In [None]:
# search_space = {
#     'randomforestclassifier__n_estimators': (100, 1000),
#     'randomforestclassifier__max_depth': (1, 50),
#     'randomforestclassifier__min_samples_split': (2, 100),
#     'randomforestclassifier__min_samples_leaf': (1, 50),
#     'randomforestclassifier__criterion': ['gini', 'entropy'],
#     'randomforestclassifier__max_features': (20, 39)
#     }

# Active search space for the Bayesian search. The bounds are much narrower
# than the commented-out space above (e.g. 10-20 trees instead of 100-1000),
# presumably to keep a full notebook run short.
# Tuples are (low, high) ranges, lists are categorical choices; every key is
# prefixed with the pipeline step name so the values reach the forest.
_RF_STEP = 'randomforestclassifier'
_rf_dimensions = [
    ('n_estimators', (10, 20)),
    ('max_depth', (1, 10)),
    ('min_samples_split', (2, 20)),
    ('min_samples_leaf', (1, 10)),
    ('criterion', ['gini', 'entropy']),
    ('max_features', (20, 39)),
]
search_space = {f"{_RF_STEP}__{name}": bounds for name, bounds in _rf_dimensions}

In [None]:
mlflow.set_experiment("rf-bayessearch")

# Compatibility shim: NumPy 1.24 removed the deprecated ``np.int`` alias, and
# the fit below fails without it (presumably because the installed
# scikit-optimize still references it). Restore the alias only when it is
# missing, and do it before the run starts so the global patch is visible
# instead of being buried inside the tracked block.
if not hasattr(np, "int"):
    np.int = int

# Flatten the label column once; fit and score both take a 1-D target array.
y_train_labels = y_train["damage_grade"].values.ravel()
y_val_labels = y_val["damage_grade"].values.ravel()

# ---- Bayesian search over the pipeline hyperparameters ----
# 10 optimisation steps, 3-fold CV each, scored with micro-averaged F1.
opt = BayesSearchCV(
    pipe,
    search_space,
    n_iter=10,
    cv=3,
    scoring=make_scorer(f1_score, average="micro"),
    random_state=123,
)

with mlflow.start_run(run_name="bayes_rf"):
    # Fit
    opt.fit(X_train, y_train_labels)

    # Metrics: best cross-validated score plus the score on the held-out split
    best_cv = float(opt.best_score_)
    val_f1 = float(opt.score(X_val, y_val_labels))
    mlflow.log_metric("best_cv_f1_micro", best_cv)
    mlflow.log_metric("val_f1_micro", val_f1)

    # Log search space for reproducibility
    mlflow.log_dict(search_space, "search_space.json")

    # Params of the winning configuration
    mlflow.log_params(opt.best_params_)

    # ---- Full CV results as artifact ----
    # Serialised in memory so no temporary CSV is left in the working directory.
    if hasattr(opt, "cv_results_"):
        cv_results_df = pd.DataFrame(opt.cv_results_)
        csv_buffer = io.StringIO()
        cv_results_df.to_csv(csv_buffer, index=False)
        mlflow.log_text(csv_buffer.getvalue(), artifact_file="cv_results_bayes.csv")

    # Log the best estimator as a clear, separate artifact.
    # Passing registered_model_name also registers it under "rf_bayessearch",
    # so every execution of this cell adds a registry entry.
    best_est = opt.best_estimator_

    mlflow.sklearn.log_model(
        sk_model=best_est,
        artifact_path="sklearn-model-best",
        input_example=(X_train.iloc[:5] if hasattr(X_train, "iloc") else X_train[:5]),
        registered_model_name="rf_bayessearch"
    )

# ---- Print summary ----
print(f"Best CV score: {best_cv:.4f}")
print(f"Validation score: {val_f1:.4f}")
print("Best parameters:", opt.best_params_)


# 7. Plot the optimization process

In [None]:
# plot_objective applies the `dimensions` labels positionally, and
# BayesSearchCV hands its dimensions to scikit-optimize ordered by sorted
# search-space key. Derive the labels from the keys in that same order (with
# the pipeline-step prefix stripped) so each axis carries the name of the
# hyperparameter it actually shows; a hand-written list in any other order
# silently mislabels the plots.
dimension_labels = [param.split("__", 1)[-1] for param in sorted(opt.search_spaces)]

# Partial-dependence plots of the surrogate model from the (single) search space.
_ = plot_objective(opt.optimizer_results_[0], dimensions=dimension_labels)


# Save figure
fig_path = FIG_DIR / "rf_bayesian_optimization.png"
plt.savefig(fig_path, dpi=300)
plt.show()

# 8. Make predictions

In [None]:
# Predict on the validation split through the fitted search object; the
# confusion-matrix cell further down consumes val_preds.
# NOTE(review): presumably this delegates to the refit best estimator, as is
# standard for scikit-learn style search objects — confirm for BayesSearchCV.
val_preds = opt.predict(X_val)

# 9. Feature Importance

In [None]:
# Feature importance
# Read the importances off the forest step of the best pipeline and pair them
# with the training column names.
# NOTE(review): assumes the pipeline passes every column of X_train through
# to the forest unchanged, otherwise the two lengths will not match.
importances = opt.best_estimator_.named_steps['randomforestclassifier'].feature_importances_

# One row per feature, most important first
features_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': importances}
).sort_values('Importance', ascending=False)

# Horizontal bar chart on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(15, 20))
ax.barh(features_df['Feature'], features_df['Importance'], color='skyblue')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('Feature Importances')
ax.invert_yaxis()  # barh draws the first row at the bottom; flip so the top feature is on top

# Save figure
fig_path = FIG_DIR / "rf_feature_importance.png"
fig.savefig(fig_path, dpi=300)
plt.show()


# 10. Confusion matrix

In [None]:
# One list drives both the row/column order of the matrix and the tick labels.
# Without `labels=`, confusion_matrix infers the classes from the data, so a
# grade missing from both y_val and val_preds would shrink the matrix and
# leave the hard-coded tick labels pointing at the wrong cells.
class_labels = [1, 2, 3]
cm = confusion_matrix(
    y_val["damage_grade"].values.ravel(),
    val_preds,
    labels=class_labels,
)

# Draw on an explicit, fresh figure rather than whatever figure is current.
fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
ax.set_title('Confusion matrix Random Forest')

# Save figure
fig_path = FIG_DIR / "rf_confusion_matrix.png"
fig.savefig(fig_path, dpi=300)
plt.show()

# 11. Create submission

In [None]:
# Input / output locations for the submission file
test_values_path = "../data/raw/competition/test_values.csv"
output_path = "../models/submission_rf_test.csv"

# Load the registered model from the MLflow model registry.
# NOTE(review): the URI pins version 2 of "rf_bayessearch"; this is not
# necessarily the model registered by the Bayesian-search cell in this
# session — confirm that version 2 is the intended submission model.
model_uri = "models:/rf_bayessearch/2"
model = mlflow.sklearn.load_model(model_uri)

# Delegate prediction and file writing to the project helper
predict.create_submission(
    model=model,
    test_values_path=test_values_path,
    output_path=output_path,
)