# 0. Imports

In [None]:
# --- Notebook plotting ---
%matplotlib inline

# --- Standard library ---
from pathlib import Path

# --- Third-party libraries ---
import dagshub
import joblib
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
import seaborn as sns

from imblearn.over_sampling import SMOTE
from skopt import BayesSearchCV
from skopt.plots import plot_objective, plot_histogram
from skopt.space import Real, Categorical, Integer

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

from scipy.stats import randint

import io

# --- Project-specific imports ---
from src import dataset, preprocess
from src.config import RAW_DATA_DIR, INTERIM_DATA_DIR
from src.modeling import train, predict

# --- MLflow / DagsHub init ---
# Initialise the DagsHub integration for this repository with mlflow=True.
# NOTE(review): presumably this points MLflow tracking at the repo's hosted
# tracking server and may prompt for credentials — confirm against the dagshub docs.
dagshub.init(repo_owner="joscha0610", repo_name="earthquake-damage-ml", mlflow=True)
# NOTE(review): MlflowClient is imported here but not used in the cells visible below.
from mlflow.tracking import MlflowClient

# --- Figure output directory ---
# Every figure saved by the plotting cells goes under FIG_DIR. The path is relative,
# so it resolves against the notebook's working directory. Create it (including
# parents) up front; exist_ok=True makes re-running this cell safe.
FIG_DIR = Path("../reports/figures")
FIG_DIR.mkdir(parents=True, exist_ok=True)


# 1. Load Data

In [None]:
# Competition (train/test/labels)
# Load the raw competition files from RAW_DATA_DIR via the project helper and unpack
# them into the training features (X), the labels (y) and the competition test set.
# NOTE(review): y is later indexed with a "damage_grade" column, so it is presumably
# a DataFrame carrying that column — confirm in src/dataset.
X, y, test = dataset.load_competition_raw(RAW_DATA_DIR)

# 2. Preprocess data

### 2.1 One-Hot-Encoding

In [None]:
# One-hot encode the feature frame with the project helper. X is rebound to the
# encoded result, so the raw features are no longer available under this name.
# NOTE(review): `test` is not encoded in this notebook — presumably
# predict.create_submission applies its own preprocessing; confirm.
X = preprocess.one_hot_encode(X)

### 2.2 Train Test Split
Split the data into 80% training data and 20% validation data.

In [None]:
# Hold out 20% of the rows (test_size=0.2) as a validation set. random_state is pinned
# to 42, presumably so the helper produces the same split on every run — confirm in
# src/preprocess.
X_train, X_val, y_train, y_val = preprocess.split_train_val(X, y, test_size=0.2, random_state=42)

### 2.3 Label encoding (1,2,3 -> 0,1,2)

In [None]:
# Shift the damage grades down by one in both label frames (1,2,3 -> 0,1,2),
# presumably because the XGBoost classifier expects zero-based class labels.
# NOTE: this overwrites the column in place, so running the cell a second time
# shifts the labels again — re-run the train/validation split cell first.
for label_frame in (y_train, y_val):
    label_frame["damage_grade"] = label_frame["damage_grade"] - 1

# 3. Create Pipeline

In [None]:
# Build the modelling pipeline with the project helper. The search space and the
# feature-importance cell both address its classifier step by the name "xgbclassifier".
pipe = train.build_xgboost_pipeline()

# 4. Sequential Model-Based Optimization (SMBO)

### 4.1 Define search space

In [None]:
# Wider search space, currently disabled:
# search_space = {
#     'xgbclassifier__learning_rate': Real(0.01, 0.3, prior='log-uniform'),  # learning rate for the weight update after each tree
#     'xgbclassifier__n_estimators': Integer(100, 500),  # number of trees in the model
#     'xgbclassifier__max_depth': Integer(3, 10),  # maximum depth of each tree
#     }

# Active search space. Keys use the "<pipeline step>__<parameter>" form so the search
# can set parameters on the "xgbclassifier" step of the pipeline.
# NOTE(review): n_estimators 1-10 and max_depth 3-5 are far narrower than the disabled
# space above — this looks like a quick smoke-test configuration; confirm before
# using the results of a tuning run.
search_space = {
    'xgbclassifier__learning_rate': Real(0.01, 0.3, prior='log-uniform'),  # learning rate for the weight update after each tree
    'xgbclassifier__n_estimators': Integer(1, 10),  # number of trees in the model
    'xgbclassifier__max_depth': Integer(3, 5),  # maximum depth of each tree
    }

### 4.2 Tune parameters

In [None]:
# Run a Bayesian hyperparameter search over `search_space` and record the outcome
# (metrics, search space, best parameters, full CV table, best model) in MLflow.
mlflow.set_experiment("xgb-bayessearch")

# Workaround for older skopt behavior: restore the `np.int` alias before fitting.
np.int = int

# 1-D label arrays, computed once and shared by fit and validation scoring.
y_train_grades = y_train["damage_grade"].values.ravel()
y_val_grades = y_val["damage_grade"].values.ravel()

# ---- Bayesian hyperparameter search ---------------------------------------
# 32 sampled configurations, each scored with 3-fold CV on micro-averaged F1.
opt = BayesSearchCV(
    estimator=pipe,
    search_spaces=search_space,
    n_iter=32,
    cv=3,
    scoring=make_scorer(f1_score, average="micro"),
    random_state=123
)

with mlflow.start_run(run_name="bayes_xgb"):
    # Fit
    opt.fit(X_train, y_train_grades)

    # ---- Metrics ----
    best_cv = float(opt.best_score_)
    val_f1 = float(opt.score(X_val, y_val_grades))
    mlflow.log_metric("best_cv_f1_micro", best_cv)
    # Log the validation score computed above (the original referenced an
    # undefined name here, which aborted the run with a NameError).
    mlflow.log_metric("val_f1_micro", val_f1)

    # Log search space for reproducibility. The skopt dimension objects are not
    # JSON-serialisable, so store their repr strings instead of the objects.
    mlflow.log_dict(
        {name: repr(dimension) for name, dimension in search_space.items()},
        "search_space.json",
    )

    # ---- Best parameters ----
    mlflow.log_params(opt.best_params_)

    # ---- Full CV results as artifact ----
    # to_csv() without a target returns the CSV text directly.
    cv_results_csv = pd.DataFrame(opt.cv_results_).to_csv(index=False)
    mlflow.log_text(cv_results_csv, artifact_file="cv_results_bayes.csv")

    # ---- Log best model explicitly ----
    best_est = opt.best_estimator_

    # Passing registered_model_name registers the logged model under "xgb_bayessearch".
    mlflow.sklearn.log_model(
        sk_model=best_est,
        artifact_path="sklearn-model-best",
        input_example=(X_train.iloc[:5] if hasattr(X_train, "iloc") else X_train[:5]),
        registered_model_name="xgb_bayessearch"
    )

# ---- Print summary ----
print(f"Best CV score: {best_cv:.4f}")
print(f"Validation score: {val_f1:.4f}")
print("Best parameters:", opt.best_params_)

# 5. Plot the optimization process

In [None]:
# Partial-dependence plot of the search objective (plot_objective is imported in
# the imports cell at the top of the notebook).
# BayesSearchCV orders its dimensions by sorted parameter name, so the axis labels
# must follow that order (learning_rate, max_depth, n_estimators). Deriving them
# from the search space keeps labels and dimensions aligned; the previous
# hard-coded list swapped n_estimators and max_depth.
dimension_labels = [name.split("__", 1)[-1] for name in sorted(search_space)]
_ = plot_objective(opt.optimizer_results_[0], dimensions=dimension_labels)

# Save figure
fig_path = FIG_DIR / "xgb_bayesian_optimization.png"
plt.savefig(fig_path, dpi=300)
plt.show()

# 6. Make predictions

In [None]:
# Predict the validation set with the fitted search object. The model was trained on
# the shifted labels, so the predictions use the 0..2 encoding, not the original 1..3.
val_preds = opt.predict(X_val)

# 7. Feature Importance

In [None]:
# Feature importance
# Read the importances from the fitted "xgbclassifier" step of the best pipeline and
# pair them with the training column names, sorted from most to least important.
# NOTE(review): this assumes the pipeline keeps one model feature per X_train column,
# in the same order — confirm against train.build_xgboost_pipeline.
xgb_step = opt.best_estimator_.named_steps['xgbclassifier']
importances = xgb_step.feature_importances_
features_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': importances}
).sort_values('Importance', ascending=False)

# Horizontal bar chart on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(15, 20))
ax.barh(features_df['Feature'], features_df['Importance'], color='skyblue')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('Feature Importances')
ax.invert_yaxis()  # most important feature ends up at the top

# Save figure
fig_path = FIG_DIR / "xgb_feature_importance.png"
fig.savefig(fig_path, dpi=300)
plt.show()


# 8. Confusion matrix

In [None]:
# Confusion matrix of true vs. predicted validation labels (both in the 0..2 encoding).
true_grades = y_val["damage_grade"].values.ravel()
cm = confusion_matrix(true_grades, val_preds)

# Tick labels show the original 1..3 damage grades rather than the shifted classes.
grade_labels = [1, 2, 3]
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt="d", xticklabels=grade_labels, yticklabels=grade_labels, ax=ax)
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
ax.set_title('Confusion matrix XGBoost')

# Save figure
fig_path = FIG_DIR / "xgb_confusion_matrix.png"
fig.savefig(fig_path, dpi=300)
plt.show()

# 9. Create submission

In [None]:
# Load the registered model and write the competition submission file.
# Every run of the tuning cell registers a new version of "xgb_bayessearch", so a
# URI pinned to version 1 would keep submitting the first model ever registered.
# "latest" resolves to the newest registered version instead.
model_uri = "models:/xgb_bayessearch/latest"
model = mlflow.sklearn.load_model(model_uri)
test_values_path = "../data/raw/competition/test_values.csv"
output_path = "../models/submission_xgb_test.csv"

# Make sure the output directory exists before the submission is written.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

predict.create_submission(model=model, test_values_path=test_values_path, output_path=output_path)