# In [None]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import (
    f1_score,
    roc_auc_score,
    accuracy_score,
    precision_score,
    recall_score,
    mean_squared_error,
)
import joblib


# Column holding the percentage of positive reviews (regression target).
_TARGET = "pct_pos_total"
# Name given to the derived binary label (1 = "good game").
_TARGET_BIN = "target_bin"
# A game counts as "good" when strictly more than this share of reviews is positive.
_GOOD_GAME_THRESHOLD = 70
# Identifier-like columns that must never be used as features.
_ID_COLUMNS = ["appid", "name"]
# Shared seed and hold-out fraction so both models are reproducible.
_RANDOM_STATE = 42
_TEST_SIZE = 0.2


def _split_and_impute(features, target, stratify=None):
    """Split into train/test sets and fill missing feature values.

    The medians used for imputation are computed on the training part only
    and then applied to both parts, so no test-set statistics leak into
    training or evaluation.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test)``.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=_TEST_SIZE,
        random_state=_RANDOM_STATE,
        stratify=stratify,
    )
    train_medians = x_train.median(numeric_only=True)
    return (
        x_train.fillna(train_medians),
        x_test.fillna(train_medians),
        y_train,
        y_test,
    )


def _save_artifact(model, feature_names, path):
    """Persist a fitted model together with its ordered feature names."""
    joblib.dump({"model": model, "features": feature_names}, path)


def main():
    """Train, evaluate and save a classifier and a regressor for review scores.

    Reads ``../data/02_interim/cleaned_top_500.csv`` (relative to the current
    working directory), then:

    * fits a ``RandomForestClassifier`` on the numeric columns to predict
      whether ``pct_pos_total`` exceeds 70, and prints accuracy, precision,
      recall, F1 and ROC-AUC on a stratified 20% hold-out set;
    * fits a ``RandomForestRegressor`` on all columns (object/category columns
      one-hot encoded) to predict ``pct_pos_total`` and prints the hold-out
      RMSE;
    * writes both models, each with its feature-name list, to
      ``../data/03_model`` as ``rf_classifier.pkl`` and ``rf_regressor.pkl``.

    Raises:
        ValueError: If, after dropping rows without a usable target, the
            binary label does not contain both classes (this also covers an
            empty dataset).
    """
    filename = "cleaned_top_500.csv"
    data_path = os.path.join("..", "data", "02_interim", filename)

    data = pd.read_csv(data_path)

    # Coerce the target once, up front, so the classification and regression
    # paths see exactly the same rows; non-numeric entries become NaN and are
    # dropped together with genuinely missing values.
    data[_TARGET] = pd.to_numeric(data[_TARGET], errors="coerce")
    data = data.dropna(subset=[_TARGET])

    y_reg = data[_TARGET]
    y_class = (y_reg > _GOOD_GAME_THRESHOLD).astype(int).rename(_TARGET_BIN)

    # Fail early with a clear message instead of an IndexError further down
    # at ``predict_proba(...)[:, 1]``.
    if y_class.nunique() < 2:
        raise ValueError(
            f"'{_TARGET_BIN}' must contain both classes to train and "
            f"evaluate the classifier; got {sorted(y_class.unique().tolist())}"
        )

    # Shared feature frame: no targets and no identifier columns.
    features = data.drop(
        columns=[_TARGET, _TARGET_BIN, *_ID_COLUMNS], errors="ignore"
    )

    # --- classification -------------------------------------------------
    # The classifier uses numeric columns only.
    x_class = features.select_dtypes(include=[np.number])
    xc_train, xc_test, yc_train, yc_test = _split_and_impute(
        x_class, y_class, stratify=y_class
    )

    clf = RandomForestClassifier(
        random_state=_RANDOM_STATE,
        n_estimators=200,
        n_jobs=-1,
    )
    clf.fit(xc_train, yc_train)

    yc_pred = clf.predict(xc_test)
    # Probability of the positive class, needed for ROC-AUC.
    yc_proba = clf.predict_proba(xc_test)[:, 1]

    clf_results = {
        "Model": "RandomForestClassifier",
        "Accuracy": accuracy_score(yc_test, yc_pred),
        "Precision": precision_score(yc_test, yc_pred, zero_division=0),
        "Recall": recall_score(yc_test, yc_pred, zero_division=0),
        "F1": f1_score(yc_test, yc_pred, zero_division=0),
        "ROC-AUC": roc_auc_score(yc_test, yc_proba),
    }

    print("\n=== WYNIKI KLASYFIKACJI (target_bin) ===")
    for k, v in clf_results.items():
        if k == "Model":
            print(f"{k}: {v}")
        else:
            print(f"{k}: {v:.4f}")

    # --- regression -----------------------------------------------------
    # One-hot encode categorical columns only; numeric columns pass through.
    cat_cols = features.select_dtypes(include=["object", "category"]).columns
    if len(cat_cols) > 0:
        x_reg = pd.get_dummies(features, columns=cat_cols, drop_first=True)
    else:
        x_reg = features

    xr_train, xr_test, yr_train, yr_test = _split_and_impute(x_reg, y_reg)

    reg = RandomForestRegressor(
        random_state=_RANDOM_STATE,
        n_estimators=200,
        n_jobs=-1,
    )
    reg.fit(xr_train, yr_train)

    yr_pred = reg.predict(xr_test)

    # RMSE is reported in the same units as the target (percentage points).
    rmse = np.sqrt(mean_squared_error(yr_test, yr_pred))

    print("\n=== WYNIK REGRESJI (pct_pos_total) ===")
    print(f"RMSE: {rmse:.3f}")

    # --- persist models -------------------------------------------------
    model_dir = os.path.join("..", "data", "03_model")
    os.makedirs(model_dir, exist_ok=True)

    clf_path = os.path.join(model_dir, "rf_classifier.pkl")
    _save_artifact(clf, xc_train.columns.tolist(), clf_path)
    print(f"\nModel klasyfikacyjny zapisany do pliku: {clf_path}")

    reg_path = os.path.join(model_dir, "rf_regressor.pkl")
    _save_artifact(reg, xr_train.columns.tolist(), reg_path)
    print(f"Model regresyjny zapisany do pliku: {reg_path}")


# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()