In [1]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.preprocessing import label_binarize
import shap
import ipywidgets as widgets
from IPython.display import display


def save_model(model, path):
    """Pickle ``model`` and write it to ``path``.

    The file is opened in binary write mode, so an existing file at
    ``path`` is overwritten.

    Args:
        model: Any picklable object (here: a fitted estimator).
        path: Destination file path.
    """
    with open(path, mode='wb') as sink:
        writer = pickle.Pickler(sink)
        writer.dump(model)

def load_model(path):
    """Read the pickle file at ``path`` and return the object it contains.

    SECURITY: unpickling executes arbitrary code embedded in the file, so
    only load files that come from a trusted source.

    Args:
        path: Path of a file previously written with ``pickle``.

    Returns:
        The unpickled object.
    """
    with open(path, mode='rb') as source:
        reader = pickle.Unpickler(source)
        return reader.load()

def evaluate_models(models_paths, X_test, y_test):
    """Compare pickled classifiers on a held-out test set.

    For every path in ``models_paths`` the model is loaded with
    ``load_model``, the positive-class probability is taken from column 1
    of ``predict_proba(X_test)``, a decision threshold is chosen from an
    81-point grid over [0.1, 0.9], and precision, recall, F1, ROC AUC and
    the confusion matrix are computed at that threshold.

    All output is produced through side effects:

    * one figure with the ROC curve of every model,
    * a summary table (``display``) sorted by recall, descending,
    * a printed per-model report followed by a SHAP summary plot when
      ``shap.TreeExplainer`` accepts the model (otherwise a notice is
      printed and the model is skipped),
    * an interactive threshold slider for the model with the highest
      recall.

    Args:
        models_paths: Iterable of model file paths (``str`` or
            ``os.PathLike``). The file stem is used as the model name.
        X_test: Feature matrix passed to ``predict_proba`` and to SHAP.
        y_test: True labels. The metric calls use scikit-learn defaults,
            so this presumably has to be a binary 0/1 target -- TODO
            confirm against the data set.

    Returns:
        None.

    Raises:
        ValueError: If ``models_paths`` is empty.
    """
    # Function-scope import keeps the block self-contained.
    from pathlib import Path

    # Materialise once so generators work and emptiness can be checked
    # before any figure is created.
    models_paths = list(models_paths)
    if not models_paths:
        raise ValueError("models_paths must contain at least one model path")

    results = []
    # The candidate grid is the same for every model, so build it once.
    thresholds = np.linspace(0.1, 0.9, 81)
    plt.figure(figsize=(10, 8))

    for path in models_paths:
        # ``stem`` accepts str and PathLike, follows the platform's path
        # separators and keeps dotted names such as "model.v2" intact.
        name = Path(path).stem
        model = load_model(path)
        y_proba = model.predict_proba(X_test)[:, 1]

        # NOTE(review): recall cannot increase as the threshold rises, and
        # the comparison is strict, so this search selects the lowest grid
        # value (0.1) whenever recall there is non-zero and otherwise keeps
        # the 0.5 default. Confirm whether a precision/F1 constraint was
        # intended.
        best_threshold, best_recall = 0.5, 0
        for t in thresholds:
            y_pred = (y_proba >= t).astype(int)
            rec = recall_score(y_test, y_pred)
            if rec > best_recall:
                best_recall = rec
                best_threshold = t

        y_pred_best = (y_proba >= best_threshold).astype(int)

        precision = precision_score(y_test, y_pred_best)
        recall = recall_score(y_test, y_pred_best)
        f1 = f1_score(y_test, y_pred_best)
        # AUC and the ROC curve are threshold-independent: they use the
        # raw probabilities, not the thresholded predictions.
        auc = roc_auc_score(y_test, y_proba)
        cm = confusion_matrix(y_test, y_pred_best)

        fpr, tpr, _ = roc_curve(y_test, y_proba)
        plt.plot(fpr, tpr, label=f"{name} (AUC={auc:.2f})")

        results.append({
            'model': name,
            'threshold': best_threshold,
            'recall': recall,
            'precision': precision,
            'f1': f1,
            'auc': auc,
            'confusion_matrix': cm,
            'model_obj': model,
            'y_proba': y_proba,
        })

    # Diagonal reference line for the ROC plot.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves')
    plt.legend()
    plt.grid()
    plt.show()

    # Only scalar metrics go into the summary table.
    non_tabular = {'model_obj', 'y_proba', 'confusion_matrix'}
    df_results = pd.DataFrame(
        [{k: v for k, v in r.items() if k not in non_tabular} for r in results]
    )
    display(df_results.sort_values('recall', ascending=False))

    for res in results:
        print("\n=================", res['model'], "=================")
        print(f"Threshold: {res['threshold']:.2f}, Recall: {res['recall']:.3f}, Precision: {res['precision']:.3f}, F1: {res['f1']:.3f}, AUC: {res['auc']:.3f}")
        print("Confusion matrix:")
        print(res['confusion_matrix'])

        # Best effort: a model that TreeExplainer rejects must not abort
        # the report for the remaining models.
        try:
            explainer = shap.TreeExplainer(res['model_obj'])
            shap_values = explainer.shap_values(X_test)
            shap.summary_plot(shap_values, X_test, show=False)
            plt.title(f"SHAP Summary: {res['model']}")
            plt.show()
        except Exception as e:
            print(f"[!] SHAP not supported for {res['model']}:", e)

    def interactive_threshold(model_result):
        """Attach a slider that recomputes the metrics at a chosen threshold."""
        def update(thresh):
            y_pred = (model_result['y_proba'] >= thresh).astype(int)
            cm = confusion_matrix(y_test, y_pred)
            recall = recall_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)

            print(f"\nThreshold: {thresh:.2f}")
            print("Confusion matrix:")
            print(cm)
            print(f"Recall: {recall:.3f}, Precision: {precision:.3f}, F1: {f1:.3f}")

        return widgets.interact(update, thresh=widgets.FloatSlider(min=0.1, max=0.9, step=0.01, value=model_result['threshold']))

    print("\n========= Interaktywny próg dla najlepszego modelu =========")
    best_model = max(results, key=lambda x: x['recall'])
    interactive_threshold(best_model)


  from pandas.core import (


In [None]:
from sklearn.model_selection import train_test_split

# Load the learning set and preview its first rows. A bare `df.head()` in
# the middle of a cell is evaluated and discarded, so it is displayed
# explicitly.
df = pd.read_csv('learning_set.csv')
display(df.head())

# We assume the target column is named:
target_column = 'Lung_Cancer_Diagnosis'

# Every other column is used as a feature.
X = df.drop(columns=[target_column])
y = df[target_column]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,        # keep the class proportions in both splits
    random_state=42
)

# Evaluate the previously saved models on the held-out 30% split.
evaluate_models(
    models_paths=[
        "models/logreg.pkl",
        "models/xgboost.pkl",
        "models/lightgbm.pkl",
        "models/catboost.pkl"
    ],
    X_test=X_test,
    y_test=y_test
)