In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from scipy.special import expit
from plotly import express as ex
import plotly.io as pio
import os
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, RidgeClassifierCV, SGDClassifier, PassiveAggressiveClassifier, Perceptron
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.calibration import CalibratedClassifierCV
from sklearn.semi_supervised import LabelSpreading, LabelPropagation
from sklearn.dummy import DummyClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from tabpfn import TabPFNClassifier
from sklearn.metrics import make_scorer, balanced_accuracy_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from imblearn.metrics import specificity_score
from sklearn.model_selection import cross_val_score, cross_validate

## Using Boruta

In [None]:
# Load the Boruta-selected feature matrices and their label vectors.
# NOTE(review): the directory below is an absolute, machine-specific path;
# consider moving it to a configurable location.
boruta_dir = '/home/isadoracosenza/Documentos/ebserh-clinical-outcome/notebooks/PREPROCESSED_FILES/reinternacao/reint_first_int_pickles'

X_train = pd.read_csv(os.path.join(boruta_dir, 'X_train_Boruta_REINT30.csv'))
X_test = pd.read_csv(os.path.join(boruta_dir, 'X_test_Boruta_REINT30.csv'))

# Drop columns that are entirely NaN. The column list is derived from the
# training split only and then applied to both splits, so train and test
# always keep the same feature set. Dropping all-NaN columns independently
# per split could leave the two frames with different columns.
all_nan_columns = X_train.columns[X_train.isna().all(axis=0)]
X_train = X_train.drop(columns=all_nan_columns)
# errors="ignore": tolerate a test file that does not contain one of the columns.
X_test = X_test.drop(columns=all_nan_columns, errors="ignore")

y_train = np.load(os.path.join(boruta_dir, 'y_train_Boruta_REINT30.npy'))
y_test = np.load(os.path.join(boruta_dir, 'y_test_Boruta_REINT30.npy'))

In [None]:
# Fixed seed passed to the estimators below that rely on randomness, so that
# repeated runs of the notebook yield the same scores, ranking and saved models.
RANDOM_SEED = 42

# Candidate classifiers, keyed by the display name used in the result tables.
# NOTE(review): estimators left without a seed either expose no seed parameter
# or are assumed not to use randomness with the settings shown here — confirm
# against the installed library versions.
models = {
    "RandomForestClassifier": RandomForestClassifier(random_state=RANDOM_SEED),
    "ExtraTreesClassifier": ExtraTreesClassifier(random_state=RANDOM_SEED),
    "LGBMClassifier": LGBMClassifier(force_col_wise=True, verbosity=-1, random_state=RANDOM_SEED),
    "XGBClassifier": XGBClassifier(eval_metric='logloss', random_state=RANDOM_SEED),
    "BaggingClassifier": BaggingClassifier(random_state=RANDOM_SEED),
    "QuadraticDiscriminantAnalysis": QuadraticDiscriminantAnalysis(),
    "NuSVC": NuSVC(probability=True, max_iter=10000, random_state=RANDOM_SEED),
    "AdaBoostClassifier": AdaBoostClassifier(random_state=RANDOM_SEED),
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=RANDOM_SEED),
    "RidgeClassifier": RidgeClassifier(max_iter=1000),
    "SVC": SVC(probability=True, max_iter=10000, random_state=RANDOM_SEED),
    "LinearSVC": LinearSVC(max_iter=10000, random_state=RANDOM_SEED),
    "LinearDiscriminantAnalysis": LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto'),
    "RidgeClassifierCV": RidgeClassifierCV(),
    "CalibratedClassifierCV": CalibratedClassifierCV(cv=3),
    "PassiveAggressiveClassifier": PassiveAggressiveClassifier(max_iter=1000, random_state=RANDOM_SEED),
    "LabelSpreading": LabelSpreading(gamma=0.1, max_iter=1000),
    "LabelPropagation": LabelPropagation(gamma=0.1, max_iter=1000),
    "BernoulliNB": BernoulliNB(),
    "GaussianNB": GaussianNB(),
    "SGDClassifier": SGDClassifier(max_iter=1000, random_state=RANDOM_SEED),
    "Perceptron": Perceptron(max_iter=1000, random_state=RANDOM_SEED),
    "NearestCentroid": NearestCentroid(),
    "ExtraTreeClassifier": ExtraTreeClassifier(random_state=RANDOM_SEED),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "DummyClassifier": DummyClassifier(),
    "GradientBoosting": GradientBoostingClassifier(random_state=RANDOM_SEED),
    "CatBoost": CatBoostClassifier(verbose=0, random_seed=RANDOM_SEED),
}

Cross Validation com todos os dados de treino

In [None]:
# 7-fold cross-validation of every candidate model on the Boruta training split.
# Metrics computed on each validation fold; keys become `test_<key>` in the
# dictionary returned by cross_validate.
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, zero_division=0),
    'recall': make_scorer(recall_score, zero_division=0),
    'f1': make_scorer(f1_score, zero_division=0),
    'roc_auc': 'roc_auc'
}

# Scorer key -> column prefix used in the summary table.
metric_labels = {
    'accuracy': 'Accuracy',
    'precision': 'Precision',
    'recall': 'Recall',
    'f1': 'F1',
    'roc_auc': 'ROC AUC',
}

results = []

for name, model in models.items():
    cv = cross_validate(
        model,
        X_train,
        y_train,
        cv=7,
        scoring=scoring,
        return_train_score=False
    )

    # One row per model: mean and standard deviation of each metric across folds.
    row = {'Model': name}
    for key, label in metric_labels.items():
        fold_scores = cv[f'test_{key}']
        row[f'{label} Mean'] = fold_scores.mean()
        row[f'{label} Std'] = fold_scores.std()
    results.append(row)

# Sort on the unrounded F1 means and only then round for presentation;
# rounding first would rank models on 2-decimal values and create
# artificial ties between models whose scores actually differ.
cross_val = (
    pd.DataFrame(results)
    .sort_values(by="F1 Mean", ascending=False, kind="stable")
    .round(2)
)

In [None]:
# Write the Boruta cross-validation summary to an Excel workbook (default export options).
cross_val.to_excel(excel_writer='Predictions_CrossVal_Boruta_REINT.xlsx')

In [None]:
# Fit every candidate on the full Boruta training split, score it on the test
# split, and pickle the five models with the highest test F1.
results = []
trained_models = {}

for name, model in models.items():
    # NOTE: this fits the instance stored in `models` in place, so
    # `trained_models` holds references to those same objects, not copies.
    model.fit(X_train, y_train)
    trained_models[name] = model

    y_pred = model.predict(X_test)

    # Continuous score for ROC AUC: positive-class probability when available,
    # otherwise the decision function squashed through a sigmoid; models that
    # expose neither get no ROC AUC.
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_prob = expit(model.decision_function(X_test))
    else:
        y_prob = None

    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Balanced Accuracy": balanced_accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, zero_division=0),
        "Recall": recall_score(y_test, y_pred, zero_division=0),
        "F1 Score": f1_score(y_test, y_pred, zero_division=0),
        "ROC AUC": roc_auc_score(y_test, y_prob) if y_prob is not None else None,
        "Specificity": specificity_score(y_test, y_pred)
    })

df_results_raw = pd.DataFrame(results)

# Rounded table in the original model order (used by the export and plot cells).
df_results_test = df_results_raw.round(2)

# Rank on the unrounded F1 scores and round afterwards: ranking on 2-decimal
# values would create artificial ties and make the top-5 cut-off arbitrary.
# The stable sort breaks any remaining exact ties by the order in `models`.
df_results_sorted = (
    df_results_raw
    .sort_values(by="F1 Score", ascending=False, kind="stable")
    .round(2)
)

# NOTE(review): the models to keep are selected by their test-set F1, so the
# reported test scores of the saved models are optimistically biased;
# consider selecting on the cross-validation results instead.
top5_models = df_results_sorted["Model"].head(5).tolist()

for name in top5_models:
    with open(f"{name}_boruta_reint.pkl", "wb") as f:
        pickle.dump(trained_models[name], f)

print("✅ Modelos salvos:", top5_models)

df_results_sorted

In [None]:
# Write the Boruta test-set metrics table to an Excel workbook (default export options).
df_results_test.to_excel(excel_writer='Predictions_test_Boruta_REINT.xlsx')

In [None]:
# Scatter plot of test F1 against test ROC AUC, one point per model (Boruta features).
legend_layout = dict(
    orientation="h",      # horizontal legend
    yanchor="bottom",
    y=-1,                 # place it below the plot area
    xanchor="center",
    x=0.5,
    traceorder="normal",
)

fig = ex.scatter(
    df_results_test,
    x='ROC AUC',
    y='F1 Score',
    color='Model',
    title="Análise Comparativa Boruta — Reinternação: F1 score versus ROC AUC score",
)

# Last expression of the cell: update_layout's return value is what gets displayed.
fig.update_layout(width=1200, height=800, legend=legend_layout)

## Using all features

In [None]:
# Load the all-features train/test matrices and their label vectors.
all_features_dir = '/home/isadoracosenza/Documentos/ebserh-clinical-outcome/notebooks/PREPROCESSED_FILES/reinternacao/reint_first_int_pickles'

X_train, X_test = (
    pd.read_csv(f'{all_features_dir}/X_{split}_REINT30.csv')
    for split in ('train', 'test')
)

y_train, y_test = (
    np.load(f'{all_features_dir}/y_{split}_REINT30.npy')
    for split in ('train', 'test')
)

In [None]:
# 7-fold cross-validation of every candidate model on the all-features training split.
# Metrics computed on each validation fold; keys become `test_<key>` in the
# dictionary returned by cross_validate.
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, zero_division=0),
    'recall': make_scorer(recall_score, zero_division=0),
    'f1': make_scorer(f1_score, zero_division=0),
    'roc_auc': 'roc_auc'
}

# Scorer key -> column prefix used in the summary table.
metric_labels = {
    'accuracy': 'Accuracy',
    'precision': 'Precision',
    'recall': 'Recall',
    'f1': 'F1',
    'roc_auc': 'ROC AUC',
}

results = []

for name, model in models.items():
    cv = cross_validate(
        model,
        X_train,
        y_train,
        cv=7,
        scoring=scoring,
        return_train_score=False
    )

    # One row per model: mean and standard deviation of each metric across folds.
    row = {'Model': name}
    for key, label in metric_labels.items():
        fold_scores = cv[f'test_{key}']
        row[f'{label} Mean'] = fold_scores.mean()
        row[f'{label} Std'] = fold_scores.std()
    results.append(row)

# Sort on the unrounded F1 means and only then round for presentation;
# rounding first would rank models on 2-decimal values and create
# artificial ties between models whose scores actually differ.
cross_val = (
    pd.DataFrame(results)
    .sort_values(by="F1 Mean", ascending=False, kind="stable")
    .round(2)
)

In [None]:
# Write the all-features cross-validation summary to an Excel workbook (default export options).
cross_val.to_excel(excel_writer='Predictions_CrossVal_allFeatures_REINT.xlsx')

In [None]:
# Fit every candidate on the full all-features training split, score it on the
# test split, and pickle the five models with the highest test F1.
results = []
trained_models = {}

for name, model in models.items():
    # NOTE: this fits the instance stored in `models` in place, so
    # `trained_models` holds references to those same objects, not copies.
    model.fit(X_train, y_train)
    trained_models[name] = model

    y_pred = model.predict(X_test)

    # Continuous score for ROC AUC: positive-class probability when available,
    # otherwise the decision function squashed through a sigmoid; models that
    # expose neither get no ROC AUC.
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_prob = expit(model.decision_function(X_test))
    else:
        y_prob = None

    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Balanced Accuracy": balanced_accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, zero_division=0),
        "Recall": recall_score(y_test, y_pred, zero_division=0),
        "F1 Score": f1_score(y_test, y_pred, zero_division=0),
        "ROC AUC": roc_auc_score(y_test, y_prob) if y_prob is not None else None,
        "Specificity": specificity_score(y_test, y_pred)
    })

df_results_raw = pd.DataFrame(results)

# Rounded table in the original model order (used by the export and plot cells).
df_results_test = df_results_raw.round(2)

# Rank on the unrounded F1 scores and round afterwards: ranking on 2-decimal
# values would create artificial ties and make the top-5 cut-off arbitrary.
# The stable sort breaks any remaining exact ties by the order in `models`.
df_results_sorted = (
    df_results_raw
    .sort_values(by="F1 Score", ascending=False, kind="stable")
    .round(2)
)

# NOTE(review): the models to keep are selected by their test-set F1, so the
# reported test scores of the saved models are optimistically biased;
# consider selecting on the cross-validation results instead.
top5_models = df_results_sorted["Model"].head(5).tolist()

for name in top5_models:
    with open(f"{name}_allfeatures_reint.pkl", "wb") as f:
        pickle.dump(trained_models[name], f)

print("✅ Modelos salvos:", top5_models)

df_results_sorted

In [None]:
# Write the all-features test-set metrics table to an Excel workbook (default export options).
df_results_test.to_excel(excel_writer='Predictions_test_allFeatures_REINT.xlsx')

In [None]:
# Scatter plot of test F1 against test ROC AUC, one point per model (all features).
legend_layout = dict(
    orientation="h",      # horizontal legend
    yanchor="bottom",
    y=-1,                 # place it below the plot area
    xanchor="center",
    x=0.5,
    traceorder="normal",
)

fig = ex.scatter(
    df_results_test,
    x='ROC AUC',
    y='F1 Score',
    color='Model',
    title="Análise Comparativa (todas as Features) — Reinternação: F1 score versus ROC AUC score",
)

# Last expression of the cell: update_layout's return value is what gets displayed.
fig.update_layout(width=1200, height=800, legend=legend_layout)

## Using literature review features

In [None]:
# Load the literature-review feature matrices and their label vectors.
literature_dir = '/home/isadoracosenza/Documentos/ebserh-clinical-outcome/notebooks/PREPROCESSED_FILES/reinternacao/reint_first_int_pickles'

X_train, X_test = (
    pd.read_csv(f'{literature_dir}/X_{split}_liter_REINT30.csv')
    for split in ('train', 'test')
)

y_train, y_test = (
    np.load(f'{literature_dir}/y_{split}_liter_REINT30.npy')
    for split in ('train', 'test')
)

Cross Validation com todos os dados de treino

In [None]:
# 7-fold cross-validation of every candidate model on the literature-features training split.
# Metrics computed on each validation fold; keys become `test_<key>` in the
# dictionary returned by cross_validate.
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, zero_division=0),
    'recall': make_scorer(recall_score, zero_division=0),
    'f1': make_scorer(f1_score, zero_division=0),
    'roc_auc': 'roc_auc'
}

# Scorer key -> column prefix used in the summary table.
metric_labels = {
    'accuracy': 'Accuracy',
    'precision': 'Precision',
    'recall': 'Recall',
    'f1': 'F1',
    'roc_auc': 'ROC AUC',
}

results = []

for name, model in models.items():
    cv = cross_validate(
        model,
        X_train,
        y_train,
        cv=7,
        scoring=scoring,
        return_train_score=False
    )

    # One row per model: mean and standard deviation of each metric across folds.
    row = {'Model': name}
    for key, label in metric_labels.items():
        fold_scores = cv[f'test_{key}']
        row[f'{label} Mean'] = fold_scores.mean()
        row[f'{label} Std'] = fold_scores.std()
    results.append(row)

# Sort on the unrounded F1 means and only then round for presentation;
# rounding first would rank models on 2-decimal values and create
# artificial ties between models whose scores actually differ.
cross_val = (
    pd.DataFrame(results)
    .sort_values(by="F1 Mean", ascending=False, kind="stable")
    .round(2)
)

In [None]:
# Write the literature-features cross-validation summary to an Excel workbook (default export options).
cross_val.to_excel(excel_writer='Predictions_CrossVal_liter_REINT.xlsx')

In [None]:
# Fit every candidate on the full literature-features training split, score it
# on the test split, and pickle the five models with the highest test F1.
results = []
trained_models = {}

for name, model in models.items():
    # NOTE: this fits the instance stored in `models` in place, so
    # `trained_models` holds references to those same objects, not copies.
    model.fit(X_train, y_train)
    trained_models[name] = model

    y_pred = model.predict(X_test)

    # Continuous score for ROC AUC: positive-class probability when available,
    # otherwise the decision function squashed through a sigmoid; models that
    # expose neither get no ROC AUC.
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_prob = expit(model.decision_function(X_test))
    else:
        y_prob = None

    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Balanced Accuracy": balanced_accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, zero_division=0),
        "Recall": recall_score(y_test, y_pred, zero_division=0),
        "F1 Score": f1_score(y_test, y_pred, zero_division=0),
        "ROC AUC": roc_auc_score(y_test, y_prob) if y_prob is not None else None,
        "Specificity": specificity_score(y_test, y_pred)
    })

df_results_raw = pd.DataFrame(results)

# Rounded table in the original model order (used by the export and plot cells).
df_results_test = df_results_raw.round(2)

# Rank on the unrounded F1 scores and round afterwards: ranking on 2-decimal
# values would create artificial ties and make the top-5 cut-off arbitrary.
# The stable sort breaks any remaining exact ties by the order in `models`.
df_results_sorted = (
    df_results_raw
    .sort_values(by="F1 Score", ascending=False, kind="stable")
    .round(2)
)

# NOTE(review): the models to keep are selected by their test-set F1, so the
# reported test scores of the saved models are optimistically biased;
# consider selecting on the cross-validation results instead.
top5_models = df_results_sorted["Model"].head(5).tolist()

for name in top5_models:
    with open(f"{name}_literature_reint.pkl", "wb") as f:
        pickle.dump(trained_models[name], f)

print("✅ Modelos salvos:", top5_models)

df_results_sorted

In [None]:
# Write the literature-features test-set metrics table to an Excel workbook (default export options).
df_results_test.to_excel(excel_writer='Predictions_test_liter_REINT.xlsx')

In [None]:
# Scatter plot of test F1 against test ROC AUC, one point per model (literature features).
legend_layout = dict(
    orientation="h",      # horizontal legend
    yanchor="bottom",
    y=-1,                 # place it below the plot area
    xanchor="center",
    x=0.5,
    traceorder="normal",
)

fig = ex.scatter(
    df_results_test,
    x='ROC AUC',
    y='F1 Score',
    color='Model',
    title="Análise Comparativa (literatura) — Reinternação: F1 score versus ROC AUC score",
)

# Last expression of the cell: update_layout's return value is what gets displayed.
fig.update_layout(width=1200, height=800, legend=legend_layout)