In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report


# Caricamento del dataset
df = pd.read_csv("Heart_Attack_Prediction.csv")

# Mappatura della colonna 'Gender'
df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})

# Encoding di eventuali altre colonne categoriche
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    if col != 'Heart_Attack_Risk':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])

# Definizione dei cluster di feature
lifestyle_features = ['Smoking', 'Alcohol_Consumption', 'Physical_Activity', 'Diet_Score', 'Stress_Level']
clinical_features = ['Cholesterol_Level', 'LDL_Level', 'HDL_Level', 'Systolic_BP', 'Diastolic_BP']

# Funzione per addestrare e valutare i modelli su un sottoinsieme di feature
def train_and_evaluate(X, y):
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Random Forest": RandomForestClassifier(),
        "SVM": SVC(),
        "KNN": KNeighborsClassifier(),
        "Gradient Boosting": GradientBoostingClassifier()
    }

    results = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        results[name] = {
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred),
            "Recall": recall_score(y_test, y_pred),
            "F1 Score": f1_score(y_test, y_pred)
        }
    return pd.DataFrame(results).T

# Target
y = df['Heart_Attack_Risk']

# Addestramento e valutazione per cluster Lifestyle
X_lifestyle = df[lifestyle_features]
results_lifestyle = train_and_evaluate(X_lifestyle, y)

# Addestramento e valutazione per cluster Clinical
X_clinical = df[clinical_features]
results_clinical = train_and_evaluate(X_clinical, y)

# Stampa dei risultati
print("Risultati per il cluster Lifestyle:")
print(results_lifestyle)

print("\nRisultati per il cluster Clinical:")
print(results_clinical)




Risultati per il cluster Lifestyle:
                     Accuracy  Precision    Recall  F1 Score
Logistic Regression  0.526805   0.501650  0.571859  0.534459
Random Forest        0.583274   0.550842  0.664409  0.602319
SVM                  0.500715   0.484038  0.775771  0.596126
KNN                  0.561115   0.533870  0.598947  0.564539
Gradient Boosting    0.558971   0.528460  0.663657  0.588392

Risultati per il cluster Clinical:
                     Accuracy  Precision    Recall  F1 Score
Logistic Regression  0.495711   0.477146  0.644093  0.548191
Random Forest        0.725518   0.695743  0.750188  0.721941
SVM                  0.494996   0.478462  0.702032  0.569076
KNN                  0.654039   0.601920  0.802107  0.687742
Gradient Boosting    0.552538   0.521096  0.715576  0.603044


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Caricamento del dataset
df = pd.read_csv("Heart_Attack_Prediction.csv")

# Mappatura della colonna 'Gender'
df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})

# Encoding di eventuali altre colonne categoriche
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    if col != 'Heart_Attack_Risk':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])

# Definizione dei cluster di feature
lifestyle_features = ['Smoking', 'Alcohol_Consumption', 'Physical_Activity', 'Diet_Score', 'Stress_Level']
clinical_features = ['Cholesterol_Level', 'LDL_Level', 'HDL_Level', 'Systolic_BP', 'Diastolic_BP']

best_params = {
    'n_estimators': 300,
    'max_depth': 25,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'random_state': 42
}

# Funzione per addestrare e stampare classification_report
def train_and_report(X, y, cluster_name):
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Random Forest": RandomForestClassifier(**best_params),
        "SVM": SVC(),
        "KNN": KNeighborsClassifier(),
        "Gradient Boosting": GradientBoostingClassifier()
    }

    print(f"\nClassification Reports for {cluster_name} Cluster:\n")
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(f"--- {name} ---")
        print(classification_report(y_test, y_pred))

# Target
y = df['Heart_Attack_Risk']

# Addestramento e stampa report per cluster Lifestyle
X_lifestyle = df[lifestyle_features]
train_and_report(X_lifestyle, y, "Lifestyle")

# Addestramento e stampa report per cluster Clinical
X_clinical = df[clinical_features]
train_and_report(X_clinical, y, "Clinical")




Classification Reports for Lifestyle Cluster:

--- Logistic Regression ---
              precision    recall  f1-score   support

           0       0.56      0.49      0.52      1469
           1       0.50      0.57      0.53      1329

    accuracy                           0.53      2798
   macro avg       0.53      0.53      0.53      2798
weighted avg       0.53      0.53      0.53      2798

--- Random Forest ---
              precision    recall  f1-score   support

           0       0.63      0.51      0.56      1469
           1       0.55      0.66      0.60      1329

    accuracy                           0.58      2798
   macro avg       0.59      0.59      0.58      2798
weighted avg       0.59      0.58      0.58      2798

--- SVM ---
              precision    recall  f1-score   support

           0       0.55      0.25      0.35      1469
           1       0.48      0.78      0.60      1329

    accuracy                           0.50      2798
   macro avg      

OTTIMIZZATORE BAYESIANO


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from skopt import BayesSearchCV

# Caricamento del dataset
df = pd.read_csv("Heart_Attack_Prediction.csv")

# Mappatura della colonna 'Gender'
df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})

# Encoding di eventuali altre colonne categoriche
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    if col != 'Heart_Attack_Risk':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])

# Selezione delle feature Clinical
clinical_features = ['Cholesterol_Level', 'LDL_Level', 'HDL_Level', 'Systolic_BP', 'Diastolic_BP']
X = df[clinical_features]
y = df['Heart_Attack_Risk']

# Bilanciamento con SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Divisione in train/test
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Spazio di ricerca per BayesSearchCV
search_space = {
    'n_estimators': (50, 300),
    'max_depth': (3, 30),
    'min_samples_split': (2, 10),
    'min_samples_leaf': (1, 10)
}

# Ottimizzazione bayesiana
opt = BayesSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    search_spaces=search_space,
    n_iter=32,
    cv=3,
    scoring='f1',
    random_state=42,
    n_jobs=-1
)

# Addestramento
opt.fit(X_train, y_train)

# Valutazione
y_pred = opt.predict(X_test)
print("Best Parameters Found:", opt.best_params_)
print("\nClassification Report on Test Set:")
print(classification_report(y_test, y_pred))


Best Parameters Found: OrderedDict({'max_depth': 25, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300})

Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.74      0.68      0.71      1469
           1       0.67      0.74      0.71      1329

    accuracy                           0.71      2798
   macro avg       0.71      0.71      0.71      2798
weighted avg       0.71      0.71      0.71      2798

