# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures,OneHotEncoder
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve, average_precision_score

#from imblearn.over_sampling import SMOTE


from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier

import shap
from sklearn.feature_selection import RFE
import os


# ignore all future warnings
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)



# Load the data
# The path is relative, so the CSV must be in the current working directory.
# Later statements expect a 'DEATH_EVENT' column (the prediction target).
data = pd.read_csv('heart_failure_clinical_records_dataset.csv')

# Improved EDA
def improved_eda(data):
    """Print summary tables and draw exploratory plots for a labelled dataset.

    Prints the frame's info, per-column missing-value counts, summary
    statistics and the normalised class distribution of ``DEATH_EVENT``. It
    then shows a correlation heatmap, one histogram and one box plot per
    numeric column (both split by ``DEATH_EVENT``) and finally prints the
    number of IQR outliers per numeric column.

    Args:
        data: DataFrame containing a ``DEATH_EVENT`` column.

    Returns:
        None. Everything is printed or shown through matplotlib.
    """
    print("Dataset Information:")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would emit a stray "None" line.
    data.info()

    print("\nMissing Values:")
    print(data.isnull().sum())

    print("\nSummary Statistics:")
    print(data.describe())

    print("\nClass Distribution:")
    print(data['DEATH_EVENT'].value_counts(normalize=True))

    num_features = data.select_dtypes(include=[np.number]).columns
    n_features = len(num_features)

    # Correlation heatmap (restricted to numeric columns so a stray text
    # column cannot break the correlation computation)
    plt.figure(figsize=(12, 10))
    sns.heatmap(data[num_features].corr(), annot=True, cmap='coolwarm')
    plt.title('Correlation Heatmap')
    plt.show()

    def make_grid(n_plots, n_cols=3):
        """Return a flat array of axes with exactly enough rows for n_plots."""
        # Ceiling division: no extra empty row when n_plots is a multiple of n_cols.
        n_rows = max(1, -(-n_plots // n_cols))
        _, grid = plt.subplots(n_rows, n_cols, figsize=(20, 5 * n_rows), squeeze=False)
        grid = grid.flatten()
        # Hide the trailing axes that will not receive a plot.
        for spare in grid[n_plots:]:
            spare.set_visible(False)
        return grid

    # Distribution plots for numerical features
    axes = make_grid(n_features)
    for i, col in enumerate(num_features):
        sns.histplot(data=data, x=col, hue='DEATH_EVENT', kde=True, ax=axes[i])
        axes[i].set_title(f'Distribution of {col}')

    plt.tight_layout()
    plt.show()

    # Box plots for numerical features
    axes = make_grid(n_features)
    for i, col in enumerate(num_features):
        sns.boxplot(data=data, x='DEATH_EVENT', y=col, ax=axes[i])
        axes[i].set_title(f'Box Plot of {col} by DEATH_EVENT')

    plt.tight_layout()
    plt.show()

    # Check for outliers using IQR method
    def detect_outliers(df, column):
        """Return the rows of df whose `column` value lies outside 1.5 * IQR."""
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        return df[(df[column] < lower_bound) | (df[column] > upper_bound)]

    print("\nOutliers Detection:")
    for col in num_features:
        outliers = detect_outliers(data, col)
        print(f"Outliers in {col}: {len(outliers)}")

# Run the exploratory analysis on the loaded dataset (prints tables, shows plots).
improved_eda(data)


# Advanced EDA
def plot_feature_distributions(data):
    """Show one histogram (with KDE) per column, coloured by ``DEATH_EVENT``.

    Args:
        data: DataFrame containing a ``DEATH_EVENT`` column; every column of
            the frame gets its own subplot.

    Returns:
        None. The figure is shown through matplotlib.
    """
    n_features = len(data.columns)
    n_cols = 3
    # Ceiling division: no extra empty row when the column count is a multiple of 3.
    n_rows = max(1, -(-n_features // n_cols))
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 5 * n_rows), squeeze=False)
    axes = axes.flatten()

    for i, col in enumerate(data.columns):
        sns.histplot(data=data, x=col, hue='DEATH_EVENT', kde=True, ax=axes[i])
        axes[i].set_title(f'Distribution of {col}')

    # Hide the trailing axes that did not receive a plot.
    for spare in axes[n_features:]:
        spare.set_visible(False)

    plt.tight_layout()
    plt.show()

#plot_feature_distributions(data)
    
# Outlier detection and handling function
def detect_and_handle_outliers(data, columns, method='iqr'):
    """Print the outlying values of each requested column.

    Despite the name, the frame is only inspected: outliers are printed and
    ``data`` is returned unchanged.

    Args:
        data: DataFrame to inspect.
        columns: Iterable of column names to check.
        method: ``'iqr'`` flags values outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``;
            ``'zscore'`` flags values more than three sample standard
            deviations away from the column mean.

    Returns:
        The same ``data`` object that was passed in.

    Raises:
        ValueError: If ``method`` is neither ``'iqr'`` nor ``'zscore'``.
    """
    # Validate up front: an unknown method would otherwise leave the bounds
    # undefined (or silently reuse the bounds of a previous column).
    if method not in ('iqr', 'zscore'):
        raise ValueError(f"Unknown outlier method {method!r}; expected 'iqr' or 'zscore'")

    for col in columns:
        series = data[col]
        if method == 'iqr':
            Q1 = series.quantile(0.25)
            Q3 = series.quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
        else:  # 'zscore'
            center = series.mean()
            spread = series.std()
            lower_bound = center - 3 * spread
            upper_bound = center + 3 * spread

        print(f"Outliers in {col}:")
        print(series[(series < lower_bound) | (series > upper_bound)])

    return data

# Print the IQR outliers of every numeric column; the returned frame is not captured.
detect_and_handle_outliers(data, data.select_dtypes(include=[np.number]).columns, method='iqr')

def one_hot_encoding(df, columns):
    """Replace each listed column with its one-hot indicator columns.

    For every name in ``columns`` the indicator columns (named
    ``<column>_<value>``) are appended on the right and the source column is
    removed.

    Args:
        df: Input DataFrame.
        columns: Iterable of column names to encode.

    Returns:
        A DataFrame with the encoded columns; ``df`` itself is returned when
        ``columns`` is empty.
    """
    for name in columns:
        indicators = pd.get_dummies(df[name], prefix=name)
        widened = pd.concat([df, indicators], axis=1)
        df = widened.drop(columns=name)
    return df

def feature_engineering(data):
    """Return a copy of ``data`` extended with the derived model features.

    Added columns:
        * ``age_group_<label>``: one-hot indicators of the age bucket.
        * ``anemia_diabetes_interaction``: product of ``anaemia`` and ``diabetes``.
        * ``risk_factor``: 1 when the patient has diabetes and also has high
          blood pressure and/or smokes, else 0.

    Args:
        data: DataFrame with ``age``, ``anaemia``, ``diabetes``,
            ``high_blood_pressure`` and ``smoking`` columns.

    Returns:
        A new DataFrame; the input frame is left untouched.
    """
    # Work on a copy so the caller's frame never picks up the temporary
    # 'age_group' column.
    data = data.copy()

    # Bucket age into right-closed intervals (0, 30], (30, 45], ... (75, 100].
    # Ages outside (0, 100] get no bucket, so all their indicators are 0.
    data['age_group'] = pd.cut(
        data['age'],
        bins=[0, 30, 45, 60, 75, 100],
        labels=['Young', 'Middle-aged', 'Senior', 'Elderly', 'Very Elderly'],
    )
    data = one_hot_encoding(data, ['age_group'])

    # Add anemia and diabetes interaction feature
    data['anemia_diabetes_interaction'] = data['anaemia'] * data['diabetes']

    # Binary flag: diabetes combined with high blood pressure and/or smoking.
    # NOTE(review): an earlier comment also mentioned being senior/elderly, but
    # age has never been part of this condition - confirm the intended rule.
    hypertensive_or_smoker = (data['high_blood_pressure'] == 1) | (data['smoking'] == 1)
    data['risk_factor'] = (hypertensive_or_smoker & (data['diabetes'] == 1)).astype(int)

    return data


# Handle Class Imbalance using SMOTE
def handle_class_imbalance(X, y):
    """Oversample the minority class with SMOTE.

    Args:
        X: Feature matrix of the training split.
        y: Target labels aligned with ``X``.

    Returns:
        Tuple ``(X_resampled, y_resampled)`` produced by ``SMOTE.fit_resample``.

    Raises:
        ImportError: If the optional ``imbalanced-learn`` package is missing.
    """
    # Imported here because the module-level import is commented out; without
    # it SMOTE would be an undefined name. Keeping it local means the package
    # is only required when resampling is actually used.
    from imblearn.over_sampling import SMOTE

    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)
    return X_resampled, y_resampled

# Feature Importance Analysis
def compute_feature_importance(name, best_model_clf, model, X):
    """Plot the non-zero feature importances of a fitted pipeline.

    Args:
        name: Label of the model, used in the plot title.
        best_model_clf: Fitted ``Pipeline`` containing the estimator.
        model: The ``dict_models`` entry of the model; ``model[0][0]`` is the
            name of the estimator step inside the pipeline.
        X: Training features. Its ``columns`` label the bars; generic names
            are used when ``X`` has no ``columns`` attribute.

    Returns:
        None. The bar plot is shown through matplotlib.

    Raises:
        ValueError: If the estimator exposes neither ``feature_importances_``
            nor ``coef_``.
    """
    estimator = best_model_clf.named_steps[model[0][0]]

    if hasattr(estimator, 'feature_importances_'):
        importances = np.asarray(estimator.feature_importances_)
    elif hasattr(estimator, 'coef_'):
        # coef_ is 2-D (n_classes_or_1, n_features); a DataFrame column must be 1-D.
        coef = np.atleast_2d(estimator.coef_)
        # Binary case: keep the signed coefficients. Multi-class: average the
        # magnitudes over classes to get one value per feature.
        importances = coef[0] if coef.shape[0] == 1 else np.abs(coef).mean(axis=0)
    else:
        raise ValueError(f"Model {name!r} exposes neither feature_importances_ nor coef_")

    columns = getattr(X, 'columns', None)
    feature_names = list(columns) if columns is not None else [f"feature_{i}" for i in range(len(importances))]

    feature_importance = pd.DataFrame({'feature': feature_names, 'importance': importances})
    # Drop features that contribute nothing. Negative coefficients are kept and
    # the rows are ordered by magnitude; for tree importances (never negative)
    # this is the same as keeping and sorting the positive values.
    feature_importance = feature_importance[feature_importance['importance'] != 0]
    order = feature_importance['importance'].abs().sort_values(ascending=False).index
    feature_importance = feature_importance.loc[order]

    plt.figure(figsize=(10, 6))
    sns.barplot(x='importance', y='feature', data=feature_importance)
    plt.title('Feature Importance for ' + name)
    plt.show()

# Model Interpretability using SHAP
def interpret_model_with_shap(best_model_clf, X_train, X_test):
    """Build a SHAP force plot for the first 30 test rows.

    A ``KernelExplainer`` is fitted on ``best_model_clf.predict_proba`` with
    the first 30 training rows as background data, and the contributions
    towards the first output column (class index 0) are plotted.

    Args:
        best_model_clf: Fitted classifier exposing ``predict_proba``.
        X_train: Training features; only the first 30 rows are used.
        X_test: Test features; only the first 30 rows are explained.

    Returns:
        The object returned by ``shap.force_plot``. Return it from a notebook
        cell (or pass it to ``display``) to render it.
    """
    background = X_train[:30]
    explained = X_test[:30]

    explainer = shap.KernelExplainer(best_model_clf.predict_proba, background)
    shap_values = explainer.shap_values(explained)

    # Depending on the SHAP version, multi-output results come back either as
    # a list with one array per output or as a single array whose last axis
    # indexes the outputs - TODO confirm against the installed version.
    if isinstance(shap_values, list):
        class0_values = shap_values[0]
    else:
        class0_values = shap_values[..., 0]

    # Returned rather than discarded: a discarded plot object is never shown.
    return shap.force_plot(explainer.expected_value[0], class0_values, explained)

# Feature Selection using Recursive Feature Elimination
def perform_feature_selection(X, y, model, n_features_to_select=5):
    """Select features with recursive feature elimination (RFE).

    Args:
        X: DataFrame of candidate features.
        y: Target labels aligned with ``X``.
        model: Estimator handed to ``RFE`` to rank the features.
        n_features_to_select: Number of features to keep.

    Returns:
        The labels of the columns of ``X`` that RFE kept.
    """
    selector = RFE(estimator=model, n_features_to_select=n_features_to_select)
    selector.fit(X, y)
    kept_mask = selector.support_
    return X.columns[kept_mask]

# Function to evaluate model
def evaluate_model(y_true, y_pred, y_prob):
    """Print classification metrics and plot the diagnostic curves.

    Prints the classification report, confusion matrix, ROC AUC and average
    precision, then shows the ROC curve followed by a figure with the
    confusion-matrix heatmap and the precision-recall curve.

    Args:
        y_true: Ground-truth binary labels.
        y_pred: Predicted labels.
        y_prob: Predicted scores/probabilities of the positive class.

    Returns:
        None.
    """
    # Every metric is computed from the arguments only, so the function works
    # for any split and does not depend on a global y_test.
    cm = confusion_matrix(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_prob)
    avg_precision = average_precision_score(y_true, y_prob)
    precision, recall, _ = precision_recall_curve(y_true, y_prob)
    fpr, tpr, _ = roc_curve(y_true, y_prob)

    print("Classification Report:")
    print(classification_report(y_true, y_pred))
    print("\nConfusion Matrix:")
    print(cm)
    print(f"\nROC AUC Score: {roc_auc}")
    print(f"Average Precision Score: {avg_precision}")

    # Plot ROC Curve
    plt.figure(figsize=(6, 4))
    plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC Curve (AUC = {roc_auc:.4f})')
    plt.plot([0, 1], [0, 1], color='gray', linestyle='--')  # chance-level diagonal
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()

    # Confusion matrix and precision-recall curve, side by side
    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')

    plt.subplot(1, 2, 2)
    plt.plot(recall, precision, marker='.')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.show()


# Separate features and target
# Target vector: the DEATH_EVENT column
y = data['DEATH_EVENT']

# Feature matrix: every other column, extended with the engineered features
X = feature_engineering(data.drop(columns=['DEATH_EVENT']))

display(X.head())


# Hyper-parameter grids. Keys follow the Pipeline convention
# '<step name>__<parameter>', where the step name is the first element of the
# matching (name, estimator) tuple in dict_models.
search_space_svc = [{'svc__kernel': ['linear', 'poly'],
                    'svc__gamma': ['scale'],
                    'svc__C': [0.1, 1]}]

search_space_lr = [{'logisticregression__C': [0.1, 1],
                    'logisticregression__penalty': ['l1', 'l2']}]

# NOTE: no 'ridge' step exists in dict_models below, so this grid is unused here.
search_space_ridge = [{'ridge__alpha': [0.1, 1]}]

search_space_decisiontree = [{'decisiontreeclassifier__max_depth': [2, 4, 6]}]
search_space_randomforest = [{'randomforestclassifier__n_estimators': [10,50,100,200],
                                'randomforestclassifier__max_features': [1,5,10]}]

search_space_gradientboosting = [{'gradientboostingclassifier__n_estimators': [10,25,50],
                                'gradientboostingclassifier__max_features': [1, 2, 5,10]}]

search_space_adaboost = [{'adaboostclassifier__n_estimators': [10,25,50]}]

search_space_bagging = [{'baggingclassifier__n_estimators': [10,25,50]}]

search_space_extratrees = [{'extratreesclassifier__n_estimators': [10,25,50]}]

search_space_knn = [{'kneighborsclassifier__n_neighbors': [5, 10, 15,25]}]


# Model registry. Each value is a list of
#   [ (pipeline step name, estimator), parameter grid, needs_scaling flag ].
dict_models = {
    "svc": [('svc', SVC(probability=True)),
            search_space_svc,
            True],

    # liblinear supports both the 'l1' and 'l2' penalties in the grid; the
    # default lbfgs solver rejects 'l1', which made half of the grid fail to fit.
    "logistic": [('logisticregression', LogisticRegression(solver='liblinear')),
                    search_space_lr,
                    True],
    
    "decisiontree": [('decisiontreeclassifier', DecisionTreeClassifier()),
                     search_space_decisiontree,
                     False],
    "randomforest": [('randomforestclassifier', RandomForestClassifier()),
                     search_space_randomforest,
                     False],
    "gradientboosting": [('gradientboostingclassifier', GradientBoostingClassifier()),
                         search_space_gradientboosting,
                         False],
    "adaboost": [('adaboostclassifier', AdaBoostClassifier()),
                 search_space_adaboost,
                 False],
    "bagging": [('baggingclassifier', BaggingClassifier()),
                search_space_bagging, 
                False],
    "extratrees": [('extratreesclassifier', ExtraTreesClassifier()),
                   search_space_extratrees, 
                   False],
    "knn": [('kneighborsclassifier', KNeighborsClassifier()),
            search_space_knn,
            True],
}

# Experiment switches
polynomial = False  # True: add degree-2 polynomial terms of the non-binary features before scaling
subset = False      # True: keep only the columns listed in subset_features
subset_features = X.columns


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Handle class imbalance
#X_train, y_train = handle_class_imbalance(X_train, y_train)

results = {}
already_scaled = False
for name, model in dict_models.items():
    print(f"\nTraining {name}...")

    # Scale once, the first time a model flagged as needing scaling is reached.
    # Every model trained after that point uses the scaled matrices as well.
    if model[2] and not already_scaled:
        if subset:
            X_train = X_train[list(subset_features)]
            X_test = X_test[list(subset_features)]

        # Two-valued columns are treated as binary indicators and passed through
        # unscaled; all other columns are standardised. List comprehensions keep
        # the column order stable between runs (a set difference does not).
        binary_columns_subset = [col for col in X_train.columns if len(X_train[col].unique()) == 2]
        binary_lookup = set(binary_columns_subset)
        numeric_features_subset = [col for col in X_train.columns if col not in binary_lookup]

        standard_scaler = StandardScaler()
        if polynomial:
            poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)
            train_numeric = poly.fit_transform(X_train[numeric_features_subset])
            # transform only: the expansion is defined by the training split
            test_numeric = poly.transform(X_test[numeric_features_subset])
            scaled_names = [f"poly_{i}" for i in range(train_numeric.shape[1])]
        else:
            train_numeric = X_train[numeric_features_subset]
            test_numeric = X_test[numeric_features_subset]
            scaled_names = list(numeric_features_subset)

        # Fit the scaler on the training split only, then apply it to the test split.
        X_train_scaled = standard_scaler.fit_transform(train_numeric)
        X_test_scaled = standard_scaler.transform(test_numeric)

        # Cast the pass-through block to float so mixed bool/int indicator
        # columns do not turn the combined matrix into an object array.
        train_binary = X_train[binary_columns_subset].to_numpy(dtype=float)
        test_binary = X_test[binary_columns_subset].to_numpy(dtype=float)

        # Both splits become DataFrames with identical column order.
        final_columns = scaled_names + binary_columns_subset
        X_train = pd.DataFrame(np.concatenate((X_train_scaled, train_binary), axis=1), columns=final_columns)
        X_test = pd.DataFrame(np.concatenate((X_test_scaled, test_binary), axis=1), columns=final_columns)

        already_scaled = True


    pipe = Pipeline([model[0]])

    grid_search = GridSearchCV(pipe, model[1], cv=5, verbose=0,scoring="roc_auc",return_train_score=True,n_jobs=-1)
    
    # Fit grid search (fit returns the GridSearchCV object itself)
    best_model_clf_ = grid_search.fit(X_train.values, y_train.values)

    print(f"Best hyperparameters {best_model_clf_.best_params_}")

    # get the best estimator
    best_model_clf = best_model_clf_.best_estimator_

    # Compute feature importances
    # NOTE(review): "logisticregression" never matches a dict_models key (the
    # key is "logistic"), so no importance plot is drawn for the logistic model.
    if name in ["logisticregression","randomforest","gradientboosting","adaboost","extratrees"]:
        compute_feature_importance(name,best_model_clf,model,X_train)


    # Evaluate on test set. The model was fitted on plain arrays, so predict on
    # plain arrays too (avoids feature-name mismatch warnings).
    y_pred = best_model_clf.predict(X_test.values)
    y_prob = best_model_clf.predict_proba(X_test.values)[:, 1]
    
    evaluate_model(y_test, y_pred, y_prob)

    # Interpret model using SHAP
    #if name in ["randomforest","gradientboosting","adaboost","extratrees"]:
        #interpret_model_with_shap(best_model_clf, X_train,X_test)
    
    
    results[name] = {
        'model': grid_search.best_estimator_,
        'best_params': grid_search.best_params_,
        'cv_score': grid_search.best_score_,
        'test_score': roc_auc_score(y_test, y_prob)
    }

# Convert the results to a dataframe once, after every model has been trained
results_df = pd.DataFrame(results)

display(results_df)
# Compare models
cv_scores = [result['cv_score'] for result in results.values()]
test_scores = [result['test_score'] for result in results.values()]


# Pick the best model by test ROC AUC and by cross-validated ROC AUC.
# NOTE(review): choosing a model by its test score makes that score an
# optimistic estimate; the cross-validated pick is the unbiased choice.
best_model = max(results, key=lambda x: results[x]['test_score'])
best_model_cv = max(results, key=lambda x: results[x]['cv_score'])
print(f"Best Model cv: {best_model_cv}")
print(f"\nBest Model: {best_model}")

# In [None]:
# Compute feature importance for the best model
# compute_feature_importance expects (name, fitted pipeline, dict_models entry,
# feature frame); the dict_models entry supplies the pipeline step name.
# Only the tree ensembles that the training loop also plots are handled here.
if best_model in ("randomforest", "gradientboosting", "adaboost", "extratrees"):
    compute_feature_importance(best_model, results[best_model]['model'], dict_models[best_model], X_train)
else:
    print(f"Feature importance plot is not available for {best_model}")
