# Classifiers and Advanced Features

In [38]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedShuffleSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

In [14]:
dataset_10 = pd.read_csv('../../data/dataset/processed_datasets/feature_selection/data_encoded_10_features.csv')
dataset_20 = pd.read_csv('../../data/dataset/processed_datasets/feature_selection/data_encoded_20_features.csv')
dataset_all = pd.read_csv('../../data/dataset/processed_datasets/feature_selection/data_encoded_full_dataset.csv')
df_under_10 = pd.read_csv('../../data/dataset/processed_datasets/feature_selection/resample_encoded_10_features.csv')
df_under_20 = pd.read_csv('../../data/dataset/processed_datasets/feature_selection/resample_encoded_20_features.csv')
df_under_all = pd.read_csv('../../data/dataset/processed_datasets/feature_selection/resample_encoded_full_dataset.csv')

## Train Test Splitting

In [15]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# Store the splits in dictionaries
train_sets = {}
test_sets = {}

for df, name in zip([dataset_10, dataset_20, dataset_all, df_under_10, df_under_20, df_under_all], 
                   ['10', '20', 'all', 'under_10', 'under_20', 'under_all']):
    for train_index, test_index in split.split(df, df["HeartDisease_Yes"]):
        train_sets[name] = df.loc[train_index]
        test_sets[name] = df.loc[test_index]

In [16]:
X_train_10 = train_sets['10'].drop("HeartDisease_Yes", axis=1)
y_train_10 = train_sets['10']["HeartDisease_Yes"] 
X_test_10 = test_sets['10'].drop("HeartDisease_Yes", axis=1)
y_test_10 = test_sets['10']["HeartDisease_Yes"] 

X_train_20 = train_sets['20'].drop("HeartDisease_Yes", axis=1)
y_train_20 = train_sets['20']["HeartDisease_Yes"] 
X_test_20 = test_sets['20'].drop("HeartDisease_Yes", axis=1)
y_test_20 = test_sets['20']["HeartDisease_Yes"] 

X_train_all = train_sets['all'].drop("HeartDisease_Yes", axis=1)
y_train_all = train_sets['all']["HeartDisease_Yes"] 
X_test_all = test_sets['all'].drop("HeartDisease_Yes", axis=1)
y_test_all = test_sets['all']["HeartDisease_Yes"] 

X_train_under_10 = train_sets['under_10'].drop("HeartDisease_Yes", axis=1)
y_train_under_10 = train_sets['under_10']["HeartDisease_Yes"] 
X_test_under_10 = test_sets['under_10'].drop("HeartDisease_Yes", axis=1)
y_test_under_10 = test_sets['under_10']["HeartDisease_Yes"] 

X_train_under_20 = train_sets['under_20'].drop("HeartDisease_Yes", axis=1)
y_train_under_20 = train_sets['under_20']["HeartDisease_Yes"] 
X_test_under_20 = test_sets['under_20'].drop("HeartDisease_Yes", axis=1)
y_test_under_20 = test_sets['under_20']["HeartDisease_Yes"] 

X_train_under_all = train_sets['under_all'].drop("HeartDisease_Yes", axis=1)
y_train_under_all = train_sets['under_all']["HeartDisease_Yes"] 
X_test_under_all = test_sets['under_all'].drop("HeartDisease_Yes", axis=1)
y_test_under_all = test_sets['under_all']["HeartDisease_Yes"] 

In [42]:
def income_cat_proportions(data):
    return data["HeartDisease_Yes"].value_counts() / len(data)

# Store the splits and comparison results in dictionaries
train_sets = {}
test_sets = {}
compare_props_dict = {}

for df, name in zip([dataset_10, dataset_20, dataset_all, df_under_10, df_under_20, df_under_all], 
                    ['10', '20', 'all', 'under_10', 'under_20', 'under_all']):

    # Stratified Split
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    for train_index, test_index in split.split(df, df["HeartDisease_Yes"]):
        train_sets[name] = df.loc[train_index]
        strat_test_set = df.loc[test_index]

    # Random Split
    train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

    # Compare proportions
    compare_props = pd.DataFrame({
        "Overall": income_cat_proportions(df),
        "Stratified": income_cat_proportions(strat_test_set),
        "Random": income_cat_proportions(test_set),
    }).sort_index()
    compare_props["Rand. %error"] = 100 * compare_props["Random"] / compare_props["Overall"] - 100
    compare_props["Strat. %error"] = 100 * compare_props["Stratified"] / compare_props["Overall"] - 100

    compare_props_dict[name]= compare_props

# Print the comparison results for each dataset
for name, compare_props in compare_props_dict.items():
    print(f"Comparison for {name} dataset:")
    print(compare_props)
    print(" " * 30)

Comparison for 10 dataset:
                   Overall  Stratified    Random  Rand. %error  Strat. %error
HeartDisease_Yes                                                             
0.0               0.909647    0.909651  0.909684      0.004107       0.000463
1.0               0.090353    0.090349  0.090316     -0.041345      -0.004663
                              
Comparison for 20 dataset:
                   Overall  Stratified    Random  Rand. %error  Strat. %error
HeartDisease_Yes                                                             
0.0               0.909647    0.909651  0.909684      0.004107       0.000463
1.0               0.090353    0.090349  0.090316     -0.041345      -0.004663
                              
Comparison for all dataset:
                   Overall  Stratified    Random  Rand. %error  Strat. %error
HeartDisease_Yes                                                             
0.0               0.909647    0.909651  0.909684      0.004107       0.00046

## Logistic Regression

### Entire dataset sample

In [27]:
# Initialize logistic regression models
log_reg_10 = LogisticRegression(max_iter=1000, random_state=42)
log_reg_20 = LogisticRegression(max_iter=1000, random_state=42)
log_reg_all = LogisticRegression(max_iter=1000, random_state=42)

In [28]:
# Train the models
log_reg_10.fit(X_train_10, y_train_10)
log_reg_20.fit(X_train_20, y_train_20)
log_reg_all.fit(X_train_all, y_train_all)

In [32]:
# Predictions for each dataset
y_pred_10 = log_reg_10.predict(X_test_10)
y_pred_20 = log_reg_20.predict(X_test_20)
y_pred_all = log_reg_all.predict(X_test_all)

In [33]:
# Calculate performance metrics
metrics = {}
for name, y_test, y_pred, model, X_train, y_train in [
    ("10 Features", y_test_10, y_pred_10, log_reg_10, X_train_10, y_train_10),
    ("20 Features", y_test_20, y_pred_20, log_reg_20, X_train_20, y_train_20),
    ("37 Features", y_test_all, y_pred_all, log_reg_all, X_train_all, y_train_all)
]:
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Cross-validate for ROC and AUC
    y_scores = cross_val_predict(model, X_train, y_train, cv=3, method="predict_proba")[:, 1]
    fpr, tpr, _ = roc_curve(y_train, y_scores)
    auc = roc_auc_score(y_train, y_scores)
    
    # Store the results
    metrics[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1,
        "AUC": auc,
        "y_test": y_test,
        "y_pred": y_pred
    }
    metrics[name]["fpr"] = fpr
    metrics[name]["tpr"] = tpr

# Print results
for dataset, metric_values in metrics.items():
    print(f"\nResults for {dataset} Dataset:")
    for metric, value in metric_values.items():
        if metric in ["fpr", "tpr", "y_test", "y_pred"]:
            continue
        print(f"{metric}: {value:.2f}")


Results for 10 Features Dataset:
Accuracy: 0.91
Precision: 0.52
Recall: 0.09
F1-Score: 0.16
AUC: 0.79

Results for 20 Features Dataset:
Accuracy: 0.91
Precision: 0.53
Recall: 0.10
F1-Score: 0.17
AUC: 0.83

Results for 37 Features Dataset:
Accuracy: 0.91
Precision: 0.53
Recall: 0.11
F1-Score: 0.18
AUC: 0.84


### Undersampling

In [34]:
# Initialize logistic regression models
log_reg_undersampled_10 = LogisticRegression(max_iter=1000, random_state=42)
log_reg_undersampled_20 = LogisticRegression(max_iter=1000, random_state=42)
log_reg_undersampled_all = LogisticRegression(max_iter=1000, random_state=42)

In [35]:
# Train the models
log_reg_undersampled_10.fit(X_train_under_10, y_train_under_10)
log_reg_undersampled_20.fit(X_train_under_20, y_train_under_20)
log_reg_undersampled_all.fit(X_train_under_all, y_train_under_all)

In [36]:
# Predictions for each dataset
y_pred_undersampled_10 = log_reg_undersampled_10.predict(X_test_under_10)
y_pred_undersampled_20 = log_reg_undersampled_20.predict(X_test_under_20)
y_pred_undersampled_all = log_reg_undersampled_all.predict(X_test_under_all)

In [37]:
# Calculate performance metrics
metrics = {}
for name, y_test, y_pred, model, X_train, y_train in [
    ("10 Features", y_test_under_10, y_pred_undersampled_10, log_reg_undersampled_10, X_train_under_10, y_train_under_10),
    ("20 Features", y_test_under_20, y_pred_undersampled_20, log_reg_undersampled_20, X_train_under_20, y_train_under_20),
    ("37 Features", y_test_under_all, y_pred_undersampled_all, log_reg_undersampled_all, X_train_under_all, y_train_under_all)
]:
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Cross-validate for ROC and AUC
    y_scores = cross_val_predict(model, X_train, y_train, cv=3, method="predict_proba")[:, 1]
    fpr, tpr, _ = roc_curve(y_train, y_scores)
    auc = roc_auc_score(y_train, y_scores)
    
    # Store the results
    metrics[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1,
        "AUC": auc,
        "y_test": y_test,
        "y_pred": y_pred
    }
    metrics[name]["fpr"] = fpr
    metrics[name]["tpr"] = tpr

# Print results
for dataset, metric_values in metrics.items():
    print(f"\nResults for {dataset} Dataset (Undersampled):")
    for metric, value in metric_values.items():
        if metric in ["fpr", "tpr", "y_test", "y_pred"]:
            continue
        print(f"{metric}: {value:.2f}")


Results for 10 Features Dataset (Undersampled):
Accuracy: 0.71
Precision: 0.74
Recall: 0.66
F1-Score: 0.70
AUC: 0.79

Results for 20 Features Dataset (Undersampled):
Accuracy: 0.75
Precision: 0.75
Recall: 0.77
F1-Score: 0.76
AUC: 0.83

Results for 37 Features Dataset (Undersampled):
Accuracy: 0.76
Precision: 0.75
Recall: 0.78
F1-Score: 0.76
AUC: 0.84


## Random Forest

### Entire Dataset Sample

In [39]:
# Initialize separate RandomForestClassifier instances for each feature set
rf_clf_10 = RandomForestClassifier(random_state=42)
rf_clf_20 = RandomForestClassifier(random_state=42)
rf_clf_all = RandomForestClassifier(random_state=42)


In [40]:
# Train each classifier on the respective training set
rf_clf_10.fit(X_train_10, y_train_10)
rf_clf_20.fit(X_train_20, y_train_20)
rf_clf_all.fit(X_train_all, y_train_all)

In [41]:
# Predictions for each dataset
y_pred_10 = rf_clf_10.predict(X_test_10)
y_pred_20 = rf_clf_20.predict(X_test_20)
y_pred_all = rf_clf_all.predict(X_test_all)

In [None]:
# Calculate performance metrics
metrics = {}
for name, y_test, y_pred, rf_clf, X_train, y_train in [
    ("10 Features", y_test_10, y_pred_10, rf_clf_10, X_train_10, y_train_10),
    ("20 Features", y_test_20, y_pred_20, rf_clf_20, X_train_20, y_train_20),
    ("37 Features", y_test_all, y_pred_all, rf_clf_all, X_train_all, y_train_all)
]:
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Cross-validate for ROC and AUC
    y_scores = cross_val_predict( rf_clf, X_train, y_train, cv=3, method="predict_proba")[:, 1]
    fpr, tpr, _ = roc_curve(y_train, y_scores)
    auc = roc_auc_score(y_train, y_scores)
    
    # Store the results
    metrics[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1,
        "AUC": auc,
        "y_test": y_test,
        "y_pred": y_pred
    }
    metrics[name]["fpr"] = fpr
    metrics[name]["tpr"] = tpr

# Print results
for dataset, metric_values in metrics.items():
    print(f"\nResults for {dataset} Dataset:")
    for metric, value in metric_values.items():
        if metric in ["fpr", "tpr", "y_test", "y_pred"]:
            continue
        print(f"{metric}: {value:.2f}")

### Undersampling

## Decision Tree Classifier

### Entire Dataset Sample

In [17]:
clf = DecisionTreeClassifier()

In [18]:
# Train and evaluate the classifier for each dataset
for X_train, X_test, y_train, y_test, dataset_name in zip(
    [X_train_10, X_train_20, X_train_all],
    [X_test_10, X_test_20, X_test_all],
    [y_train_10, y_train_20, y_train_all],
    [y_test_10, y_test_20, y_test_all],
    ['10 features 1', '20 features', 'All features']
):
    # Train the classifier
    clf.fit(X_train, y_train)

    # Make predictions
    y_pred = clf.predict(X_test)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    tpr = tp / (tp + fn)
    fpr = fp / (fp + tn)

    # Print the results
    print(f"Results for {dataset_name}:")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  TP rate: {tpr:.4f}")
    print(f"  FP rate: {fpr:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  F1-score: {f1:.4f}")
    print(f"  ROC AUC: {roc_auc:.4f}")
    print(" " * 30)

Results for 10 features 1:
  Accuracy: 0.9086
  TP rate: 0.0704
  FP rate: 0.0081
  Precision: 0.4621
  Recall: 0.0704
  F1-score: 0.1222
  ROC AUC: 0.5311
                              
Results for 20 features:
  Accuracy: 0.9028
  TP rate: 0.1256
  FP rate: 0.0200
  Precision: 0.3846
  Recall: 0.1256
  F1-score: 0.1894
  ROC AUC: 0.5528
                              
Results for All features:
  Accuracy: 0.8540
  TP rate: 0.2461
  FP rate: 0.0857
  Precision: 0.2220
  Recall: 0.2461
  F1-score: 0.2335
  ROC AUC: 0.5802
                              


### Undersampling

In [43]:
# Train and evaluate the classifier for each dataset
print("Undersampled Datasets")
for X_train, X_test, y_train, y_test, dataset_name in zip(
    [X_train_under_10, X_train_under_20, X_train_under_all],
    [X_test_under_10, X_test_under_20, X_test_under_all],
    [y_train_under_10, y_train_under_20, y_train_under_all],
    [y_test_under_10, y_test_under_20, y_test_under_all],
    ['Undersampled 10', 'UnderSampled 20', 'Undersampled all']
):
    # Train the classifier
    clf.fit(X_train, y_train)

    # Make predictions
    y_pred = clf.predict(X_test)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    tpr = tp / (tp + fn)
    fpr = fp / (fp + tn)

    # Print the results
    print(f"Results for {dataset_name}:")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  TP rate: {tpr:.4f}")
    print(f"  FP rate: {fpr:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  F1-score: {f1:.4f}")
    print(f"  ROC AUC: {roc_auc:.4f}")
    print("" * 20)

Undersampled Datasets
Results for Undersampled 10:
  Accuracy: 0.7004
  TP rate: 0.6563
  FP rate: 0.2555
  Precision: 0.7198
  Recall: 0.6563
  F1-score: 0.6866
  ROC AUC: 0.7004

Results for UnderSampled 20:
  Accuracy: 0.7109
  TP rate: 0.6827
  FP rate: 0.2610
  Precision: 0.7234
  Recall: 0.6827
  F1-score: 0.7025
  ROC AUC: 0.7109

Results for Undersampled all:
  Accuracy: 0.6655
  TP rate: 0.6638
  FP rate: 0.3328
  Precision: 0.6660
  Recall: 0.6638
  F1-score: 0.6649
  ROC AUC: 0.6655



## KNN Classifier

### Entire dataset sample

In [20]:
# Initialize the kNN classifier with k=5 and train it on each training set
knn_clf_10 = KNeighborsClassifier(n_neighbors=5)
knn_clf_20 = KNeighborsClassifier(n_neighbors=5)
knn_clf_all = KNeighborsClassifier(n_neighbors=5)

# Training for original datasets
knn_clf_10.fit(X_train_10, y_train_10)
knn_clf_20.fit(X_train_20, y_train_20)
knn_clf_all.fit(X_train_all, y_train_all)

In [21]:
# Predictions for each dataset
y_pred_10 = knn_clf_10.predict(X_test_10)
y_pred_20 = knn_clf_20.predict(X_test_20)
y_pred_all = knn_clf_all.predict(X_test_all)

In [22]:
# Calculate accuracy, precision, recall, F1-Score for each dataset
accuracy_10 = accuracy_score(y_test_10, y_pred_10)
precision_10 = precision_score(y_test_10, y_pred_10)
recall_10 = recall_score(y_test_10, y_pred_10)
f1_10 = f1_score(y_test_10, y_pred_10)

accuracy_20 = accuracy_score(y_test_20, y_pred_20)
precision_20 = precision_score(y_test_20, y_pred_20)
recall_20 = recall_score(y_test_20, y_pred_20)
f1_20 = f1_score(y_test_20, y_pred_20)

accuracy_37 = accuracy_score(y_test_all, y_pred_all)
precision_37 = precision_score(y_test_all, y_pred_all)
recall_37 = recall_score(y_test_all, y_pred_all)
f1_37 = f1_score(y_test_all, y_pred_all)

In [23]:
# Cross-validation predictions for ROC and AUC calculation
y_scores_10 = cross_val_predict(knn_clf_10, X_train_10, y_train_10, cv=3, method="predict_proba")[:, 1]
fpr_10, tpr_10, _ = roc_curve(y_train_10, y_scores_10)
roc_auc_10 = roc_auc_score(y_train_10, y_scores_10)

y_scores_20 = cross_val_predict(knn_clf_20, X_train_20, y_train_20, cv=3, method="predict_proba")[:, 1]
fpr_20, tpr_20, _ = roc_curve(y_train_20, y_scores_20)
roc_auc_20 = roc_auc_score(y_train_20, y_scores_20)

y_scores_37 = cross_val_predict(knn_clf_all, X_train_all, y_train_all, cv=3, method="predict_proba")[:, 1]
fpr_37, tpr_37, _ = roc_curve(y_train_all, y_scores_37)
roc_auc_37 = roc_auc_score(y_train_all, y_scores_37)

In [24]:
# Print the results for each dataset
print("Results for 10 Features Dataset:")
print(f"Accuracy: {accuracy_10:.2f}")
print(f"Precision: {precision_10:.2f}")
print(f"Recall: {recall_10:.2f}")
print(f"F1-Score: {f1_10:.2f}")
print(f"AUC: {roc_auc_10:.2f}")
print("\n")

print("Results for 20 Features Dataset:")
print(f"Accuracy: {accuracy_20:.2f}")
print(f"Precision: {precision_20:.2f}")
print(f"Recall: {recall_20:.2f}")
print(f"F1-Score: {f1_20:.2f}")
print(f"AUC: {roc_auc_20:.2f}")
print("\n")

print("Results for All Features Dataset:")
print(f"Accuracy: {accuracy_37:.2f}")
print(f"Precision: {precision_37:.2f}")
print(f"Recall: {recall_37:.2f}")
print(f"F1-Score: {f1_37:.2f}")
print(f"AUC: {roc_auc_37:.2f}")

Results for 10 Features Dataset:
Accuracy: 0.90
Precision: 0.36
Recall: 0.13
F1-Score: 0.19
AUC: 0.64


Results for 20 Features Dataset:
Accuracy: 0.90
Precision: 0.37
Recall: 0.14
F1-Score: 0.20
AUC: 0.70


Results for All Features Dataset:
Accuracy: 0.90
Precision: 0.31
Recall: 0.08
F1-Score: 0.13
AUC: 0.68


### Undersampling