In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

In [33]:
dataset1 = pd.read_csv('../../data/dataset/feature_selection/data_encoded_10_features.csv')
dataset2 = pd.read_csv('../../data/dataset/feature_selection/data_encoded_20_features.csv')
dataset3 = pd.read_csv('../../data/dataset/feature_selection/data_encoded_41_features.csv')
dataset_re_1 = pd.read_csv('../../data/dataset/feature_selection/resample_encoded_10_features.csv')
dataset_re_2 = pd.read_csv('../../data/dataset/feature_selection/resample_encoded_20_features.csv')
dataset_re_3 = pd.read_csv('../../data/dataset/feature_selection/resample_encoded_41_features.csv')

In [34]:
X1 = dataset1.drop('HeartDisease_Yes', axis=1) 
y1 = dataset1['HeartDisease_Yes']
X2 = dataset2.drop('HeartDisease_Yes', axis=1)
y2 = dataset2['HeartDisease_Yes']
X3 = dataset3.drop('HeartDisease_Yes', axis=1)
y3 = dataset3['HeartDisease_Yes']

In [35]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.3, random_state=42)
X2_train, X2_test, y2_train,y2_test = train_test_split(X2, y2, test_size=0.3, random_state=42)
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=0.3, random_state=42)

In [36]:
clf = DecisionTreeClassifier()

In [37]:
# Train and evaluate the classifier for each dataset
for X_train, X_test, y_train, y_test, dataset_name in zip(
    [X1_train, X2_train, X3_train],
    [X1_test, X2_test, X3_test],
    [y1_train, y2_train, y3_train],
    [y1_test, y2_test, y3_test],
    ['Dataset 1', 'Dataset 2', 'Dataset 3']
):
    # Train the classifier
    clf.fit(X_train, y_train)

    # Make predictions
    y_pred = clf.predict(X_test)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    tpr = tp / (tp + fn)
    fpr = fp / (fp + tn)

    # Print the results
    print(f"Results for {dataset_name}:")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  TP rate: {tpr:.4f}")
    print(f"  FP rate: {fpr:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  F1-score: {f1:.4f}")
    print(f"  ROC AUC: {roc_auc:.4f}")
    print("-" * 30)

Results for Dataset 1:
  Accuracy: 0.9098
  TP rate: 0.0602
  FP rate: 0.0064
  Precision: 0.4828
  Recall: 0.0602
  F1-score: 0.1071
  ROC AUC: 0.5269
------------------------------
Results for Dataset 2:
  Accuracy: 0.9037
  TP rate: 0.1156
  FP rate: 0.0185
  Precision: 0.3810
  Recall: 0.1156
  F1-score: 0.1774
  ROC AUC: 0.5485
------------------------------
Results for Dataset 3:
  Accuracy: 0.8551
  TP rate: 0.2551
  FP rate: 0.0856
  Precision: 0.2273
  Recall: 0.2551
  F1-score: 0.2404
  ROC AUC: 0.5848
------------------------------


In [38]:
X4 = dataset1.drop('HeartDisease_Yes', axis=1) 
y4 = dataset1['HeartDisease_Yes']
X5 = dataset2.drop('HeartDisease_Yes', axis=1)
y5 = dataset2['HeartDisease_Yes']
X6 = dataset3.drop('HeartDisease_Yes', axis=1)
y6 = dataset3['HeartDisease_Yes']

In [39]:
X4_train, X4_test, y4_train, y4_test = train_test_split(X4, y4, test_size=0.3, random_state=42)
X5_train, X5_test, y5_train,y5_test = train_test_split(X5, y5, test_size=0.3, random_state=42)
X6_train, X6_test, y6_train, y6_test = train_test_split(X6, y6, test_size=0.3, random_state=42)

In [40]:
# Train and evaluate the classifier for each dataset
for X_train, X_test, y_train, y_test, dataset_name in zip(
    [X4_train, X5_train, X6_train],
    [X4_test, X5_test, X6_test],
    [y4_train, y5_train, y6_train],
    [y4_test, y5_test, y6_test],
    ['Dataset 4', 'Dataset 5', 'Dataset 6']
):
    # Train the classifier
    clf.fit(X_train, y_train)

    # Make predictions
    y_pred = clf.predict(X_test)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    tpr = tp / (tp + fn)
    fpr = fp / (fp + tn)

    # Print the results
    print(f"Results for {dataset_name}:")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  TP rate: {tpr:.4f}")
    print(f"  FP rate: {fpr:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  F1-score: {f1:.4f}")
    print(f"  ROC AUC: {roc_auc:.4f}")
    print("-" * 30)

Results for Dataset 4:
  Accuracy: 0.9098
  TP rate: 0.0602
  FP rate: 0.0064
  Precision: 0.4828
  Recall: 0.0602
  F1-score: 0.1071
  ROC AUC: 0.5269
------------------------------
Results for Dataset 5:
  Accuracy: 0.9036
  TP rate: 0.1137
  FP rate: 0.0185
  Precision: 0.3783
  Recall: 0.1137
  F1-score: 0.1749
  ROC AUC: 0.5476
------------------------------
Results for Dataset 6:
  Accuracy: 0.8552
  TP rate: 0.2524
  FP rate: 0.0852
  Precision: 0.2262
  Recall: 0.2524
  F1-score: 0.2386
  ROC AUC: 0.5836
------------------------------
