In [89]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import random
import matplotlib.cm as cm
import numpy.matlib
from sklearn.metrics import average_precision_score
from matplotlib.colors import ListedColormap
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [90]:
# Read the data
training_data = pd.read_csv("heart_simplified_train.csv")
validation_data = pd.read_csv("heart_simplified_validation.csv")
test_data = pd.read_csv("heart_simplified_test.csv")

# Convert categorical features to numerical
training_data = pd.get_dummies(training_data, columns=["Sex"])
training_data = pd.get_dummies(training_data, columns=["ChestPainType"])
validation_data = pd.get_dummies(validation_data, columns=["Sex"])
validation_data = pd.get_dummies(validation_data, columns=["ChestPainType"])
test_data = pd.get_dummies(test_data, columns=["Sex"])
test_data = pd.get_dummies(test_data, columns=["ChestPainType"])

# Extract the features and labels
training_features = training_data.drop(columns=["HeartDisease"])
training_labels = training_data["HeartDisease"]
validation_features = validation_data.drop(columns=["HeartDisease"])
validation_labels = validation_data["HeartDisease"]
test_features = test_data.drop(columns=["HeartDisease"])
test_labels = test_data["HeartDisease"]

In [91]:
def randomForests(trainingFeatures, trainingLabels, criterion = "gini", max_features = "auto", max_depth = None):
    predictor = RandomForestClassifier(n_estimators=100, criterion=criterion, max_features=max_features, max_depth=max_depth)
    predictor.fit(trainingFeatures, trainingLabels)
    return predictor

def accuracy(predictor, validation_features, validation_labels):
    correct = 0
    count = 0
    probabilities = predictor.predict_proba(validation_features)
    chance_of_correct = []
    correct_map = []
    for idx, valid_label in enumerate(predictor.predict(validation_features)):
        count = count + 1
        chance_of_correct.append(probabilities[idx,valid_label])
        if(valid_label == validation_labels[idx]):
            correct = correct +1
            correct_map.append(1)
            continue
        correct_map.append(0)
    return correct / count, correct, np.mean(chance_of_correct)

predictor = randomForests(training_features, training_labels)
precision, correct, probability_mean = accuracy(predictor, validation_features, validation_labels)
print("Test accuracy: ", precision)

Test accuracy:  0.62


In [92]:
def optimalParams(training_features, training_labels, validation_features, validation_labels):
    criterions = ["gini", "entropy"]
    max_features = ["sqrt", "log2"]
    max_depths = [2, 5, 7, 10, 15]
    best_metric = ("", "", 0, 0, 0)

    for criterion in criterions:
        for max_feature in max_features:
            for max_depth in max_depths:
                predictor = randomForests(training_features, training_labels, criterion, max_feature, max_depth)
                precision, correct, probability_mean = accuracy(predictor, validation_features, validation_labels)

                best_criterion, best_max_feature, best_max_depth, best_correct, best_mean = best_metric
                
                if (probability_mean < best_mean):
                    continue
                if (probability_mean == best_mean and correct < best_correct):
                    continue
                best_metric = (criterion, max_feature, max_depth, correct, probability_mean)

                # d part:
                print(f"criterion = {criterion}; max_depth = {max_depth}; max_features = {max_feature}; accuracy on validation data = {precision}; number of correctly classified validation samples = {correct};" )
    return best_metric

optimal_params = optimalParams(training_features, training_labels, validation_features, validation_labels)
print("Optimal parameters:", optimal_params)

criterion = gini; max_depth = 2; max_features = sqrt; accuracy on validation data = 0.65; number of correctly classified validation samples = 65;
criterion = gini; max_depth = 5; max_features = sqrt; accuracy on validation data = 0.7; number of correctly classified validation samples = 70;
criterion = entropy; max_depth = 5; max_features = sqrt; accuracy on validation data = 0.69; number of correctly classified validation samples = 69;
criterion = entropy; max_depth = 10; max_features = sqrt; accuracy on validation data = 0.68; number of correctly classified validation samples = 68;
Optimal parameters: ('entropy', 'sqrt', 10, 68, 0.7200435155047475)
