In [1]:
""" Setup Imports """
    # numpy for matrix operations
import numpy as np
    # matplotlib for visualization
from matplotlib import pyplot as plt
    # csv for reading the input data
import csv

In [37]:
""" Import Data """
    # Create a list to hold each row of data
import_data = list()
    # Create a counter for the number of rows
rows = 0

# Import the CSV data
with open('ClevelandClinicData.csv', 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar="|")
    for row in reader:
        import_data.append(row)
        rows += 1

In [38]:
# --- Deprocated ---
# Clean Data
row_size = len(import_data[0])
data = np.zeros((rows, row_size))
counter = 0
for row in import_data:
    for i in range(len(row)):
        # Set anything with a question mark to 0
        if row[i] == '?':
            row[i] = 0
        data[counter, i] = row[i]
    counter += 1
# Convert the string data to floats
data = data.astype(np.float)

In [40]:
""" Clean Data """
# Create variable for size of each row
row_size = len(import_data[0])
# Create empty numpy matrix to hold the new data
data = np.zeros((rows, row_size))
# Create a counter to remember how many rows have been added
counter = 0

# For loop - to iterate over each row in dataset
for row in import_data:
    # Boolean to mark if data is bad
    data_good = True
    # For loop - iterate over each item in row
    for i in range(row_size):
        # If statement - check if item is ?
        if row[i] == '?':
            data_good = False
    # If statement - if the data is fine add it to our empty numpy matrix
    if data_good is True:
        data[counter] = row
        # Increment counter
        counter += 1
        
# Convert all of the data in the matrix to floats
data = data.astype(np.float)
# Trim zeros from the end of the data (for points that weren't added)
data = data[:counter]

In [42]:
# Objective - P(HeartDisease|x1, x2, x3, ...)

""" Thresholds:
    0. Age - <65
    1. Sex - Male/Female (1 = male)
    2. Chest Pain - 0/1
    3. Resting BP - >140 - HTN
    4. Serum Cholesterol - >240
    5. Fasting Blood Sugar - <120mg/dl
    6. Resting ECG - !=0 - not normal
    7. Max HR - < 160 - Tachycardia
    8. Induced Angina - 0/1
    9. ST Depression - !=0 - ST Depression
    10. Slope of ST (peak exersise) - !=1 - Slope
    11. CA - Number of major vessels angiogramed != Abnormal
    12. Thalassemia - 3 = normal
    13. Diagnosis of heart disease - 0/1
    
"""
data_processed = data.copy()
# Process Data
for row in data_processed:
    if row[0] > 65:
        row[0] = 1
    else:
        row[0] = 0
    if row[2] != 0:
        row[2] = 1
    if row[3] > 140:
        row[3] = 1
    else:
        row[3] = 0
    if row[4] > 240:
        row[4] = 1
    else:
        row[4] = 0
    if row[6] != 0:
        row[6] = 1
    if row[7] > 160:
        row[7] = 1
    else:
        row[7] = 0
    if row[9] != 0:
        row[9] = 1
    if row[10] != 1:
        row[10] = 1
    else:
        row[10] = 0
    if row[11] != 0:
        row[11] = 1
    if row[12] != 3:
        row[12] = 1
    else:
        row[12] = 0
    if row[13] != 0:
        row[13] = 1

# Calculate P(HeartDisease)
    # This is a probabily using the given sample data
num = 0
hd = 0
p_heartDisease = 0
for row in data:
    if(row[row.shape[0]-1]) > 0:
        hd += 1
    num += 1
p_heartDisease = hd/num

In [44]:
""" Split Dataset """
# Create variable to hold heart disease data
hd_data = np.zeros((hd, data.shape[1]))
# Create variable to hold no-heart disease data
nohd_data = np.zeros((num-hd, data.shape[1]))

# Create variable for count of each
nohd_count = 0
hd_count = 0

# For loop - iterate over each row
for row in data_processed:
    # If statement - last item (label) is 0
    if row[row.shape[0]-1] == 0:
        # Add to no-heart disease set
        nohd_data[nohd_count] = row
        # Increment no-heart disease counter
        nohd_count += 1
    else:
        # Add to heart disease set
        hd_data[hd_count] = row
        # Increment heart disease counter
        hd_count += 1

In [46]:
""" Create method to calculate the probability of heart disease """
def p_hd(input_vector):
    # This vector will keep track of the number of equalities
    count_vector = np.zeros(input_vector.shape[0])
    # Copy the data and remove labels
    hd_data_no_label = hd_data.copy()
    hd_data_no_label = hd_data_no_label[:, :13]
    for row in hd_data_no_label:
        # Increment count vector for every equal entry
        count_vector += np.logical_not(np.logical_xor(row, input_vector))
    # Divide the count vector by the number of entries to get a probability
    count_vector = count_vector/hd_data_no_label.shape[0]
    # Return product of vector times probability of heart disease
    return np.prod(count_vector) * p_heartDisease

In [27]:
def p_no_hd(input_vector):
    count_vector = np.zeros(input_vector.shape[0])
    nohd_data_no_label = nohd_data.copy()
    nohd_data_no_label = nohd_data_no_label[:, :13]
    for row in nohd_data_no_label:
        count_vector += np.logical_not(np.logical_xor(row, input_vector))
    count_vector = count_vector/nohd_data_no_label.shape[0]
    return np.prod(count_vector) * (1-p_heartDisease)

In [28]:
# test_data = np.array('1 1 1 1 1 0 1 0 1 1 1 1 0');
test_data = np.array([1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0])
test_data_nohd = np.array([0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0])
hd = p_hd(test_data)
nohd = p_no_hd(test_data)
print(np.argmax([hd, nohd]))
print("   P Heart Disease: ", hd, "\r\nP No Heart Disease: ", nohd)

0
   P Heart Disease:  0.000175200496361 
P No Heart Disease:  1.90017512529e-06
