In [1]:
import pandas as pd
import numpy as np
from scipy.stats import norm, bernoulli, multinomial, gaussian_kde, shapiro, kstest, poisson

In [2]:
cleavland = pd.read_csv('05-heart+disease/processed.cleveland.data', header=None, encoding='ISO-8859-1')
hungary = pd.read_csv('05-heart+disease/processed.hungarian.data', header=None, encoding='ISO-8859-1')
switzerland = pd.read_csv('05-heart+disease/processed.switzerland.data', header=None, encoding='ISO-8859-1')
va = pd.read_csv('05-heart+disease/processed.va.data', header=None, encoding='ISO-8859-1')

In [3]:
attributes = ["Age", "Sex", "Chest Pain Type", "Resting Blood Pressure", "Cholesterol", "Fasting Blood Sugar", "Resting ECG", "Max Heart Rate", "Exercise Induced Angina", "ST Depression", "Slope", "Number of Major Vessels", "Thal", "Diagnosis of Heart Disease"]

cleavland.columns = attributes
hungary.columns = attributes
switzerland.columns = attributes
va.columns = attributes

In [4]:
df = pd.concat([cleavland, hungary, switzerland, va], ignore_index=True)

In [5]:
df.replace("?", np.nan, inplace=True)
df.dropna(inplace=True)
df = df.apply(pd.to_numeric)

In [None]:
# Initialize the dictionary to store the distribution type for each attribute
attribute_distributions = {}

# Iterate over each attribute to determine its distribution type
for attribute in df.columns:

    unique_values = df[attribute].unique()
    
    if len(unique_values) == 2:
        attribute_distributions[attribute] = 'bernoulli'
    elif len(unique_values) < 10:
        # Test if the attribute is Poisson distributed
        mean = df[attribute].mean()
        ks_test = kstest(df[attribute], 'poisson', args=(mean,))
        if ks_test.pvalue > 0.05:
            attribute_distributions[attribute] = 'poisson'
        else:
            attribute_distributions[attribute] = 'multinomial'
    else:
        # Test if the attribute is normally distributed
        shapiro_test = shapiro(df[attribute])
        if shapiro_test.pvalue > 0.05:
            attribute_distributions[attribute] = 'normal'
        else:
            attribute_distributions[attribute] = 'kde'

In [None]:
classes = df.iloc[:, -1].unique() # M
total = len(df)                   # N
attributes = df.columns           # X_i i \in {1, 2, ...}

Iwi = []        # Table of instances for each class
pwi = []        # a priori probability
pxjIwi = []     # p(X_j|w_i) for each class

for i in range(len(classes)):                   # For every class

    Iwi.append(df[df.iloc[:, -1] == i])         # Append the instances of the class
    pwi.append(len(Iwi[i])/total)               # Append the a priori probability
    
    pxjIwi.append([])                           # Append an empty list for the conditional probability - chat

    for j in range(len(attributes)):        # For every attribute except the last one (target variable)

        if attribute_distributions[attributes[j]] == 'bernoulli':
            theta = Iwi[i][attributes[j]].mean()    # Calculate the mean
            un_pxiIwi = bernoulli(theta)            # Create a Bernoulli distribution

        elif attribute_distributions[attributes[j]] == 'poisson':
            lambda_ = Iwi[i][attributes[j]].mean()  # Calculate the mean
            un_pxiIwi = poisson(lambda_)            # Create a Poisson distribution

        elif attribute_distributions[attributes[j]] == 'multinomial':
            un_pxiIwi = Iwi[i][attributes[j]].value_counts(normalize=True)  # Calculate the probability
            un_pxiIwi = multinomial(1, un_pxiIwi)                        # Create a Multinomial distribution 

        elif attribute_distributions[attributes[j]] == 'normal':
            xbar = Iwi[i][attributes[j]].mean()
            s = Iwi[i][attributes[j]].std()
            un_pxiIwi = norm(xbar, s)             # Create a Normal distribution

        elif attribute_distributions[attributes[j]] == 'kde':
            un_pxiIwi = gaussian_kde(df[attributes[j]]) # Calculate the probability

        pxjIwi[i].append(un_pxiIwi)                    # Append the conditional probability

In [8]:
# print the distribution of each attribute

for i in range(len(classes)):
    print(f'Class {i}')
    for j in range(len(attributes)):
        print(f'{attributes[j]}: {attribute_distributions[attributes[j]]}')
    print()

Class 0
Age: kde
Sex: bernoulli
Chest Pain Type: multinomial
Resting Blood Pressure: kde
Cholesterol: kde
Fasting Blood Sugar: bernoulli
Resting ECG: multinomial
Max Heart Rate: kde
Exercise Induced Angina: bernoulli
ST Depression: kde
Slope: multinomial
Number of Major Vessels: multinomial
Thal: multinomial
Diagnosis of Heart Disease: multinomial

Class 1
Age: kde
Sex: bernoulli
Chest Pain Type: multinomial
Resting Blood Pressure: kde
Cholesterol: kde
Fasting Blood Sugar: bernoulli
Resting ECG: multinomial
Max Heart Rate: kde
Exercise Induced Angina: bernoulli
ST Depression: kde
Slope: multinomial
Number of Major Vessels: multinomial
Thal: multinomial
Diagnosis of Heart Disease: multinomial

Class 2
Age: kde
Sex: bernoulli
Chest Pain Type: multinomial
Resting Blood Pressure: kde
Cholesterol: kde
Fasting Blood Sugar: bernoulli
Resting ECG: multinomial
Max Heart Rate: kde
Exercise Induced Angina: bernoulli
ST Depression: kde
Slope: multinomial
Number of Major Vessels: multinomial
Thal: 

In [42]:
def numtoarray(num, size):
    array = [0] * size
    array[num] = 1
    return array

def diagnostic(age, sex, chest_pain_type, resting_blood_pressure, cholesterol, fasting_blood_sugar, resting_ecg, max_heart_rate, exercise_induced_angina, st_depression, slope, number_of_major_vessels, thal):
    
    thalarray = [0, 0, 0]
    if thal == 3:
        thalarray[0] = 1
    elif thal == 6:
        thalarray[1] = 1
    elif thal == 7:
        thalarray[2] = 1

    Pwis = []

    for clase in range(len(classes)):

        # Tienen que ir adentro porque los de la clase 3 no tienen Chest Pain Type 1
        cptarray = numtoarray(chest_pain_type - 1, Iwi[clase]["Chest Pain Type"].unique().size)
        recgarray = numtoarray(resting_ecg, Iwi[clase]["Resting ECG"].unique().size)
        slopearray = numtoarray(slope - 1, Iwi[clase]["Slope"].unique().size)
        nomvarray = numtoarray(number_of_major_vessels - 1, Iwi[clase]["Number of Major Vessels"].unique().size)
        
        Page = pxjIwi[clase][0].pdf(age)
        Psex = pxjIwi[clase][1].pmf(sex)
        Pcpt = pxjIwi[clase][2].pmf(cptarray)
        Prbp = pxjIwi[clase][3].pdf(resting_blood_pressure)
        Pcho = pxjIwi[clase][4].pdf(cholesterol)
        Pfbs = pxjIwi[clase][5].pmf(fasting_blood_sugar)
        Prec = pxjIwi[clase][6].pmf(recgarray)
        Pmhr = pxjIwi[clase][7].pdf(max_heart_rate)
        Peia = pxjIwi[clase][8].pmf(exercise_induced_angina)
        Pstd = pxjIwi[clase][9].pdf(st_depression)
        Pslo = pxjIwi[clase][10].pmf(slopearray)
        Pnmv = pxjIwi[clase][11].pmf(nomvarray)
        Ptal = pxjIwi[clase][12].pmf(thalarray)

        Pwi = pwi[clase] * Page * Psex * Pcpt * Prbp * Pcho * Pfbs * Prec * Pmhr * Peia * Pstd * Pslo * Pnmv * Ptal
        Pwis.append(Pwi)

        print(f"Diagnosis of heart disease {clase} is {Pwi}")

    argmax = max(Pwis)
    
    print(f"Diagnosis of heart disease is {Pwis.index(argmax)}")



In [45]:
Iwi[0].head()

Unnamed: 0,Age,Sex,Chest Pain Type,Resting Blood Pressure,Cholesterol,Fasting Blood Sugar,Resting ECG,Max Heart Rate,Exercise Induced Angina,ST Depression,Slope,Number of Major Vessels,Thal,Diagnosis of Heart Disease
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
5,56.0,1.0,2.0,120.0,236.0,0.0,0.0,178.0,0.0,0.8,1.0,0.0,3.0,0
7,57.0,0.0,4.0,120.0,354.0,0.0,0.0,163.0,1.0,0.6,1.0,0.0,3.0,0


In [47]:
diagnostic(20, 1, 1, 145, 233, 1, 1, 150, 0, 2.3, 3, 0, 6)

Diagnosis of heart disease 0 is [9.33199339e-19]
Diagnosis of heart disease 1 is [8.18654819e-19]
Diagnosis of heart disease 2 is [5.34511697e-18]
Diagnosis of heart disease 3 is [6.13597612e-18]
Diagnosis of heart disease 4 is [3.03829637e-19]
Diagnosis of heart disease is 3
