# la Resting ECG de la clase dos es bernoulli! y la estamos tratando como multinomial

# Es mejor que cada una tenga su distribucion

In [2]:
import pandas as pd
import numpy as np
from scipy.stats import norm, bernoulli, multinomial, gaussian_kde, shapiro, kstest, poisson

In [3]:
cleavland = pd.read_csv('05-heart+disease/processed.cleveland.data', header=None, encoding='ISO-8859-1')
hungary = pd.read_csv('05-heart+disease/processed.hungarian.data', header=None, encoding='ISO-8859-1')
switzerland = pd.read_csv('05-heart+disease/processed.switzerland.data', header=None, encoding='ISO-8859-1')
va = pd.read_csv('05-heart+disease/processed.va.data', header=None, encoding='ISO-8859-1')

In [4]:
attributes = ["Age", "Sex", "Chest Pain Type", "Resting Blood Pressure", "Cholesterol", "Fasting Blood Sugar", "Resting ECG", "Max Heart Rate", "Exercise Induced Angina", "ST Depression", "Slope", "Number of Major Vessels", "Thal", "Diagnosis of Heart Disease"]

cleavland.columns = attributes
hungary.columns = attributes
switzerland.columns = attributes
va.columns = attributes

In [5]:
df = pd.concat([cleavland, hungary, switzerland, va], ignore_index=True)

In [6]:
df.replace("?", np.nan, inplace=True)
df.dropna(inplace=True)
df = df.apply(pd.to_numeric)

In [20]:
classes = df.iloc[:, -1].unique() # M
total = len(df)                   # N
attributes = df.columns           # X_i i \in {1, 2, ...}

Iwi = []        # Table of instances for each class
pwi = []        # a priori probability
pxjIwi = []     # p(X_j|w_i) for each class

for i in range(len(classes)):                   # For every class

    Iwi.append(df[df.iloc[:, -1] == classes[i]])         # Append the instances of the class
    pwi.append(len(Iwi[i])/total)               # Append the a priori probability
    
    pxjIwi.append([])                           # Append an empty list for the conditional probability - chat

    for attribute in attributes[:-1]:        # For every attribute except the last one (target variable)

        unique_values = Iwi[i][attribute].unique()

        # Poisson distribution
        if len(unique_values) < 10:
        # Test if the attribute is Poisson distributed
            mean = Iwi[i][attribute].mean()
            ks_test = kstest(Iwi[i][attribute], 'poisson', args=(mean,))
            if ks_test.pvalue > 0.05:
                lambda_ = Iwi[i][attribute].mean()  # Calculate the mean
                un_pxiIwi = poisson(lambda_)            # Create a Poisson distribution

        # Multinomial distribution
            else:
                un_pxiIwi = Iwi[i][attribute].value_counts(normalize=True)  # Calculate the probability
                un_pxiIwi = multinomial(1, un_pxiIwi)                        # Create a Multinomial distribution 

        # Normal distribution
        else: 
            shapiro_test = shapiro(Iwi[i][attribute])
            if shapiro_test.pvalue > 0.05:
                xbar = Iwi[i][attribute].mean()
                s = Iwi[i][attribute].std()
                un_pxiIwi = norm(xbar, s)             # Create a Normal distribution

        # KDE distribution
            else:
                un_pxiIwi = gaussian_kde(Iwi[i][attribute]) # Calculate the probability

        pxjIwi[i].append(un_pxiIwi)                    # Append the conditional probability

In [34]:
def numtoarray(num, size):
    array = [0] * size
    array[num] = 1
    return array

def diagnostic(age, sex, chest_pain_type, resting_blood_pressure, cholesterol, fasting_blood_sugar, resting_ecg, max_heart_rate, exercise_induced_angina, st_depression, slope, number_of_major_vessels, thal):
    
    thalarray = [0, 0, 0]
    if thal == 3:
        thalarray[0] = 1
    elif thal == 6:
        thalarray[1] = 1
    elif thal == 7:
        thalarray[2] = 1

    Pwis = []

    for clase in range(len(classes)):

        # Tienen que ir adentro porque los de la clase 3 no tienen Chest Pain Type 1

        # Se convierten los valores a arrays de 0s y 1s
        sexarray = numtoarray(sex, Iwi[clase]["Sex"].unique().size)
        fbsarray = numtoarray(fasting_blood_sugar, Iwi[clase]["Fasting Blood Sugar"].unique().size)
        eiaarray = numtoarray(exercise_induced_angina, Iwi[clase]["Exercise Induced Angina"].unique().size)

        cptarray = numtoarray(chest_pain_type - 1, Iwi[clase]["Chest Pain Type"].unique().size)
        recgarray = numtoarray(resting_ecg-1, Iwi[clase]["Resting ECG"].unique().size)
        
        slopearray = numtoarray(slope - 1, Iwi[clase]["Slope"].unique().size)
        nomvarray = numtoarray(number_of_major_vessels - 1, Iwi[clase]["Number of Major Vessels"].unique().size)
        
        Page = pxjIwi[clase][0].pdf(age)
        Psex = pxjIwi[clase][1].pmf(sexarray)
        Pcpt = pxjIwi[clase][2].pmf(cptarray)
        Prbp = pxjIwi[clase][3].pdf(resting_blood_pressure)
        Pcho = pxjIwi[clase][4].pdf(cholesterol)
        Pfbs = pxjIwi[clase][5].pmf(fbsarray)
        Prec = pxjIwi[clase][6].pmf(recgarray)
        Pmhr = pxjIwi[clase][7].pdf(max_heart_rate)
        Peia = pxjIwi[clase][8].pmf(eiaarray)
        Pstd = pxjIwi[clase][9].pdf(st_depression)
        Pslo = pxjIwi[clase][10].pmf(slopearray)
        Pnmv = pxjIwi[clase][11].pmf(nomvarray)
        Ptal = pxjIwi[clase][12].pmf(thalarray)

        listdep = [Page, Psex, Pcpt, Prbp, Pcho, Pfbs, Prec, Pmhr, Peia, Pstd, Pslo, Pnmv, Ptal]
        
        for i in range(len(listdep)):
            if type(listdep[i]) == np.ndarray:
                listdep[i] = listdep[i][0]
                if np.isnan(listdep[i]):
                    listdep[i] = 0

        product = 1
        for i in range(len(listdep)):
            product *= listdep[i]

        Pwi = pwi[clase] * product
        Pwis.append(Pwi)

        print(f"Diagnosis of heart disease {clase} is {Pwi}")

    argmax = np.argmax(Pwis)
    print(f"The most likely diagnosis is {argmax}")

In [35]:
Iwi[0].head(1)

Unnamed: 0,Age,Sex,Chest Pain Type,Resting Blood Pressure,Cholesterol,Fasting Blood Sugar,Resting ECG,Max Heart Rate,Exercise Induced Angina,ST Depression,Slope,Number of Major Vessels,Thal,Diagnosis of Heart Disease
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0


In [40]:
diagnostic(63, 1, 1, 145, 233, 1, 2, 150, 0, 2.3, 3, 0, 6)

Diagnosis of heart disease 0 is 2.1504607816211743e-15
Diagnosis of heart disease 1 is 8.648423286501891e-14
Diagnosis of heart disease 2 is 2.6762107320948123e-15
Diagnosis of heart disease 3 is 3.1215439686172804e-14
Diagnosis of heart disease 4 is 2.0428926280977344e-15
The most likely diagnosis is 1


In [24]:
Iwi[2]["Resting ECG"].unique().size

2

In [33]:
numtoarray(2, 3)

[0, 0, 1]

In [31]:
array = [0] * 2
array

[0, 0]

In [23]:
pxjIwi[2][6]

<scipy.stats._multivariate.multinomial_frozen at 0x1688bb23170>