# Resting ECG for class 2 is Bernoulli, yet we are treating it as multinomial!

# It is better for each attribute to have its own distribution.

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import norm, bernoulli, multinomial, gaussian_kde, shapiro, kstest, poisson
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn import metrics

from sklearn.metrics import classification_report

In [2]:
# Load the four processed heart-disease subsets. The files have no header
# row, so columns are numbered here and named in a later step.
cleavland, hungary, switzerland, va = (
    pd.read_csv(path, header=None, encoding='ISO-8859-1')
    for path in (
        '05-heart+disease/processed.cleveland.data',
        '05-heart+disease/processed.hungarian.data',
        '05-heart+disease/processed.switzerland.data',
        '05-heart+disease/processed.va.data',
    )
)

In [3]:
# Names for the 14 columns of the processed files, in file order.
attributes = [
    "Age",
    "Sex",
    "Chest Pain Type",
    "Resting Blood Pressure",
    "Cholesterol",
    "Fasting Blood Sugar",
    "Resting ECG",
    "Max Heart Rate",
    "Exercise Induced Angina",
    "ST Depression",
    "Slope",
    "Number of Major Vessels",
    "Thal",
    "Diagnosis of Heart Disease",
]

# All four subsets share the same schema, so they get the same labels.
cleavland.columns = hungary.columns = switzerland.columns = va.columns = attributes

In [4]:
# Stack the four subsets row-wise into one frame with a fresh 0..N-1 index.
df = pd.concat(
    (cleavland, hungary, switzerland, va),
    axis=0,
    ignore_index=True,
)

In [None]:
# Clean-up pipeline:
#   1. "?" marks a missing value in the raw files -> turn it into NaN,
#   2. drop every row that has at least one missing value,
#   3. convert every column to a numeric dtype,
#   4. sort by the class column (the last one) so that the per-class tables
#      Iwi[i] built later follow this same order,
#   5. renumber the rows.
df = (
    df.replace("?", np.nan)
    .dropna()
    .apply(pd.to_numeric)
    .sort_values(by=[df.columns[-1]])
    .reset_index(drop=True)
)

In [None]:
class _EmpiricalCategorical:
    """Empirical categorical distribution: P(value) is its relative frequency in the sample."""

    def __init__(self, sample):
        # Plain dict so lookups work for int and float keys alike (5 == 5.0).
        self._frequencies = sample.value_counts(normalize=True).to_dict()

    def pmf(self, value):
        """Return the relative frequency of ``value`` (0.0 if never observed)."""
        return float(self._frequencies.get(value, 0.0))

    # Alias so callers that evaluate every fitted distribution through
    # ``.pdf`` can use this object as well.
    pdf = pmf


def _fit_attribute(sample):
    """Pick and fit one distribution for a single attribute within a single class."""
    if len(sample.unique()) < 10:
        # Few distinct values: treat the attribute as discrete.
        # NOTE(review): the KS test is designed for continuous reference
        # distributions, so its p-value against a Poisson is only approximate.
        rate = sample.mean()
        if kstest(sample, 'poisson', args=(rate,)).pvalue > 0.05:
            return poisson(rate)
        # Poisson rejected: fall back to the observed category frequencies
        # instead of leaving the attribute without a distribution.
        return _EmpiricalCategorical(sample)

    # Many distinct values: treat the attribute as continuous.
    if shapiro(sample).pvalue > 0.05:
        return norm(sample.mean(), sample.std())
    return gaussian_kde(sample)


def get_distribution(df):
    """Fit the naive-Bayes building blocks from a labelled data frame.

    The last column of ``df`` is the class label; every other column is an
    attribute.  For each class and each attribute one distribution is fitted:

    * fewer than 10 distinct values: Poisson if a KS test does not reject it
      at the 5% level, otherwise an empirical categorical distribution;
    * otherwise: Normal if the Shapiro-Wilk test does not reject it at the 5%
      level, otherwise a Gaussian KDE.

    Returns a 5-tuple ``(pxjIwi, pwi, Iwi, classes, attributes)``:

    * ``pxjIwi[i][j]`` - fitted distribution of attribute ``j`` given class ``i``;
    * ``pwi[i]``       - a priori probability of class ``i``;
    * ``Iwi[i]``       - rows of ``df`` that belong to class ``i``;
    * ``classes``      - class labels in order of first appearance in ``df``;
    * ``attributes``   - ``df.columns`` (including the label column).
    """
    labels = df.iloc[:, -1]
    classes = labels.unique()         # M
    total = len(df)                   # N
    attributes = df.columns           # X_i i \in {1, 2, ...}

    Iwi = []        # Table of instances for each class
    pwi = []        # a priori probability
    pxjIwi = []     # p(X_j|w_i) for each class

    for label in classes:
        instances = df[labels == label]
        Iwi.append(instances)
        pwi.append(len(instances) / total)
        # Every attribute except the last column (the target variable).
        pxjIwi.append(
            [_fit_attribute(instances[attribute]) for attribute in attributes[:-1]]
        )

    return pxjIwi, pwi, Iwi, classes, attributes

In [9]:
def probability_categorical(attribute, value, Iwi):
    """Return the relative frequency of ``value`` in column ``attribute`` of ``Iwi``.

    ``Iwi`` is the table of instances of one class, so the result is the
    empirical estimate of P(attribute = value | class).

    Returns 0.0 when ``Iwi`` has no rows (instead of dividing by zero) and
    when ``value`` never occurs in the column.
    """
    nIwi = len(Iwi)
    if nIwi == 0:
        return 0.0
    # Count matches on the boolean mask directly; no filtered copy is needed.
    exitoIwi = int((Iwi[attribute] == value).sum())
    return exitoIwi / nIwi

In [None]:
def _likelihood(distribution, value):
    """Evaluate a fitted distribution at ``value`` via ``pdf`` or, for discrete fits, ``pmf``."""
    evaluate = getattr(distribution, "pdf", None)
    if evaluate is None:
        evaluate = distribution.pmf
    return evaluate(value)


def diagnostic(age, sex, chest_pain_type, resting_blood_pressure, cholesterol, fasting_blood_sugar, resting_ecg, max_heart_rate, exercise_induced_angina, st_depression, slope, number_of_major_vessels, thal, model=None):
    """Classify one patient with naive Bayes and return the winning class index.

    For every class the prior is multiplied by one factor per attribute:
    categorical attributes use their relative frequency within the class,
    the remaining ones use the distribution fitted by ``get_distribution``.

    Parameters
    ----------
    age ... thal
        The 13 attribute values of the patient, in column order.  ``thal`` is
        accepted but not used: its factor is excluded from the product.
    model : tuple, optional
        A ``(pxjIwi, pwi, Iwi, classes, attributes)`` tuple as returned by
        ``get_distribution``.  When omitted, the model is fitted on the
        module-level ``df`` on every call.

    Returns
    -------
    The position (argmax) of the most likely class within ``classes``.
    Also prints that position.
    """
    if model is None:
        model = get_distribution(df)
    pxjIwi, pwi, Iwi, classes, attributes = model

    # (attribute name, observed value, index of its fitted distribution).
    # An index of None marks a categorical attribute, whose probability is
    # the relative frequency inside the class.  The order matches the column
    # order and therefore the order of the multiplications.
    evidence = (
        ("Age", age, 0),
        ("Sex", sex, None),
        ("Chest Pain Type", chest_pain_type, None),
        ("Resting Blood Pressure", resting_blood_pressure, 3),
        ("Cholesterol", cholesterol, 4),
        ("Fasting Blood Sugar", fasting_blood_sugar, None),
        ("Resting ECG", resting_ecg, None),
        ("Max Heart Rate", max_heart_rate, 7),
        ("Exercise Induced Angina", exercise_induced_angina, None),
        ("ST Depression", st_depression, 9),
        ("Slope", slope, None),
        ("Number of Major Vessels", number_of_major_vessels, None),
    )

    Pwis = []

    for clase in range(len(classes)):

        # The categorical probabilities have to be computed per class, inside
        # this loop, because e.g. class 3 has no rows with Chest Pain Type 1.
        Pwi = pwi[clase]
        for name, value, position in evidence:
            if position is None:
                Pwi = Pwi * probability_categorical(name, value, Iwi[clase])
            else:
                Pwi = Pwi * _likelihood(pxjIwi[clase][position], value)

        # gaussian_kde.pdf returns a one-element array; unwrap it to a scalar.
        if isinstance(Pwi, np.ndarray):
            Pwi = Pwi[0]
        Pwis.append(Pwi)

    argmax = np.argmax(Pwis)
    print(f"The most likely diagnosis is {argmax}")

    return argmax

In [2]:
# Attributes whose class-conditional probability is a relative frequency.
_CATEGORICAL_ATTRIBUTES = frozenset({
    "Sex",
    "Chest Pain Type",
    "Fasting Blood Sugar",
    "Resting ECG",
    "Exercise Induced Angina",
    "Slope",
    "Number of Major Vessels",
})

# Attributes left out of the product (the same one diagnostic leaves out).
_EXCLUDED_ATTRIBUTES = frozenset({"Thal"})


def _predict_label(model, features):
    """Return the most likely class label for one row of attribute values.

    ``model`` is the tuple returned by ``get_distribution``; ``features``
    holds the attribute values in column order (label column excluded).
    The factors mirror the ones used by ``diagnostic``.
    """
    pxjIwi, pwi, Iwi, classes, attributes = model

    posteriors = []
    for clase in range(len(classes)):
        score = pwi[clase]
        for position, attribute in enumerate(attributes[:-1]):
            if attribute in _EXCLUDED_ATTRIBUTES:
                continue
            value = features[position]
            if attribute in _CATEGORICAL_ATTRIBUTES:
                factor = probability_categorical(attribute, value, Iwi[clase])
            else:
                fitted = pxjIwi[clase][position]
                # Discrete fits only expose pmf.
                evaluate = fitted.pdf if hasattr(fitted, "pdf") else fitted.pmf
                factor = evaluate(value)
            score = score * factor
        # gaussian_kde.pdf yields a one-element array; reduce to a scalar.
        posteriors.append(float(np.ravel(score)[0]))

    # Return the label itself so it can be compared with the true labels
    # even when the labels are not exactly 0..k-1.
    return classes[int(np.argmax(posteriors))]


def cross_validation(df):
    """Run one stratified 80/20 hold-out evaluation of the classifier.

    Each class (last column of ``df``) is shuffled and split 80% / 20%.
    The model is fitted on the training rows only and then used to predict
    every held-out row, so no test row takes part in the fit.

    Returns the concatenated test rows with an extra last column
    ``"PREDICT PREDICT"`` holding the predicted class label; the true label
    is therefore the second-to-last column.
    """
    labels = df.iloc[:, -1]

    train = []
    test = []
    for label in labels.unique():
        # Shuffle the rows of this class, then cut at 80%.
        shuffled = df[labels == label].sample(frac=1).reset_index(drop=True)
        cut = int(len(shuffled) * 0.8)
        train.append(shuffled.iloc[:cut, :])   # 80% of the rows
        test.append(shuffled.iloc[cut:, :])    # 20% of the rows

    # Works for any number of classes, not just five.
    train_df = pd.concat(train, ignore_index=True)
    alltest = pd.concat(test, ignore_index=True)

    model = get_distribution(train_df)

    predictions = [
        _predict_label(model, row)
        for row in alltest.iloc[:, :-1].itertuples(index=False, name=None)
    ]

    # add the predictions to the dataframe
    alltest["PREDICT PREDICT"] = predictions

    return alltest

In [None]:
def confusion_matrix(df):
    """Evaluate the classifier on a hold-out split and report its metrics.

    Calls ``cross_validation(df)``, then prints and plots the confusion
    matrix and prints balanced accuracy, per-class precision / recall / F1,
    two F-beta scores and the classification report.  Returns ``None``.

    NOTE: this function has the same name as ``sklearn.metrics.confusion_matrix``
    and shadows the imported one, so the sklearn function is reached through
    the ``metrics`` module below.
    """
    alltest = cross_validation(df)

    # True labels
    y_true = alltest.iloc[:, -2]

    # Predicted labels
    y_pred = alltest.iloc[:, -1]

    # Every label that occurs, so matrix rows/columns and plot ticks agree.
    labels = sorted(set(y_true) | set(y_pred))

    # Compute confusion matrix (via the module to avoid calling ourselves).
    cm = metrics.confusion_matrix(y_true, y_pred, labels=labels)

    print(cm)

    # Display the confusion matrix
    fig, ax = plt.subplots(figsize=(4, 4))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot(cmap="OrRd", ax=ax, colorbar=False)
    plt.show()

    # Scalar and per-class scores; printed so they are not silently discarded.
    print("balanced accuracy", metrics.balanced_accuracy_score(y_true, y_pred))
    print("precision", metrics.precision_score(y_true, y_pred, average=None))
    print("recall", metrics.recall_score(y_true, y_pred, average=None))
    print("f1", metrics.f1_score(y_true, y_pred, average=None))

    # F-beta scores; the printed label is the beta value actually used.
    for beta in (1, 0.1):
        score = metrics.fbeta_score(y_true, y_pred, average=None, beta=beta)
        print(f"beta={beta}", score)

    # Display classification report
    report = classification_report(y_true, y_pred)
    print(report)
