# Heart disease

Data source: the UCI Machine Learning Repository heart disease dataset, https://archive.ics.uci.edu/ml/datasets/heart+Disease

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
from IPython.display import display
%matplotlib inline


# required for importing modules from other directories
import os,sys,inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 
from common import misc
from common.data_parser import *
from config import *
from heart_helpers import *

# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8*" and
# later releases no longer ship the old names; use whichever name this install has.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")

OUT_DIR = "out/"
# Figures are saved under OUT_DIR; savefig does not create a missing directory.
os.makedirs(OUT_DIR, exist_ok=True)
SET = "binary" #"binary" or "multi"

In [None]:
def print_styling():
    """Apply the notebook's font sizes to matplotlib's rc settings.

    Default text, axes titles, axis labels, tick labels and the legend are
    set to 14; the figure-level title is set to 26.
    """
    text_size = 14
    axes_size = 14
    figure_title_size = 26

    # (rc group, keyword arguments) pairs, applied in this order.
    rc_settings = (
        ("font", {"size": text_size}),                  # default text size
        ("axes", {"titlesize": axes_size}),             # axes title
        ("axes", {"labelsize": axes_size}),             # x and y labels
        ("xtick", {"labelsize": text_size}),            # x tick labels
        ("ytick", {"labelsize": text_size}),            # y tick labels
        ("legend", {"fontsize": text_size}),            # legend entries
        ("figure", {"titlesize": figure_title_size}),   # figure title
    )
    for group, options in rc_settings:
        plt.rc(group, **options)

In [None]:
from scipy.stats import entropy

def entropy_of_df(labels, base=None):
    """Return the entropy of the empirical distribution of ``labels``.

    The occurrences of each distinct value are counted and the counts are
    handed to ``scipy.stats.entropy``, which normalises them itself.

    Parameters
    ----------
    labels : array-like
        Observed values (e.g. a label column).
    base : float, optional
        Logarithm base, forwarded unchanged to ``scipy.stats.entropy``.

    Returns
    -------
    float
        Entropy of the value distribution.
    """
    # Index 1 of the result holds the per-value counts; the values are not needed.
    occurrences = np.unique(labels, return_counts=True)[1]
    return entropy(occurrences, base=base)

In [None]:
# Load the dataset variant selected by SET. parse_heart_disease is provided by one
# of the star imports above (presumably common.data_parser — TODO confirm).
df = parse_heart_disease(SET)
# Preview the first rows.
df.head()

In [None]:
# Summary statistics of the loaded frame.
df.describe()

In [None]:
# Dataset dimensions and the feature columns (every column except "target").
num_samples, num_cols = df.shape
cols = df.columns
feats = cols.drop("target")

# Only the last expression of a cell is echoed, so the shape has to be displayed
# explicitly; as a bare mid-cell expression it was silently discarded.
display((num_samples, num_cols))
feats

# Histograms

In [None]:
# Histogram of the label column, titled with its entropy in base 2, saved as a PDF.
print_styling()
entr = entropy_of_df(df["target"], base=2)

# Bins of width 0.8 centred on each class value, separated by 0.2-wide bins that
# stay empty when the labels are integers, so the histogram reads like a bar chart.
if SET == "multi":
    bins = [-0.4,0.4, 0.6, 1.4, 1.6, 2.4, 2.6, 3.4, 3.6, 4.4]
    class_ticks = (0,1,2,3,4)
else:
    bins = [-0.4, 0.4, 0.6, 1.4]
    class_ticks = (0,1)

df["target"].hist(bins=bins)
fig, ax = plt.gcf(), plt.gca()
ax.set_xticks(class_ticks)
ax.set_xlabel("target")
fig.suptitle(f"Target histogramm ({SET} class)")
plt.title(f"Entropy {entr:.5}")
plt.savefig(OUT_DIR + f"{SET}_target_hist.pdf")
plt.show()

In [None]:
# Grid of histograms for the frame's columns; the trailing ';' suppresses the
# echoed return value.
df.hist(figsize=(14,12));

In [None]:
# Breakdown of the "cp" column within each target class.
# Each class is drawn with its own histplot call so that stat="probability"
# normalises the bars within that class rather than over the whole frame.
var = "probability"

fig, ax = plt.subplots(figsize=(10,8))
# Iterate over the classes actually present, highest first (same order as the
# previous hard-coded 4..0 list). The fixed list selected empty subsets whenever
# a class is absent (e.g. presumably classes 2-4 when SET is "binary"), which
# seaborn may reject or draw as nothing depending on its version.
for i in sorted(df["target"].unique(), reverse=True):
    sns.histplot(data=df[df["target"]==i], x="target", hue="cp", multiple="stack", stat=var, ax=ax, shrink=0.5)
ax.set_xlim(-0.5,5)
plt.show()
# Release the figure. plt.clf() at this point would operate on a brand-new
# current figure once the inline backend has closed the shown one, leaving a
# stray empty figure in the cell output.
plt.close(fig)


In [None]:
# One stacked histogram per feature, split by target class.
# A figure is created inside the loop and passed to seaborn explicitly: with a
# single figure created before the loop, only the first plot used the (10, 8)
# size, because plt.show() ends that figure and later plots fell back to the
# default-sized current figure.
for feat in feats:
    fig, ax = plt.subplots(figsize=(10,8))
    sns.histplot(data=df, x=feat, hue="target", multiple="stack", ax=ax)
    plt.show()
    plt.close(fig)  # free the figure before drawing the next one

In [None]:
# Keep only the rows whose target is not 0, then summarise the resulting frame.
df_nozero = df.loc[df["target"].ne(0)]
df_nozero.info()

In [None]:
# One stacked histogram per feature for the rows with a non-zero target.
# A figure is created inside the loop and passed to seaborn explicitly: with a
# single figure created before the loop, only the first plot used the (10, 8)
# size, because plt.show() ends that figure and later plots fell back to the
# default-sized current figure.
for feat in feats:
    fig, ax = plt.subplots(figsize=(10,8))
    sns.histplot(data=df_nozero, x=feat, hue="target", multiple="stack", ax=ax)
    plt.show()
    plt.close(fig)  # free the figure before drawing the next one

In [None]:
# Breakdown of the "cp" column within each non-zero target class.
# Each class is drawn with its own histplot call so that stat="probability"
# normalises the bars within that class rather than over the whole frame.
var = "probability"

fig, ax = plt.subplots(figsize=(10,8))
# Iterate over the classes actually present, highest first. df_nozero never
# contains target 0, so the previous hard-coded 4..0 list always selected at
# least one empty subset, which seaborn may reject or draw as nothing
# depending on its version.
for i in sorted(df_nozero["target"].unique(), reverse=True):
    sns.histplot(data=df_nozero[df_nozero["target"]==i], x="target", hue="cp", multiple="stack", stat=var, ax=ax, shrink=0.5)
ax.set_xlim(-0.5,5)
plt.show()
# Release the figure. plt.clf() at this point would operate on a brand-new
# current figure once the inline backend has closed the shown one, leaving a
# stray empty figure in the cell output.
plt.close(fig)

# Missing values

At this point I noticed that the data does contain missing values: a few entries are the placeholder "?", and they occur in only two columns (`ca` and `thal`, inspected below).

In [None]:
# heart_columns() is a project helper; its result is indexed by column name in
# the cells that follow.
hc = heart_columns()

# Replace the "?" placeholders with NaN so that info() counts them as missing.
df_nans = df.replace("?", np.nan)
df_nans.info()

In [None]:
# Inspect "ca": run the project's count_values helper on it, show its entry in
# hc, and list the rows where the value is the "?" placeholder.
count_values(df, ["ca"], mode=True, norm=False)
display(hc["ca"])
df.loc[df["ca"].eq("?")]

In [None]:
# Inspect "thal": run the project's count_values helper on it, show its entry in
# hc, and list the rows where the value is the "?" placeholder.
count_values(df, ["thal"], mode=True, norm=False)
display(hc["thal"])
df.loc[df["thal"].eq("?")]

In [None]:
# One stacked histogram per feature, coloured by the "thal" value (the raw frame
# is used, so the "?" placeholder shows up as its own hue level).
# A figure is created inside the loop and passed to seaborn explicitly: with a
# single figure created before the loop, only the first plot used the (10, 8)
# size, because plt.show() ends that figure and later plots fell back to the
# default-sized current figure.
for feat in feats:
    fig, ax = plt.subplots(figsize=(10,8))
    sns.histplot(data=df, x=feat, hue="thal", multiple="stack", ax=ax)
    plt.show()
    plt.close(fig)  # free the figure before drawing the next one

In [None]:
# One stacked histogram per feature, coloured by the "ca" value (the raw frame
# is used, so the "?" placeholder shows up as its own hue level).
# A figure is created inside the loop and passed to seaborn explicitly: with a
# single figure created before the loop, only the first plot used the (10, 8)
# size, because plt.show() ends that figure and later plots fell back to the
# default-sized current figure.
for feat in feats:
    fig, ax = plt.subplots(figsize=(10,8))
    sns.histplot(data=df, x=feat, hue="ca", multiple="stack", ax=ax)
    plt.show()
    plt.close(fig)  # free the figure before drawing the next one