In [1]:
# -- PART 0: Import Statements
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder 

In [2]:
# -- PART 1: Load the Data

cols = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']
df = pd.read_csv('processed.cleveland.data', names = cols)
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


DATA NOTES

INTEGER VARIABLES
- **age:** in years 
- **trestbps:** resting blood pressure (mm Hg)
- **chol:** serum cholesterol (mg/dl)
- **thalac:** maximum heart rate achieved
- **old peak:** ST depressed induced by exercise relative to rest
- **ca:** number of major vessels 

CATEGORICAL VARIABLES

Nominal (ranking doesn't matter)
- **sex**: (0 = female, 1 = male)
- **fbs:** fasting blood sugar (0 = not fasting, 1 = fasting)
- **exang:** exercise-induced ST-depression relative to rest (0 = no chest pain, 1 = chest pain)
- **num**: Label! (0 = no heart disease present, 1-4 = heart disease present)

Ordinal (ranking matters) 
- **cp:** chest pain type (0 = typical agina (heart-related chest pain), 1 = atypical agina, 2 = non-anginal pain, 3 = asymptomatic)
- **restecg:** resting ecg (0 = normal, 1 = ST-T wave abnormality, 2 = left ventricular hypertrophy)
- **slope:** slope of peak exercise ST segment (0 = upsloping, 1 = flat, 2 = downsloping) [TK - <- category 3??]
- **thal:** thalassemia (blood disorder indicator) (0 = normal, 1 = fixed defect, 2 = reversible defect) [TK - <- category 6/7??]

In [None]:
# -- PART 2: Data Preprocessing 

# ---- (a) Handle Missing Values 

# check for ? values in dataset
df.replace('?', np.nan, inplace=True)

# check for NaN values in the dataset
df.isna().sum()

In [None]:
# 4 missing for ca and 2 for thal, can impute with median, but choosing to simply drop due to sample size
df = df.dropna(subset = ['ca', 'thal'])

In [None]:
# ---- (b) create new column class (label) that converts num to a binary classification problem 
df['class'] = df['num'].apply(lambda x: 1 if x > 0 else 0)
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num,class
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0,0


In [None]:
# ---- (c) handle variable data types 

# check data types 
df.dtypes

In [None]:
# cast data 

# integer variables
df['age'] = df['age'].astype('int')
df['trestbps'] = df['trestbps'].astype('int')
df['chol'] = pd.to_numeric(df['thal']).astype('int')
df['thalach'] = pd.to_numeric(df['thalach']).astype('int')
df['ca'] = pd.to_numeric(df['ca']).astype('int')

# categorical variables (nominal)
df['sex'] = df['sex'].astype('int')
df['fbs'] = df['fbs'].astype('int')
df['exang'] = df['exang'].astype('int')

# categorical variables (ordinal)
df['cp'] = df['cp'].astype('int')
df['restecg'] = df['restecg'].astype('int')
df['slope'] = df['slope'].astype('int')
df['thal'] = pd.to_numeric(df['thal']).astype('int')

df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num,class
0,63,1,1,145,6,1,2,150,0,2.3,3,0,6,0,0
1,67,1,4,160,3,0,2,108,1,1.5,2,3,3,2,1
2,67,1,4,120,7,0,2,129,1,2.6,2,2,7,1,1
3,37,1,3,130,3,0,0,187,0,3.5,3,0,3,0,0
4,41,0,2,130,3,0,2,172,0,1.4,1,0,3,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,57,0,4,140,7,0,0,123,1,0.2,2,0,7,1,1
298,45,1,1,110,7,0,0,132,0,1.2,2,0,7,1,1
299,68,1,4,144,7,1,0,141,0,3.4,2,2,7,2,1
300,57,1,4,130,7,0,0,115,1,1.2,2,1,7,3,1


In [9]:
# -- PART 3: Data Visualization 

In [10]:
# -- PART 4: Model Training 

In [11]:
# -- PART 5: Model Evaluation