# Preprocessing
### Mary Kate Montgomery
### Script to combine the four source datasets (Cleveland, V.A. Long Beach, Switzerland, Hungary), apply basic preprocessing, and save the result to a single CSV file

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read data for all 4 datasets
# The files are read with header=None, so the same 14 column names are
# supplied explicitly for every source.
column_names = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num",
]
df_cle = pd.read_csv("processed.cleveland.data", header=None, names=column_names)
df_va = pd.read_csv("processed.va.data", header=None, names=column_names)
df_sui = pd.read_csv("processed.switzerland.data", header=None, names=column_names)
df_hu = pd.read_csv("processed.hungarian.data", header=None, names=column_names)

In [3]:
# Add source column
# Each frame gets a constant 'loc' value naming the institution it came from.
source_labels = [
    (df_cle, 'Cleveland Clinic Foundation'),
    (df_va, 'V.A. Medical Center, Long Beach, CA'),
    (df_sui, 'University Hospital, Zurich, Switzerland'),
    (df_hu, 'Hungarian Institute of Cardiology, Budapest'),
]
for source_frame, source_label in source_labels:
    source_frame['loc'] = source_label

In [4]:
# Merge data into single dataframe
# Rows are stacked in the order Cleveland, V.A., Switzerland, Hungary;
# ignore_index=True gives the combined frame a fresh 0..n-1 row index.
source_frames = [df_cle, df_va, df_sui, df_hu]
df = pd.concat(objs=source_frames, ignore_index=True)

In [5]:
# Replace question marks with NaNs
# Any cell equal to the string '?' becomes NaN.
df = df.replace(to_replace='?', value=np.nan)

In [6]:
# Set datatypes (not all sources read as same type)
# Every feature column is cast to float64; 'num' is cast to int32.
# The 'loc' column is not listed and keeps its dtype.
float_columns = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal",
]
column_dtypes = {column: np.float64 for column in float_columns}
column_dtypes["num"] = np.int32
df = df.astype(column_dtypes)

In [7]:
# Create target column and drop num
# target is 0 where num == 0 and 1 for every other num value.
df['target'] = df['num'].ne(0).astype(np.int64)
df = df.drop(columns=['num'])

In [8]:
# Change values to be more interpretable
# 'sex' is float64 and 'target' is an integer column at this point. Writing
# string labels straight into them relies on pandas silently upcasting the
# column to object, which emits a FutureWarning since pandas 2.1 and is slated
# to become an error. Cast both columns to object explicitly first.
df = df.astype({'sex': object, 'target': object})

# Recode the numeric codes as labels. Rows matching neither code (e.g. NaN)
# are left untouched, and re-running this cell is a no-op.
df.loc[df['sex'] == 1, 'sex'] = 'M'
df.loc[df['sex'] == 0, 'sex'] = 'F'
df.loc[df['target'] == 1, 'target'] = 'Heart Disease'
df.loc[df['target'] == 0, 'target'] = 'No Heart Disease'

# Give the abbreviated columns descriptive names.
readable_names = {
    'cp': 'chest pain',
    'exang': 'exercise induced angina',
    'thal': 'thallium stress test',
    'ca': 'coronary artery',
    'thalach': 'max heart rate',
    'trestbps': 'blood pressure',
    'chol': 'cholesterol',
    'fbs': 'blood sugar',
}
df = df.rename(columns=readable_names)

In [9]:
# Preview the first five rows of the dataframe
df.head(n=5)

Unnamed: 0,age,sex,chest pain,blood pressure,cholesterol,blood sugar,restecg,max heart rate,exercise induced angina,oldpeak,slope,coronary artery,thallium stress test,loc,target
0,63.0,M,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,Cleveland Clinic Foundation,No Heart Disease
1,67.0,M,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,Cleveland Clinic Foundation,Heart Disease
2,67.0,M,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,Cleveland Clinic Foundation,Heart Disease
3,37.0,M,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,Cleveland Clinic Foundation,No Heart Disease
4,41.0,F,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,Cleveland Clinic Foundation,No Heart Disease


In [10]:
# Write parsed dataset to single csv
# index=False keeps the row index out of the file.
output_path = "alldata.csv"
df.to_csv(output_path, index=False)