In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import scale
from scipy import stats
from sklearn.tree import DecisionTreeClassifier

# Data Explanation
### Predictor:
num: diagnosis of heart disease (angiographic disease status) 
-- Value 0: < 50% diameter narrowing 
-- Value 1: > 50% diameter narrowing 

### Only 14 attributes used: 

age: age in years 
sex: sex (1 = male; 0 = female) 
cp: chest pain type 
-- Value 1: typical angina 
-- Value 2: atypical angina 
-- Value 3: non-anginal pain 
-- Value 4: asymptomatic 
trestbps: resting blood pressure (in mm Hg on admission to the hospital) 
chol: serum cholestoral in mg/dl 
fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false) 
restecg: resting electrocardiographic results 
-- Value 0: normal 
-- Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV) 
-- Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria 
thalach: maximum heart rate achieved 
exang: exercise induced angina (1 = yes; 0 = no) 
oldpeak = ST depression induced by exercise relative to rest 
slope: the slope of the peak exercise ST segment 
-- Value 1: upsloping 
-- Value 2: flat 
-- Value 3: downsloping 
ca: number of major vessels (0-3) colored by flourosopy 
thal: 3 = normal; 6 = fixed defect; 7 = reversable defect 

In [2]:
data1 = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.switzerland.data",header=None)
data2 = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/reprocessed.hungarian.data',header=None,delim_whitespace=True)
data3 = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data',header=None)
data4 = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.va.data',header=None)
data = data1.append(data2).append(data3).append(data4)
data.columns=['age','sex','cp','trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal','num']
data.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,32.0,1.0,1.0,95,0,?,0,127,0,.7,1,?,?,1
1,34.0,1.0,4.0,115,0,?,?,154,0,.2,1,?,?,1
2,35.0,1.0,4.0,?,0,?,0,130,1,?,?,?,7,3
3,36.0,1.0,4.0,110,0,?,0,125,1,1,2,?,6,1
4,38.0,0.0,4.0,105,0,?,0,166,0,2.8,1,?,?,2
5,38.0,0.0,4.0,110,0,0,0,156,0,0,2,?,3,1
6,38.0,1.0,3.0,100,0,?,0,179,0,-1.1,1,?,?,0
7,38.0,1.0,3.0,115,0,0,0,128,1,0,2,?,7,1
8,38.0,1.0,4.0,135,0,?,0,150,0,0,?,?,3,2
9,38.0,1.0,4.0,150,0,?,0,120,1,?,?,?,3,1


In [3]:
data.shape

(920, 14)

In [4]:
data.isnull().sum().sum()

0

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 920 entries, 0 to 199
Data columns (total 14 columns):
age         920 non-null float64
sex         920 non-null float64
cp          920 non-null float64
trestbps    920 non-null object
chol        920 non-null object
fbs         920 non-null object
restecg     920 non-null object
thalach     920 non-null object
exang       920 non-null object
oldpeak     920 non-null object
slope       920 non-null object
ca          920 non-null object
thal        920 non-null object
num         920 non-null int64
dtypes: float64(3), int64(1), object(10)
memory usage: 107.8+ KB


In [6]:
data.describe()

Unnamed: 0,age,sex,cp,num
count,920.0,920.0,920.0,920.0
mean,53.51087,0.78913,3.25,1.133696
std,9.424685,0.408148,0.930969,1.258942
min,28.0,0.0,1.0,0.0
25%,47.0,1.0,3.0,0.0
50%,54.0,1.0,4.0,1.0
75%,60.0,1.0,4.0,2.0
max,77.0,1.0,4.0,4.0


In [7]:
print('The sum of all the ? symbols is',data.trestbps.str.contains('\?').sum())
print('The sum of all the ? symbols is',data.chol.str.contains('\?').sum())
print('The sum of all the ? symbols is',data.fbs.str.contains('\?').sum())
print('The sum of all the ? symbols is',data.restecg.str.contains('\?').sum())
print('The sum of all the ? symbols is',data.thalach.str.contains('\?').sum())
print('The sum of all the ? symbols is',data.exang.str.contains('\?').sum())
print('The sum of all the ? symbols is',data.oldpeak.str.contains('\?').sum())
print('The sum of all the ? symbols is',data.slope.str.contains('\?').sum())
print('The sum of all the ? symbols is',data.ca.str.contains('\?').sum())
print('The sum of all the ? symbols is',data.thal.str.contains('\?').sum())

The sum of all the ? symbols is 58
The sum of all the ? symbols is 7
The sum of all the ? symbols is 82
The sum of all the ? symbols is 1
The sum of all the ? symbols is 54
The sum of all the ? symbols is 54
The sum of all the ? symbols is 62
The sum of all the ? symbols is 119
The sum of all the ? symbols is 320
The sum of all the ? symbols is 220


In [8]:
data.trestbps.replace('?',np.nan,inplace=True)
data.chol.replace('?',np.nan,inplace=True)
data.fbs.replace('?',np.nan,inplace=True)
data.restecg.replace('?',np.nan,inplace=True)
data.thalach.replace('?',np.nan,inplace=True)
data.exang.replace('?',np.nan,inplace=True)
data.oldpeak.replace('?',np.nan,inplace=True)
data.slope.replace('?',np.nan,inplace=True)
data.ca.replace('?',np.nan,inplace=True)
data.thal.replace('?',np.nan,inplace=True)
data.isnull().sum().sum()

977

In [10]:
data.dropna(inplace=True)
data.isnull().sum().sum()

0

In [29]:
data.drop_duplicates()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,40.0,1.0,2.0,140.0,289,0.0,0,172.0,0.0,0.0,-9.0,-9.0,-9.0,0
1,49.0,0.0,3.0,160.0,180,0.0,0,156.0,0.0,1.0,2.0,-9.0,-9.0,1
2,37.0,1.0,2.0,130.0,283,0.0,1,98.0,0.0,0.0,-9.0,-9.0,-9.0,0
3,48.0,0.0,4.0,138.0,214,0.0,0,108.0,1.0,1.5,2.0,-9.0,-9.0,3
4,54.0,1.0,3.0,150.0,-9,0.0,0,122.0,0.0,0.0,-9.0,-9.0,-9.0,0
5,39.0,1.0,3.0,120.0,339,0.0,0,170.0,0.0,0.0,-9.0,-9.0,-9.0,0
6,45.0,0.0,2.0,130.0,237,0.0,0,170.0,0.0,0.0,-9.0,-9.0,-9.0,0
7,54.0,1.0,2.0,110.0,208,0.0,0,142.0,0.0,0.0,-9.0,-9.0,-9.0,0
8,37.0,1.0,4.0,140.0,207,0.0,0,130.0,1.0,1.5,2.0,-9.0,-9.0,1
9,48.0,0.0,2.0,120.0,284,0.0,0,120.0,0.0,0.0,-9.0,-9.0,-9.0,0


In [None]:
np.asarray(data.trestbps.dropna(), dtype=np.float).mean()

In [None]:
data.trestbps.fillna(np.asarray(data.trestbps.dropna(), dtype=np.float).mean(),inplace=True)

In [None]:
print(stats.mode(np.asarray(data.restecg.dropna(),dtype= np.int)).mode)

In [None]:
np.asarray(data.chol.dropna(), dtype=np.float).mean()

In [None]:
data.chol.fillna(np.asarray(data.chol.dropna(), dtype=np.float).mean(),inplace=True)

In [None]:
data.isnull().sum().sum()

In [None]:
print(stats.mode(np.asarray(data.fbs.dropna(),dtype= np.int)).mode[0])

In [None]:
data.fbs.fillna(stats.mode(np.asarray(data.fbs.dropna(),dtype= np.int)).mode[0],inplace=True)

In [None]:
data.isnull().sum().sum()

In [None]:
print(stats.mode(np.asarray(data.restecg.dropna(),dtype= np.int)))

In [None]:
print(stats.mode(np.asarray(data.restecg.dropna(),dtype= np.int)).mode[0])
data.restecg.fillna(stats.mode(np.asarray(data.restecg.dropna(),dtype= np.int)).mode[0],inplace=True)

In [None]:
data.isnull().sum().sum()

In [None]:
np.asarray(data.thalach.dropna(), dtype=np.float).mean()

In [None]:
data.thalach.fillna(np.asarray(data.thalach.dropna(), dtype=np.float).mean(),inplace=True)

In [None]:
data.isnull().sum().sum()

In [None]:
print(stats.mode(np.asarray(data.exang.dropna(),dtype= np.int)))

In [None]:
print(stats.mode(np.asarray(data.exang.dropna(),dtype= np.int)).mode[0])
data.exang.fillna(stats.mode(np.asarray(data.exang.dropna(),dtype= np.int)).mode[0],inplace=True)

In [None]:
data.isnull().sum().sum()

In [None]:
np.asarray(data.oldpeak.dropna(), dtype=np.float).mean()

In [None]:
data.oldpeak.fillna(np.asarray(data.oldpeak.dropna(), dtype=np.float).mean(),inplace=True)

In [None]:
data.isnull().sum().sum()

In [None]:
print(stats.mode(np.asarray(data.slope.dropna(),dtype= np.int)))

In [None]:
print(stats.mode(np.asarray(data.slope.dropna(),dtype= np.int)).mode[0])
data.slope.fillna(stats.mode(np.asarray(data.slope.dropna(),dtype= np.int)).mode[0],inplace=True)

In [None]:
data.isnull().sum().sum()

In [None]:
print(stats.mode(np.asarray(data.ca.dropna(),dtype= np.float)))

In [None]:
print(stats.mode(np.asarray(data.ca.dropna(),dtype= np.float)).mode[0])
data.ca.fillna(stats.mode(np.asarray(data.ca.dropna(),dtype= np.float)).mode[0],inplace=True)

In [None]:
data.isnull().sum().sum()

In [None]:
print(stats.mode(np.asarray(data.thal.dropna(),dtype= np.float)))

In [None]:
print(stats.mode(np.asarray(data.thal.dropna(),dtype= np.float)).mode[0])
data.thal.fillna(stats.mode(np.asarray(data.thal.dropna(),dtype= np.float)).mode[0],inplace=True)

In [None]:
data.isnull().sum().sum()

In [None]:
data.head()

In [56]:
# converting all the data to numeric data type
data.trestbps=pd.to_numeric(data.trestbps)
data.fbs=pd.to_numeric(data.fbs)
data.chol=pd.to_numeric(data.chol)
data.restecg=pd.to_numeric(data.restecg)
data.thalach=pd.to_numeric(data.thalach)
data.exang=pd.to_numeric(data.exang)
data.oldpeak=pd.to_numeric(data.oldpeak)
data.slope=pd.to_numeric(data.slope)
data.ca=pd.to_numeric(data.ca)
data.thal=pd.to_numeric(data.thal)

In [57]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 592 entries, 0 to 28
Data columns (total 14 columns):
age         592 non-null float64
sex         592 non-null float64
cp          592 non-null float64
trestbps    592 non-null float64
chol        592 non-null float64
fbs         592 non-null float64
restecg     592 non-null int64
thalach     592 non-null float64
exang       592 non-null float64
oldpeak     592 non-null float64
slope       592 non-null float64
ca          592 non-null float64
thal        592 non-null float64
num         592 non-null int64
dtypes: float64(12), int64(2)
memory usage: 89.4 KB


In [58]:
data.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
count,592.0,592.0,592.0,592.0,592.0,592.0,592.0,592.0,592.0,592.0,592.0,592.0,592.0,592.0
mean,51.209459,0.701014,3.072635,131.876689,238.743243,-0.015203,0.592905,144.099662,0.300676,0.823142,-1.748311,-4.054054,-1.391892,0.869932
std,9.087656,0.458201,0.968083,18.590793,77.272465,1.096729,0.953123,24.629956,0.60242,1.070646,5.011914,4.908343,7.025345,1.23608
min,28.0,0.0,1.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,0.0,-9.0,-9.0,-9.0,0.0
25%,44.0,0.0,2.0,120.0,205.75,0.0,0.0,127.0,0.0,0.0,-9.0,-9.0,-9.0,0.0
50%,52.0,1.0,3.0,130.0,239.0,0.0,0.0,146.0,0.0,0.2,1.0,0.0,3.0,0.0
75%,58.0,1.0,4.0,140.0,277.0,0.0,2.0,162.0,1.0,1.5,2.0,0.0,6.0,2.0
max,77.0,1.0,4.0,200.0,603.0,1.0,2.0,202.0,1.0,6.2,3.0,9.0,7.0,4.0


In [59]:
data.corr()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
age,1.0,-0.06011,0.15117,0.232457,0.129703,0.101673,0.232169,-0.289262,0.132505,0.252934,0.36224,0.398141,0.344871,0.223254
sex,-0.06011,1.0,0.121518,0.02844,-0.056461,0.058282,-0.03896,-0.076969,0.154595,0.097597,-0.008437,-0.044816,0.014001,0.22101
cp,0.15117,0.121518,1.0,0.033404,0.102352,0.005823,0.099951,-0.303603,0.365776,0.279492,0.264752,0.121544,0.141524,0.419386
trestbps,0.232457,0.02844,0.033404,1.0,0.054105,0.070199,0.056176,-0.029026,0.298079,0.161603,0.070249,0.005675,-0.001757,0.185664
chol,0.129703,-0.056461,0.102352,0.054105,1.0,-0.075437,0.120502,-0.029924,0.070869,0.105636,0.176033,0.098016,0.118296,0.164187
fbs,0.101673,0.058282,0.005823,0.070199,-0.075437,1.0,0.070148,0.030624,0.022296,0.061687,0.100742,0.149466,0.092998,0.083413
restecg,0.232169,-0.03896,0.099951,0.056176,0.120502,0.070148,1.0,0.054852,0.045567,0.174729,0.311228,0.42099,0.372612,0.104346
thalach,-0.289262,-0.076969,-0.303603,-0.029026,-0.029924,0.030624,0.054852,1.0,-0.120508,-0.263687,-0.055553,0.186336,0.160088,-0.350715
exang,0.132505,0.154595,0.365776,0.298079,0.070869,0.022296,0.045567,-0.120508,1.0,0.301905,0.282561,0.065019,0.098654,0.38891
oldpeak,0.252934,0.097597,0.279492,0.161603,0.105636,0.061687,0.174729,-0.263687,0.301905,1.0,0.557202,0.245975,0.245196,0.531729


In [60]:
X = data.loc[:,['age','sex','cp','trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal']]
X_scaled = pd.DataFrame(scale(X))
y = data.num
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.3,random_state=10)
X_train_scaled,X_test_scaled,y_train_scaled,y_test_scaled = train_test_split(X_scaled,y, test_size=0.3,random_state=10)

#y = pd.DataFrame(scale(y))

In [61]:
model_logr = LogisticRegression()
model_logr.fit(X_train,y_train)
y_predict = model_logr.predict(X_test)
print(pd.crosstab(y_test,y_predict))
print(confusion_matrix(y_test,y_predict))
print(accuracy_score(y_test,y_predict))

col_0    0  1  2  3  4
num                   
0      101  1  1  0  0
1       20  2  3  5  0
2        7  1  1  5  1
3        8  1  4  2  0
4        4  0  3  8  0
[[101   1   1   0   0]
 [ 20   2   3   5   0]
 [  7   1   1   5   1]
 [  8   1   4   2   0]
 [  4   0   3   8   0]]
0.5955056179775281


In [62]:
model_logr = LogisticRegression()
model_logr.fit(X_train_scaled,y_train_scaled)
y_predict = model_logr.predict(X_test_scaled)
print(pd.crosstab(y_test_scaled,y_predict))
print(confusion_matrix(y_test_scaled,y_predict))
print(accuracy_score(y_test_scaled,y_predict))

col_0   0  1  2  3
num               
0      96  3  3  1
1      20  3  3  4
2       8  1  1  5
3       8  1  3  3
4       2  1  3  9
[[96  3  3  1  0]
 [20  3  3  4  0]
 [ 8  1  1  5  0]
 [ 8  1  3  3  0]
 [ 2  1  3  9  0]]
0.5786516853932584


In [63]:
model_rfc = RandomForestClassifier(max_features=3,random_state=4)
model_rfc.fit(X_train,y_train)
y_predict = model_rfc.predict(X_test)
print(accuracy_score(y_test,y_predict))
print(pd.crosstab(y_test,y_predict))
print(confusion_matrix(y_test,y_predict))

0.6235955056179775
col_0   0  1  2  3  4
num                  
0      98  2  2  1  0
1      18  6  3  3  0
2       8  1  3  3  0
3       5  2  5  3  0
4       2  3  6  3  1
[[98  2  2  1  0]
 [18  6  3  3  0]
 [ 8  1  3  3  0]
 [ 5  2  5  3  0]
 [ 2  3  6  3  1]]


In [64]:
model_rfc = RandomForestClassifier(max_features=3,random_state=4)
model_rfc.fit(X_train_scaled,y_train_scaled)
y_predict = model_rfc.predict(X_test_scaled)
print(accuracy_score(y_test_scaled,y_predict))
print(pd.crosstab(y_test_scaled,y_predict))
print(confusion_matrix(y_test_scaled,y_predict))

0.6292134831460674
col_0   0  1  2  3  4
num                  
0      99  0  3  1  0
1      18  6  3  3  0
2       8  1  3  3  0
3       5  1  6  3  0
4       2  3  6  3  1
[[99  0  3  1  0]
 [18  6  3  3  0]
 [ 8  1  3  3  0]
 [ 5  1  6  3  0]
 [ 2  3  6  3  1]]


In [65]:
model_knn = KNeighborsClassifier(n_neighbors = 2, metric='euclidean')
model_knn.fit(X_train,y_train)
y_predict = model_knn.predict(X_test)
print(confusion_matrix(y_test,y_predict))
print(accuracy_score(y_test,y_predict))

[[89  9  4  1  0]
 [26  2  1  1  0]
 [10  3  0  2  0]
 [ 9  3  3  0  0]
 [10  3  1  1  0]]
0.5112359550561798


In [66]:
model_knn = KNeighborsClassifier(n_neighbors = 2, metric='euclidean')
model_knn.fit(X_train_scaled,y_train_scaled)
y_predict = model_knn.predict(X_test_scaled)
print(confusion_matrix(y_test_scaled,y_predict))
print(accuracy_score(y_test_scaled,y_predict))

[[97  3  2  1  0]
 [19  5  6  0  0]
 [ 8  2  4  1  0]
 [ 7  3  4  1  0]
 [ 4  2  7  1  1]]
0.6067415730337079


In [67]:
model_dt = DecisionTreeClassifier()
model_dt.fit(X_train,y_train)
y_predict = model_dt.predict(X_test)
print(accuracy_score(y_test,y_predict))
print(confusion_matrix(y_test,y_predict))

0.5393258426966292
[[82 10  2  9  0]
 [10  8  3  8  1]
 [ 7  3  0  3  2]
 [ 4  4  4  3  0]
 [ 0  4  4  4  3]]


In [68]:
model_dt = DecisionTreeClassifier()
model_dt.fit(X_train_scaled,y_train_scaled)
y_predict = model_dt.predict(X_test_scaled)
print(accuracy_score(y_test_scaled,y_predict))
print(confusion_matrix(y_test_scaled,y_predict))

0.5224719101123596
[[82  9  3  7  2]
 [ 9  8  4  8  1]
 [ 7  3  0  4  1]
 [ 4  4  4  3  0]
 [ 1  4  5  5  0]]


In [69]:
model = SVC(C=1000.0, kernel='rbf', gamma=0.1)
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
print(accuracy_score(y_test,y_predict))
print(pd.crosstab(y_test,y_predict))

0.5786516853932584
col_0    0
num       
0      103
1       30
2       15
3       15
4       15


In [70]:
model = SVC(C=1000.0, kernel='rbf', gamma=0.1)
model.fit(X_train_scaled, y_train_scaled)
y_predict = model.predict(X_test_scaled)
print(accuracy_score(y_test_scaled,y_predict))
print(pd.crosstab(y_test_scaled,y_predict))

0.5842696629213483
col_0   0  1  2  3  4
num                  
0      87  9  3  4  0
1      13  9  2  6  0
2       7  3  3  1  1
3       4  3  4  4  0
4       3  4  4  3  1
