In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,recall_score,precision_score

In [3]:
# Read the CSV file into a DataFrame; the bare `df` on the last line makes the
# notebook render the (truncated) table as the cell output.
df = pd.read_csv(filepath_or_buffer="heart_statlog_cleveland_hungary_final.csv")
df

Unnamed: 0,age,sex,chest pain type,resting bp s,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope,target
0,40,1,2,140,289,0,0,172,0,0.0,1,0
1,49,0,3,160,180,0,0,156,0,1.0,2,1
2,37,1,2,130,283,0,1,98,0,0.0,1,0
3,48,0,4,138,214,0,0,108,1,1.5,2,1
4,54,1,3,150,195,0,0,122,0,0.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1185,45,1,1,110,264,0,0,132,0,1.2,2,1
1186,68,1,4,144,193,1,0,141,0,3.4,2,1
1187,57,1,4,130,131,0,0,115,1,1.2,2,1
1188,57,0,2,130,236,0,2,174,0,0.0,2,1


In [5]:
# Plain-text preview of the first five rows (5 is the default for `head`).
print(df.head(n=5))

   age  sex  chest pain type  resting bp s  cholesterol  fasting blood sugar  \
0   40    1                2           140          289                    0   
1   49    0                3           160          180                    0   
2   37    1                2           130          283                    0   
3   48    0                4           138          214                    0   
4   54    1                3           150          195                    0   

   resting ecg  max heart rate  exercise angina  oldpeak  ST slope  target  
0            0             172                0      0.0         1       0  
1            0             156                0      1.0         2       1  
2            1              98                0      0.0         1       0  
3            0             108                1      1.5         2       1  
4            0             122                0      0.0         1       0  


In [7]:
# Print the column names, non-null counts and dtypes of `df`; used here as a
# quick check for missing values and unexpected column types.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1190 entries, 0 to 1189
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  1190 non-null   int64  
 1   sex                  1190 non-null   int64  
 2   chest pain type      1190 non-null   int64  
 3   resting bp s         1190 non-null   int64  
 4   cholesterol          1190 non-null   int64  
 5   fasting blood sugar  1190 non-null   int64  
 6   resting ecg          1190 non-null   int64  
 7   max heart rate       1190 non-null   int64  
 8   exercise angina      1190 non-null   int64  
 9   oldpeak              1190 non-null   float64
 10  ST slope             1190 non-null   int64  
 11  target               1190 non-null   int64  
dtypes: float64(1), int64(11)
memory usage: 111.7 KB


In [9]:
# Count how many rows fall into each value of the `target` column to see how
# balanced the two classes are.
class_counts = df["target"].value_counts()
print(class_counts)

target
1    629
0    561
Name: count, dtype: int64


In [11]:
# Separate the label column from the predictors:
#   y -> the `target` column
#   x -> every remaining column of `df`
y = df["target"]
x = df.drop(columns=["target"])
print(x.shape, y.shape)

(1190, 11) (1190,)


In [17]:
scaler = StandardScaler()
x = scaler.fit_transform(x)

In [19]:
X_train,X_test,Y_train,Y_test = train_test_split(x,y,test_size = 0.2)
print(X_train.shape,X_test.shape)

(952, 11) (238, 11)


In [21]:
# Baseline decision tree with hand-picked depth / split / leaf limits.
# scikit-learn permutes the features at every split, so ties between equally
# good splits are broken randomly; fixing `random_state` makes the fitted tree
# (and the accuracies reported from it) reproducible across runs.
model = DecisionTreeClassifier(
    max_depth=20,
    min_samples_split=4,
    min_samples_leaf=2,
    random_state=42,
)
model.fit(X_train, Y_train)

In [23]:
# Predict labels for the training and the test features with the fitted
# decision tree (`model`), in that order.
Y_train_pred, Y_test_pred = (
    model.predict(features) for features in (X_train, X_test)
)

In [25]:
# Report accuracy (as a percentage) on each partition. The labels name the
# model and the split so the two printed lines can be told apart from each
# other and from the other models' reports; a large train/test gap suggests
# overfitting.
print("Decision Tree Train Accuracy:", accuracy_score(Y_train, Y_train_pred) * 100, "%")
print("Decision Tree Test Accuracy:", accuracy_score(Y_test, Y_test_pred) * 100, "%")

accuracy: 97.05882352941177 %
accuracy: 80.25210084033614 %


In [27]:
# Random forest of 20 trees. Bootstrap resampling and per-split feature
# subsampling are random, so `random_state` is fixed to make the fitted forest
# and its reported accuracies reproducible across runs.
model_RF = RandomForestClassifier(n_estimators=20, random_state=42)
model_RF.fit(X_train, Y_train)

In [29]:
# Predict with the random forest on both partitions. Note that this reuses
# (and therefore overwrites) the names Y_train_pred / Y_test_pred.
Y_train_pred, Y_test_pred = map(model_RF.predict, (X_train, X_test))

In [31]:
# Report accuracy (as a percentage) on each partition. The labels name the
# model and the split so this output is distinguishable from the other models'
# otherwise identical-looking reports.
print("Random Forest Train Accuracy:", accuracy_score(Y_train, Y_train_pred) * 100, "%")
print("Random Forest Test Accuracy:", accuracy_score(Y_test, Y_test_pred) * 100, "%")

accuracy: 99.68487394957984 %
accuracy: 88.65546218487394 %


In [33]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [35]:
# Hyper-parameter values to try for the decision tree; every combination is
# evaluated, i.e. 4 * 3 * 3 = 36 candidates.
param_grid = dict(
    max_depth=[5, 10, 15, 20],
    min_samples_split=[2, 4, 6],
    min_samples_leaf=[1, 2, 4],
)

In [37]:
# Base estimator that the grid search clones for every candidate and fold.
# Seeding it keeps tie-breaking between equally good splits deterministic, so
# the cross-validation scores and the selected best parameters do not change
# from run to run.
dt_model = DecisionTreeClassifier(random_state=42)

In [39]:
# Exhaustive search over `param_grid`, scoring each candidate by accuracy with
# 5-fold cross-validation; n_jobs=-1 uses all available cores and verbose=1
# prints a one-line progress summary.
grid_search = GridSearchCV(
    dt_model, param_grid, cv=5, scoring="accuracy", n_jobs=-1, verbose=1
)

In [41]:
# Run the cross-validated search on the training partition only, so the test
# partition stays unseen during model selection.
grid_search.fit(X=X_train, y=Y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [43]:
# Show the parameter combination with the best cross-validated score.
winning_params = grid_search.best_params_
print("Best Parameters:", winning_params)

Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 4}


In [45]:
# Take the estimator refitted with the best parameters found by the search and
# predict on the training and test features, in that order.
best_model = grid_search.best_estimator_
train_pred, test_pred = [best_model.predict(part) for part in (X_train, X_test)]

In [47]:
# Accuracy of the tuned tree on each partition, expressed as a percentage.
train_acc_pct = accuracy_score(Y_train, train_pred) * 100
test_acc_pct = accuracy_score(Y_test, test_pred) * 100
print("Train Accuracy:", train_acc_pct, "%")
print("Test Accuracy:", test_acc_pct, "%")

Train Accuracy: 98.21428571428571 %
Test Accuracy: 86.1344537815126 %
