In [24]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.model_selection import train_test_split,RandomizedSearchCV,GridSearchCV
from sklearn.linear_model import LogisticRegression,RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score,accuracy_score,recall_score,confusion_matrix,precision_score

In [2]:
df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [4]:
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [6]:
df['ChestPainType'].value_counts()

ASY    496
NAP    203
ATA    173
TA      46
Name: ChestPainType, dtype: int64

In [12]:
df['ChestPainType']= pd.Categorical(df['ChestPainType']).codes

In [14]:
df['RestingECG']= pd.Categorical(df['RestingECG']).codes

In [16]:
df['ExerciseAngina']= pd.Categorical(df['ExerciseAngina']).codes

In [18]:
df['ST_Slope']= pd.Categorical(df['ST_Slope']).codes

In [19]:
df['Sex'] = pd.Categorical(df['Sex']).codes

In [20]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,1,172,0,0.0,2,0
1,49,0,2,160,180,0,1,156,0,1.0,1,1
2,37,1,1,130,283,0,2,98,0,0.0,2,0
3,48,0,0,138,214,0,1,108,1,1.5,1,1
4,54,1,2,150,195,0,1,122,0,0.0,2,0


In [22]:
features = df.drop('HeartDisease',axis=1)
target = df['HeartDisease']
x_train , x_test , y_train, y_test = train_test_split(features,target,test_size=0.2,random_state=42)

In [51]:
models = {
    'LogisticRegression':LogisticRegression(max_iter=1000),
    'RidgeClassifier' : RidgeClassifier(),
    'DecisionTreeClassifier':DecisionTreeClassifier(),
    'RandomForestClassifier':RandomForestClassifier(n_estimators=100),
    'XGBClassifier':XGBClassifier()
}
result = pd.DataFrame(index=['SCORE','ACCURACY','PRECISION',"RECALL"])

def regressions(models,x_train,y_train,x_test,y_test):
    for name,model in models.items():
        model.fit(x_train,y_train)
        prediction = model.predict(x_test)
        score = f1_score(y_true=y_test,y_pred=prediction)
        accuracy = accuracy_score(y_true=y_test,y_pred=prediction)
        precision = precision_score(y_true=y_test,y_pred=prediction)
        recall = recall_score(y_true=y_test,y_pred=prediction)
        conf = confusion_matrix(y_true=y_test,y_pred=prediction)
        result[f'{name}'] = [score,accuracy,precision,recall]
    return result

In [52]:
regressions(models,x_train,y_train,x_test,y_test)

Unnamed: 0,LogisticRegression,RidgeClassifier,DecisionTreeClassifier,RandomForestClassifier,XGBClassifier
SCORE,0.857143,0.852941,0.839024,0.900474,0.869565
ACCURACY,0.842391,0.836957,0.820652,0.88587,0.853261
PRECISION,0.90625,0.896907,0.877551,0.913462,0.9
RECALL,0.813084,0.813084,0.803738,0.88785,0.841121


In [None]:
n_estimators = [100,200,300,500,1000]
max_depth = [None,2,5,10]
min_samples_split = [2,5,10]
min_samples_leaf = [1,2,5]
max_features = ['auto','log2','sqrt']

params = {
    "max_features":max_features,
    'max_depth':max_depth,
    'min_samples_split':min_samples_split,
    'min_samples_leaf':min_samples_leaf,
    'n_estimators':n_estimators
}



rf = RandomForestClassifier()
rf_grid = RandomizedSearchCV(estimator=rf,param_distributions=params,n_iter=100,cv=3,verbose=2)
rf_grid.fit(x_train,y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END max_depth=10, max_features=auto, min_samples_leaf=5, min_samples_split=10, n_estimators=300; total time=   0.3s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=5, min_samples_split=10, n_estimators=300; total time=   0.3s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=5, min_samples_split=10, n_estimators=300; total time=   0.3s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   0.2s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   0.2s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   0.2s
[CV] END max_depth=5, max_features=log2, min_samples_leaf=5, min_samples_split=10, n_estimators=300; total time=   0.3s
[CV] END max_depth=5, max_features=log2, min_samples_leaf=5, min_samples_split=10, n_estimator