In [13]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder


In [3]:
df=pd.read_csv('Heart_Disease_Prediction.csv')

In [104]:
df.dtypes

Age                          int64
Sex                          int64
Chest pain type              int64
BP                           int64
Cholesterol                  int64
FBS over 120                 int64
EKG results                  int64
Max HR                       int64
Exercise angina              int64
ST depression              float64
Slope of ST                  int64
Number of vessels fluro      int64
Thallium                     int64
Heart Disease               object
dtype: object

In [105]:
190/270

0.7037037037037037

In [9]:
df['Heart Disease'].value_counts()

Heart Disease
Absence     150
Presence    120
Name: count, dtype: int64

In [8]:
df.select_dtypes(['int64','float64']).corr()

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium
Age,1.0,-0.08477,0.092851,0.296292,0.285092,0.125882,0.144758,-0.348452,0.091004,0.205487,0.175391,0.35995,0.117915
Sex,-0.08477,1.0,0.034636,-0.062693,-0.201647,0.04214,0.039253,-0.076101,0.180022,0.097412,0.050545,0.08683,0.391046
Chest pain type,0.092851,0.034636,1.0,-0.043196,0.090465,-0.098537,0.074325,-0.317682,0.35316,0.167244,0.1369,0.22589,0.262659
BP,0.296292,-0.062693,-0.043196,1.0,0.173019,0.155681,0.116157,-0.039136,0.082793,0.2228,0.142472,0.085697,0.132045
Cholesterol,0.285092,-0.201647,0.090465,0.173019,1.0,0.025186,0.167652,-0.018739,0.078243,0.027709,-0.005755,0.126541,0.028836
FBS over 120,0.125882,0.04214,-0.098537,0.155681,0.025186,1.0,0.053499,0.022494,-0.004107,-0.025538,0.044076,0.123774,0.049237
EKG results,0.144758,0.039253,0.074325,0.116157,0.167652,0.053499,1.0,-0.074628,0.095098,0.120034,0.160614,0.114368,0.007337
Max HR,-0.348452,-0.076101,-0.317682,-0.039136,-0.018739,0.022494,-0.074628,1.0,-0.380719,-0.349045,-0.386847,-0.265333,-0.253397
Exercise angina,0.091004,0.180022,0.35316,0.082793,0.078243,-0.004107,0.095098,-0.380719,1.0,0.274672,0.255908,0.153347,0.321449
ST depression,0.205487,0.097412,0.167244,0.2228,0.027709,-0.025538,0.120034,-0.349045,0.274672,1.0,0.609712,0.255005,0.324333


In [43]:
# Defining the features and the target

X = df.drop(columns='Heart Disease')
y_no_encode = df['Heart Disease']

label_encoder = LabelEncoder()
y=label_encoder.fit_transform(y_no_encode)


In [116]:
# Train-Test split

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((216, 13), (54, 13), (216,), (54,))

In [46]:
df.isnull().sum()

Age                        0
Sex                        0
Chest pain type            0
BP                         0
Cholesterol                0
FBS over 120               0
EKG results                0
Max HR                     0
Exercise angina            0
ST depression              0
Slope of ST                0
Number of vessels fluro    0
Thallium                   0
Heart Disease              0
dtype: int64

In [18]:
df.columns

Index(['Age', 'Sex', 'Chest pain type', 'BP', 'Cholesterol', 'FBS over 120',
       'EKG results', 'Max HR', 'Exercise angina', 'ST depression',
       'Slope of ST', 'Number of vessels fluro', 'Thallium', 'Heart Disease'],
      dtype='object')

In [47]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


# Assuming num_transformer is defined, e.g., a StandardScaler
num_transformer = StandardScaler()
# Define the ColumnTransformer
preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, ['Age', 'Sex', 'Chest pain type', 'BP', 'Cholesterol', 'FBS over 120',
       'EKG results', 'Max HR', 'Exercise angina', 'ST depression',
       'Slope of ST', 'Number of vessels fluro', 'Thallium'])
])

In [48]:
X_train_transformed = preprocessor.fit_transform(X_train)

print("Original training set")
display(X_train.head(3))

print("Preprocessed training set")
display(pd.DataFrame(X_train_transformed).head(3))

Original training set


Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium
148,58,1,3,132,224,0,2,173,0,3.2,1,2,7
93,54,1,4,110,206,0,2,108,1,0.0,2,1,3
82,58,1,4,150,270,0,2,111,1,0.8,1,0,7


Preprocessed training set


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.420539,0.685119,-0.183142,0.052168,-0.499285,-0.40161,0.995403,1.007659,-0.685119,1.812938,-0.916698,1.360844,1.204277
1,-0.01565,0.685119,0.857874,-1.169023,-0.883609,-0.40161,0.995403,-1.808892,1.459601,-0.896473,0.733359,0.322179,-0.864302
2,0.420539,0.685119,0.857874,1.051324,0.482876,-0.40161,0.995403,-1.678897,1.459601,-0.21912,-0.916698,-0.716487,1.204277


In [49]:
pd.DataFrame(
    X_train_transformed,
    columns=preprocessor.get_feature_names_out()
).head()

Unnamed: 0,num_transformer__Age,num_transformer__Sex,num_transformer__Chest pain type,num_transformer__BP,num_transformer__Cholesterol,num_transformer__FBS over 120,num_transformer__EKG results,num_transformer__Max HR,num_transformer__Exercise angina,num_transformer__ST depression,num_transformer__Slope of ST,num_transformer__Number of vessels fluro,num_transformer__Thallium
0,0.420539,0.685119,-0.183142,0.052168,-0.499285,-0.40161,0.995403,1.007659,-0.685119,1.812938,-0.916698,1.360844,1.204277
1,-0.01565,0.685119,0.857874,-1.169023,-0.883609,-0.40161,0.995403,-1.808892,1.459601,-0.896473,0.733359,0.322179,-0.864302
2,0.420539,0.685119,0.857874,1.051324,0.482876,-0.40161,0.995403,-1.678897,1.459601,-0.21912,-0.916698,-0.716487,1.204277
3,-1.869453,-1.459601,-0.183142,-0.613936,-0.691447,-0.40161,-1.014008,0.877665,-0.685119,-0.896473,-0.916698,-0.716487,-0.864302
4,-0.778981,0.685119,0.857874,-1.058006,-0.926312,-0.40161,-1.014008,-0.292287,-0.685119,-0.811804,-0.916698,-0.716487,-0.864302


In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

In [89]:
pipeline = make_pipeline(preprocessor, LogisticRegression())
pipeline

In [93]:
# Train Pipeline
pipeline.fit(X_train,y_train)

# Make predictions
pipeline.predict(X_test.iloc[0:1])

# Score model
pipeline.score(X_test,y_test)

0.8888888888888888

In [92]:
from sklearn.model_selection import cross_val_score

# Cross-validate Pipeline
cross_val_score(pipeline, X_train, y_train, cv=5, scoring='precision').mean()

0.82493961352657

In [99]:
from sklearn.neighbors import KNeighborsClassifier as KNC


In [100]:
pipeline = make_pipeline(preprocessor, KNC())
pipeline.fit(X_train, y_train)
pipeline.score(X_test,y_test)

0.9259259259259259

In [101]:
cross_val_score(pipeline, X_train, y_train, cv=5, scoring='precision').mean()

0.7605889724310777

In [113]:
pipeline.get_params()


{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(transformers=[('num_transformer', StandardScaler(),
                                    ['Age', 'Sex', 'Chest pain type', 'BP',
                                     'Cholesterol', 'FBS over 120', 'EKG results',
                                     'Max HR', 'Exercise angina', 'ST depression',
                                     'Slope of ST', 'Number of vessels fluro',
                                     'Thallium'])])),
  ('kneighborsclassifier', KNeighborsClassifier())],
 'verbose': False,
 'columntransformer': ColumnTransformer(transformers=[('num_transformer', StandardScaler(),
                                  ['Age', 'Sex', 'Chest pain type', 'BP',
                                   'Cholesterol', 'FBS over 120', 'EKG results',
                                   'Max HR', 'Exercise angina', 'ST depression',
                                   'Slope of ST', 'Number of vessels fluro',
                         

In [117]:
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(
    pipeline,
    param_grid={'kneighborsclassifier__n_neighbors':[1,2,3,4,5,6,7,8,9,10,11,12]
    },
    cv=5,
    scoring="precision")

grid_search.fit(X_train, y_train)

grid_search.best_params_

ValueError: Invalid parameter 'kneighborsclassifier' for estimator Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num_transformer',
                                                  StandardScaler(),
                                                  ['Age', 'Sex',
                                                   'Chest pain type', 'BP',
                                                   'Cholesterol',
                                                   'FBS over 120',
                                                   'EKG results', 'Max HR',
                                                   'Exercise angina',
                                                   'ST depression',
                                                   'Slope of ST',
                                                   'Number of vessels fluro',
                                                   'Thallium'])])),
                ('gaussiannb', GaussianNB())]). Valid parameters are: ['memory', 'steps', 'verbose'].

In [121]:
from sklearn.naive_bayes import GaussianNB

pipeline = make_pipeline(preprocessor, GaussianNB())
pipeline.fit(X_train, y_train)
pipeline.score(X_test,y_test, scoring='precision')

pipeline.get_params

<bound method Pipeline.get_params of Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num_transformer',
                                                  StandardScaler(),
                                                  ['Age', 'Sex',
                                                   'Chest pain type', 'BP',
                                                   'Cholesterol',
                                                   'FBS over 120',
                                                   'EKG results', 'Max HR',
                                                   'Exercise angina',
                                                   'ST depression',
                                                   'Slope of ST',
                                                   'Number of vessels fluro',
                                                   'Thallium'])])),
                ('gaussiannb', GaussianNB())])>