In [1]:
# Import all required liberaries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Import dataset in dataframe
df=pd.read_csv('heart.csv')

In [3]:
# look on variables
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
#Checking for  Null Values 
df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [5]:
# Dependent variables  counts
df['target'].value_counts()

1    165
0    138
Name: target, dtype: int64

In [6]:
# cross check for duplicates
df.duplicated().any()

True

In [7]:
df.shape

(303, 14)

In [8]:
#  drop duplicates and cross check for it
df.drop_duplicates(inplace=True)
df.shape

In [10]:
# Check Types of data
df.dtypes

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object

In [11]:
# create dummies variables for Categorical Variables 
pd.get_dummies(df,columns=['cp','fbs','restecg','exang','slope','ca','thal'],drop_first=True)

Unnamed: 0,age,sex,trestbps,chol,thalach,oldpeak,target,cp_1,cp_2,cp_3,...,exang_1,slope_1,slope_2,ca_1,ca_2,ca_3,ca_4,thal_1,thal_2,thal_3
0,63,1,145,233,150,2.3,1,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,37,1,130,250,187,3.5,1,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2,41,0,130,204,172,1.4,1,1,0,0,...,0,0,1,0,0,0,0,0,1,0
3,56,1,120,236,178,0.8,1,1,0,0,...,0,0,1,0,0,0,0,0,1,0
4,57,0,120,354,163,0.6,1,0,0,0,...,1,0,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,140,241,123,0.2,0,0,0,0,...,1,1,0,0,0,0,0,0,0,1
299,45,1,110,264,132,1.2,0,0,0,1,...,0,1,0,0,0,0,0,0,0,1
300,68,1,144,193,141,3.4,0,0,0,0,...,0,1,0,0,1,0,0,0,0,1
301,57,1,130,131,115,1.2,0,0,0,0,...,1,1,0,1,0,0,0,0,0,1


In [12]:
# Select all Independent variables and dependent variables seperately
X=df.drop(['target'],axis=1)
y=df['target']

In [13]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 0)

In [14]:
# scale down Indpendent Features
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train,['age','trestbps','chol','thalach','oldpeak'])
X_test = sc.transform(X_test,['age','trestbps','chol','thalach','oldpeak'])

In [15]:
# use Logistic Regression Classifier for Binomial Classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2','l1']
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.857792 using {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}


In [16]:
# take predicion
Y_pred = grid_result.predict(X_test)

In [17]:
# draw Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, Y_pred)
print(cm)

In [18]:
# Check Accuracy Score
from sklearn.metrics import accuracy_score
accuracy_score(y_test, Y_pred)

0.8241758241758241

[[33 12]
 [ 4 42]]


In [20]:
# classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.89      0.73      0.80        45
           1       0.78      0.91      0.84        46

    accuracy                           0.82        91
   macro avg       0.83      0.82      0.82        91
weighted avg       0.83      0.82      0.82        91



In [21]:
import pickle
file = open('HeartPrediction.pkl', 'wb')
pickle.dump(grid_result, file)