In [34]:
#importing different libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sbn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")   # ignore if any warnings are there

Data Preparation Source: The dataset is publically available on the Kaggle website, and it is from an ongoing cardiovascular study on residents of the town of Framingham, Massachusetts. The classification goal is to predict whether the patient has 10-year risk of future coronary heart disease (CHD).The dataset provides the patients’ information. It includes over 4,000 records and 15 attributes

In [35]:
#Importing the dataset and dropping the unwanted columns 
heart_df=pd.read_csv(r"C:\Jupyter\DSML\DSML\Logistic Regression\framingham.csv")
heart_df.drop(['education'],axis=1,inplace=True)
heart_df=pd.DataFrame(heart_df)

In [36]:
#Renaming the column name
heart_df.rename(columns={'male':'Sex_male'},inplace=True)
heart_df.head(4)

Unnamed: 0,Sex_male,age,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1


In [37]:
heart_df.shape

(4238, 15)

In [38]:
#Check for missing values
(heart_df.isnull().sum()/len(heart_df)*100).sort_values(ascending=False)

glucose            9.155262
BPMeds             1.250590
totChol            1.179802
cigsPerDay         0.684285
BMI                0.448325
heartRate          0.023596
TenYearCHD         0.000000
diaBP              0.000000
sysBP              0.000000
diabetes           0.000000
prevalentHyp       0.000000
prevalentStroke    0.000000
currentSmoker      0.000000
age                0.000000
Sex_male           0.000000
dtype: float64

In [39]:
#fill na with mean for BPMeds , totChol ,cigsPerDay,BMI  and heartRate  column.
heart_df.BPMeds.fillna(heart_df.BPMeds.mean(),inplace=True)
heart_df.totChol.fillna(heart_df.totChol.mean(),inplace=True)
heart_df.cigsPerDay.fillna(heart_df.cigsPerDay.mean(),inplace=True)
heart_df.BMI.fillna(heart_df.BMI.mean(),inplace=True)
heart_df.heartRate.fillna(heart_df.heartRate.mean(),inplace=True)
(heart_df.isnull().sum()/len(heart_df)*100).sort_values(ascending=False)

glucose            9.155262
TenYearCHD         0.000000
heartRate          0.000000
BMI                0.000000
diaBP              0.000000
sysBP              0.000000
totChol            0.000000
diabetes           0.000000
prevalentHyp       0.000000
prevalentStroke    0.000000
BPMeds             0.000000
cigsPerDay         0.000000
currentSmoker      0.000000
age                0.000000
Sex_male           0.000000
dtype: float64

In [40]:
heart_df = heart_df.fillna(method='ffill')
(heart_df.isnull().sum()/len(heart_df)*100).sort_values(ascending=False)

TenYearCHD         0.0
glucose            0.0
heartRate          0.0
BMI                0.0
diaBP              0.0
sysBP              0.0
totChol            0.0
diabetes           0.0
prevalentHyp       0.0
prevalentStroke    0.0
BPMeds             0.0
cigsPerDay         0.0
currentSmoker      0.0
age                0.0
Sex_male           0.0
dtype: float64

In [41]:
heart_df.dropna(axis=0,inplace=True)
# Create correlation matrix
corr_matrix = heart_df.corrwith(heart_df.TenYearCHD).abs()
corr_matrix

Sex_male           0.088428
age                0.225256
currentSmoker      0.019456
cigsPerDay         0.057775
BPMeds             0.086774
prevalentStroke    0.061810
prevalentHyp       0.177603
diabetes           0.097317
totChol            0.081624
sysBP              0.216429
diaBP              0.145299
BMI                0.074680
heartRate          0.022898
glucose            0.113890
TenYearCHD         1.000000
dtype: float64

In [42]:
x=heart_df.drop('TenYearCHD',axis=1)
y=heart_df.TenYearCHD
#spiltting the dataset into train and test dataset
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.33,random_state=101)

In [43]:
model = LogisticRegression()
model.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [44]:
#predecting the value of y_pred
y_pred = model.predict(x_test)
#checking the accuracy score
accuracy_score(y_pred,y_test)

0.858470335954253

In [45]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.99      0.86      0.92      1379
           1       0.05      0.50      0.09        20

   micro avg       0.86      0.86      0.86      1399
   macro avg       0.52      0.68      0.51      1399
weighted avg       0.98      0.86      0.91      1399



In [46]:
print(confusion_matrix(y_pred,y_test))

[[1191  188]
 [  10   10]]
