## Importing the libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score

## Importing the dataset

In [2]:
dataset = pd.read_csv('heart_disease.csv')

## Cleaning the Data

In [3]:
#Accessing all the rows that contain missing data ('?'s)
missing_rows = pd.DataFrame(columns = dataset.columns)

for column in dataset.columns:
  #If column is a string type
  if (dataset[column].dtype == 'object'):
    missing_rows = missing_rows.append(dataset[dataset[column].str.contains('\?', na = False)])

missing_rows = missing_rows.drop_duplicates()

  missing_rows = missing_rows.append(dataset[dataset[column].str.contains('\?', na = False)])
  missing_rows = missing_rows.append(dataset[dataset[column].str.contains('\?', na = False)])
  missing_rows = missing_rows.append(dataset[dataset[column].str.contains('\?', na = False)])
  missing_rows = missing_rows.append(dataset[dataset[column].str.contains('\?', na = False)])
  missing_rows = missing_rows.append(dataset[dataset[column].str.contains('\?', na = False)])
  missing_rows = missing_rows.append(dataset[dataset[column].str.contains('\?', na = False)])
  missing_rows = missing_rows.append(dataset[dataset[column].str.contains('\?', na = False)])
  missing_rows = missing_rows.append(dataset[dataset[column].str.contains('\?', na = False)])
  missing_rows = missing_rows.append(dataset[dataset[column].str.contains('\?', na = False)])
  missing_rows = missing_rows.append(dataset[dataset[column].str.contains('\?', na = False)])


In [4]:
print(missing_rows)

    Age   Sex      CP TrestBPS Chol   FBS RestECG Thalac Exang  Oldpeak Slope  \
143  52  male  notang      138  223   fal    norm    169   fal      0.0    up   
157  38  male  notang      138  175   fal    norm    173   fal      0.0    up   
165  43  male  asympt      132  247  TRUE     hyp    143  TRUE      0.1  flat   
250  58  male  abnang      125  220   fal    norm    144   fal      0.4  flat   
82   53   fem  notang      128  216   fal     hyp    115   fal      0.0    up   
198  52  male  asympt      128  204  TRUE    norm    156  TRUE      1.0  flat   

    Ca  Thal   Num Unnamed: 14  
143  ?  norm  buff           H  
157  ?  norm  buff           H  
165  ?   rev  sick          S1  
250  ?   rev  buff           H  
82   0     ?  buff           H  
198  0     ?  sick          S2  


In [5]:
#Finding the counts of the CA column to find most common value
Ca_count = dataset['Ca'].value_counts()
print(Ca_count)

0    175
1     65
2     38
3     20
?      5
Name: Ca, dtype: int64


In [6]:
#Replacing ? Values with 0 (most common)
dataset['Ca'] = dataset['Ca'].replace('?', '0')

In [7]:
#Finding the counts of Thal column to find most common value
Thal_count = dataset['Thal'].value_counts()
print(Thal_count)

norm    166
rev     117
fix      18
?         2
Name: Thal, dtype: int64


In [8]:
#Replacing ? Values with norm(most common)
dataset['Thal'] = dataset['Thal'].replace('?', 'norm')

## Making the Matrix of Features and Dependent Variable

In [9]:
X = dataset.iloc[:, :-2].values
y = dataset.iloc[:, -2].values

In [10]:
print(y)

['buff' 'sick' 'sick' 'buff' 'buff' 'buff' 'sick' 'buff' 'sick' 'sick'
 'buff' 'buff' 'sick' 'buff' 'buff' 'buff' 'buff' 'sick' 'sick' 'sick'
 'buff' 'buff' 'buff' 'buff' 'sick' 'buff' 'sick' 'sick' 'buff' 'buff'
 'buff' 'sick' 'sick' 'sick' 'buff' 'sick' 'buff' 'buff' 'buff' 'sick'
 'sick' 'buff' 'sick' 'buff' 'buff' 'buff' 'buff' 'sick' 'buff' 'sick'
 'sick' 'sick' 'sick' 'buff' 'buff' 'sick' 'buff' 'sick' 'buff' 'sick'
 'sick' 'sick' 'buff' 'sick' 'sick' 'buff' 'sick' 'sick' 'sick' 'sick'
 'buff' 'sick' 'buff' 'buff' 'sick' 'buff' 'buff' 'buff' 'sick' 'buff'
 'buff' 'buff' 'buff' 'buff' 'buff' 'buff' 'sick' 'buff' 'buff' 'buff'
 'sick' 'sick' 'sick' 'buff' 'buff' 'buff' 'sick' 'sick' 'sick' 'sick'
 'buff' 'sick' 'sick' 'buff' 'buff' 'buff' 'sick' 'sick' 'sick' 'sick'
 'buff' 'sick' 'buff' 'buff' 'buff' 'buff' 'sick' 'sick' 'sick' 'buff'
 'buff' 'sick' 'buff' 'sick' 'buff' 'sick' 'sick' 'buff' 'buff' 'buff'
 'buff' 'buff' 'buff' 'sick' 'sick' 'sick' 'sick' 'sick' 'sick' 'buff'
 'sick

In [11]:
print(X)

[[63 'male' 'angina' ... 'down' '0' 'fix']
 [67 'male' 'asympt' ... 'flat' '3' 'norm']
 [67 'male' 'asympt' ... 'flat' '2' 'rev']
 ...
 [49 'male' 'notang' ... 'up' '3' 'norm']
 [74 'fem' 'abnang' ... 'up' '1' 'norm']
 [54 'fem' 'notang' ... 'up' '1' 'norm']]


## Encoding the Dependent Variable 'Sick' (1) or 'Buff' (0)

In [12]:
le = LabelEncoder()
y = le.fit_transform(y)

In [13]:
print(y)

[0 1 1 0 0 0 1 0 1 1 0 0 1 0 0 0 0 1 1 1 0 0 0 0 1 0 1 1 0 0 0 1 1 1 0 1 0
 0 0 1 1 0 1 0 0 0 0 1 0 1 1 1 1 0 0 1 0 1 0 1 1 1 0 1 1 0 1 1 1 1 0 1 0 0
 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 1 1 1 1 0 1 1 0 0 0 1 1 1 1 0
 1 0 0 0 0 1 1 1 0 0 1 0 1 0 1 1 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 0 0 1 0 1
 1 0 1 0 0 1 1 0 0 0 0 0 1 1 1 0 1 1 1 0 1 0 0 0 1 0 0 0 0 0 1 1 1 0 1 0 1
 0 1 1 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 1 0 1 1 1 1 0 0 0 0 0 1 0 1 1 1 1 0 0
 1 0 0 0 0 0 0 0 1 1 1 0 1 0 1 0 1 0 0 0 1 0 1 0 1 0 1 1 0 0 1 1 1 0 1 1 1
 1 1 1 0 1 0 0 1 0 0 1 0 0 0 0 0 1 0 1 1 1 0 1 1 0 0 0 0 0 1 0 0 1 1 0 0 1
 0 0 0 0 1 0 0]


One Hot Encoding the Categorical Independent Variables 'Gender', 'Chest Pain Type', 'Fasting Blood Sugar', 'resting electrocardiographic results', 'exercise induced angina', 'Slope', 'Thal'

In [14]:
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1,2,5,6,8,10,12])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [15]:
print(X)

[[0.0 1.0 0.0 ... 150 2.3 '0']
 [0.0 1.0 0.0 ... 108 1.5 '3']
 [0.0 1.0 0.0 ... 129 2.6 '2']
 ...
 [0.0 1.0 0.0 ... 126 0.8 '3']
 [1.0 0.0 1.0 ... 121 0.2 '1']
 [1.0 0.0 0.0 ... 163 0.0 '1']]


## Splitting Data into Training and Testing Set

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [17]:
print(X_train)

[[0.0 1.0 0.0 ... 111 0.8 '0']
 [0.0 1.0 0.0 ... 166 0.5 '0']
 [0.0 1.0 0.0 ... 147 3.6 '0']
 ...
 [0.0 1.0 1.0 ... 103 1.4 '1']
 [0.0 1.0 0.0 ... 153 0.0 '1']
 [0.0 1.0 0.0 ... 125 0.0 '0']]


In [18]:
len(X_test)

61

## Feature Scaling

In [19]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Training the Logistic Regression model on the Training set

In [20]:
classifier = LogisticRegression(penalty = 'l2', C = 0.5, solver = 'lbfgs', random_state = 0)
classifier.fit(X_train, y_train)

## Predicting the Test set results

In [21]:
print(y_test)

[0 0 0 0 1 0 0 1 0 1 1 1 1 1 0 1 1 1 1 0 0 0 0 0 1 1 1 0 1 1 0 0 1 1 0 0 0
 0 1 0 0 1 1 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0 1 1 0]


In [22]:
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[0 0]
 [1 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 1]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]]


## Making a single prediction

In [23]:
X_test[22]

array([ 1.50674161, -1.50674161, -0.46484449, -0.29189346, -0.93593215,
        1.61614984, -0.41119597,  0.41119597, -0.09128709, -1.01666781,
        1.03362279, -0.68964466,  0.68964466, -0.26607604,  1.10461916,
       -0.96747093, -0.26607604,  0.91287093, -0.79695366, -1.33350242,
       -0.69510349, -0.73018809,  1.00684639, -0.91231127, -0.6945881 ])

In [24]:
y_test[22]

0

In [25]:
print(classifier.predict([[ 1.50674161, -1.50674161, -0.46484449, -0.29189346, -0.93593215,
        1.61614984, -0.41119597,  0.41119597, -0.09128709, -1.01666781,
        1.03362279, -0.68964466,  0.68964466, -0.26607604,  1.10461916,
       -0.96747093, -0.26607604,  0.91287093, -0.79695366, -1.33350242,
       -0.69510349, -0.73018809,  1.00684639, -0.91231127, -0.6945881]]))

[0]


## Making the Confusion Matrix

In [26]:
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[31  3]
 [ 2 25]]


0.9180327868852459

## Computing the accuracy with k-Fold Cross Validation

In [27]:
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 82.68 %
Standard Deviation: 5.96 %
