In [97]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import StandardScaler

In [98]:
# Absolute local path to heart.csv.
# NOTE(review): this path is machine-specific — consider a path relative to a
# configurable data directory so the notebook runs elsewhere.
heart_csv_path = r"c:\Users\USER\Desktop\MyDatasets\heart.csv"

# Read the dataset into a DataFrame and preview the first ten rows.
df = pd.read_csv(heart_csv_path)
df.head(10)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
5,39,M,NAP,120,339,0,Normal,170,N,0.0,Up,0
6,45,F,ATA,130,237,0,Normal,170,N,0.0,Up,0
7,54,M,ATA,110,208,0,Normal,142,N,0.0,Up,0
8,37,M,ASY,140,207,0,Normal,130,Y,1.5,Flat,1
9,48,F,ATA,120,284,0,Normal,120,N,0.0,Up,0


# Preparing the data.
1. First we replace categorical data with numeric values

In [99]:
# Integer code assigned to each category label, per categorical column.
# (Same label -> code pairs as before, just written as explicit mappings.)
category_codes = {
    "Sex": {"F": 0, "M": 1},
    "ChestPainType": {"ATA": 0, "NAP": 1, "ASY": 2, "TA": 3},
    "RestingECG": {"Normal": 0, "ST": 1, "LVH": 2},
    "ExerciseAngina": {"N": 0, "Y": 1},
    "ST_Slope": {"Up": 0, "Flat": 1, "Down": 2},
}

# Build the encoded columns and assign them back onto the frame as whole
# columns. The previous `df.<col>.replace(..., inplace=True)` form is a chained
# in-place update: pandas 2.2 warns about it and, with Copy-on-Write enabled,
# it no longer modifies `df` at all.
#
# * Columns that are already numeric are skipped, so re-running this cell is a
#   no-op instead of corrupting the data.
# * A label missing from the mapping becomes NaN under `.map`, and the
#   `.astype("int64")` cast then raises, so unexpected categories fail loudly
#   here rather than leaking strings into the model input.
encoded_columns = {
    column: df[column].map(codes).astype("int64")
    for column, codes in category_codes.items()
    if not pd.api.types.is_numeric_dtype(df[column])
}
df = df.assign(**encoded_columns)
df.head(10)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,0,140,289,0,0,172,0,0.0,0,0
1,49,0,1,160,180,0,0,156,0,1.0,1,1
2,37,1,0,130,283,0,1,98,0,0.0,0,0
3,48,0,2,138,214,0,0,108,1,1.5,1,1
4,54,1,1,150,195,0,0,122,0,0.0,0,0
5,39,1,1,120,339,0,0,170,0,0.0,0,0
6,45,0,0,130,237,0,0,170,0,0.0,0,0
7,54,1,0,110,208,0,0,142,0,0.0,0,0
8,37,1,2,140,207,0,0,130,1,1.5,1,1
9,48,0,0,120,284,0,0,120,0,0.0,0,0


2. We convert the dataframe to a numpy array **and** split the dataset into train and test segments 

In [113]:
# Name of the label column; every other column is used as a feature.
TARGET_COLUMN = "HeartDisease"

# Full frame as an array, kept for any later cell that still reads `data`.
data = df.to_numpy()

# Select features and target by column name rather than by position, so an
# added, removed or reordered column cannot silently shift which values are
# treated as the label.
X = df.drop(columns=TARGET_COLUMN).to_numpy()
Y = df[TARGET_COLUMN].to_numpy()  # integer class labels (0 / 1)

# Hold out 20% of the rows for testing; fixed seed for a reproducible shuffle.
X_train, X_test, Y_train, Y_test = tts(X, Y, test_size=0.2, random_state=1, shuffle=True)

3. We scale our data, as many machine learning models (for example the SVM below) do better when the input data is scaled or normalised; tree-based models are largely insensitive to feature scaling.

In [114]:
# Fit the scaler on the training rows only, then apply that same fitted
# transform to both splits so no test-set statistics leak into the scaling.
sc = StandardScaler()
sc.fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

# Decision Tree Classifier

In [125]:
from sklearn.tree import DecisionTreeClassifier

# Train a single decision tree (fixed seed) on the scaled training split.
clf = DecisionTreeClassifier(random_state=1).fit(X_train, Y_train)

# Accuracy on the held-out split, reported as a whole-number percentage.
accuracy = round(100 * clf.score(X_test, Y_test))
print(f'{accuracy}%')

78%


### Let's try something cheeky with the K-Fold cross_validation
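Cross-validation does not change the model itself, but averaging the score over many train/test splits should give us a more reliable estimate of the DecisionTreeClassifier's accuracy than a single split.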

In [152]:
from sklearn.model_selection import cross_val_score

# Fresh, unfitted tree: cross_val_score does the fitting itself, fold by fold.
clf = DecisionTreeClassifier(random_state=1)

# 26-fold cross-validation over the full (unscaled) X and Y; the reported
# figure is the mean fold accuracy as a whole-number percentage.
cv_scores = cross_val_score(estimator=clf, X=X, y=Y, cv=26)
accuracy = round(np.mean(cv_scores) * 100)
print(f'{accuracy}%')

79%


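A slightly higher estimate (79% vs. 78%); the difference is small enough that it may simply reflect the different evaluation splits rather than a better model.
Generally, a basic decision tree isn't the most effective model choice.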

### RandomForestClassifier

In [153]:
from sklearn.ensemble import RandomForestClassifier

# Ensemble of 35 trees (fixed seed) trained on the scaled training split.
clf = RandomForestClassifier(n_estimators=35, random_state=1).fit(X_train, Y_train)

# Accuracy on the held-out split, reported as a whole-number percentage.
accuracy = round(100 * clf.score(X_test, Y_test))
print(f'{accuracy}%')

90%


***Wow***!!! What a jump: roughly 11 percentage points higher accuracy than the decision tree (79% → 90%).

Now we could go ahead and use K-Fold cross-validation to check how well this score holds up across different splits; there might also be room for more improvement.

### SVM