# Scikit-learn

In [1]:
# import the packages that we'd work with.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn import neighbors
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

In [2]:
# Read the heart-disease CSV into a DataFrame and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# only runs where that exact file exists.
HEART_CSV = r"c:\Users\USER\Desktop\MyDatasets\heart.csv"
df = pd.read_csv(HEART_CSV)
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


# Preparing the data.
1. First we replace categorical data with numeric values

In [15]:
# Integer code assigned to each category of every text-valued column.
CATEGORY_CODES = {
    'Sex': {'F': 0, 'M': 1},
    'ChestPainType': {'ATA': 0, 'NAP': 1, 'ASY': 2, 'TA': 3},
    'RestingECG': {'Normal': 0, 'ST': 1, 'LVH': 2},
    'ExerciseAngina': {'N': 0, 'Y': 1},
    'ST_Slope': {'Up': 0, 'Flat': 1, 'Down': 2},
}

# Replace on the frame itself and rebind `df`. Calling
# `df.<col>.replace(..., inplace=True)` acts on a temporary Series, which
# pandas warns about and which stops updating `df` under Copy-on-Write.
# The explicit cast makes the encoded columns integer-typed and raises
# immediately if a category that is not listed above is still present.
# Re-running this cell is harmless: already-encoded values are left untouched.
df = df.replace(CATEGORY_CODES).astype({column: 'int64' for column in CATEGORY_CODES})
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,0,140,289,0,0,172,0,0.0,0,0
1,49,0,1,160,180,0,0,156,0,1.0,1,1
2,37,1,0,130,283,0,1,98,0,0.0,0,0
3,48,0,2,138,214,0,0,108,1,1.5,1,1
4,54,1,1,150,195,0,0,122,0,0.0,0,0


2. We convert the dataframe to a numpy array **and** split the dataset into train and test segments 

In [16]:
# Label column; every other column of `df` is used as an input feature.
TARGET_COLUMN = 'HeartDisease'

# Ask for a float array explicitly so that any column that is still
# non-numeric fails here with a clear error instead of producing an
# object-dtype array that breaks the estimators later on.
data = df.to_numpy(dtype=float)

# Locate the target by name rather than by a hard-coded position, so the split
# stays correct even if the column order of the CSV changes.
target_idx = df.columns.get_loc(TARGET_COLUMN)
X = np.delete(data, target_idx, axis=1)
Y = data[:, target_idx]

# 80/20 shuffled split with a fixed seed so the split is repeatable.
X_train, X_test, Y_train, Y_test = tts(X, Y, test_size=0.2, random_state=1, shuffle=True)

3. We scale our data, as most machine learning models do better when the input data is scaled or normalised.

In [17]:
# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both the training and the test split.
sc = StandardScaler().fit(X_train)
X_train_scaled = sc.transform(X_train)
X_test_scaled = sc.transform(X_test)

### Decision Tree Classifier

In [39]:
# Train a single seeded decision tree on the standardised training split and
# report its accuracy on the held-out test split as a whole-number percentage.
dtc = DecisionTreeClassifier(random_state=1).fit(X_train_scaled, Y_train)
dtc_test_score = dtc.score(X_test_scaled, Y_test)
accuracy = round(dtc_test_score * 100)
print(f'Accuracy of model: {accuracy}%')

Accuracy of model: 78%


### Let's try something cheeky with the K-Fold cross_validation
Perhaps we could improve the accuracy of the DecisionTreeClassifier

In [19]:
# Use a separate estimator for cross-validation. cross_val_score fits clones of
# the estimator it is given and leaves the original unfitted, so rebinding
# `dtc` here would replace the fitted tree from the previous cell with an
# unfitted one and break the later `dtc.predict(...)` call on a top-to-bottom run.
dtc_cv = DecisionTreeClassifier(random_state=1)

# Mean accuracy over 26 folds of the full (unscaled) dataset.
cv_scores = cross_val_score(dtc_cv, X, Y, cv=26)
accuracy = round((cv_scores.mean()) * 100)
print(f'Accuracy of model: {accuracy}%')

Accuracy of model: 79%


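A little improvement (78% → 79%), although the two figures are not measured the same way: the first is the accuracy on a single held-out test split, the second is the mean over 26 cross-validation folds of the unscaled data.
Generally, a basic decision tree isn't the strongest model choice.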

### RandomForestClassifier

In [44]:
# Seeded random forest of 35 trees, trained on the standardised training split
# and scored on the held-out test split.
rfc = RandomForestClassifier(n_estimators=35, random_state=1).fit(X_train_scaled, Y_train)
accuracy = round(100 * rfc.score(X_test_scaled, Y_test))
print(f'Accuracy of model: {accuracy}%')

Accuracy of model: 90%


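***Wow***!!! What a jump: an 11-percentage-point improvement in accuracy over the cross-validated decision tree (79% → 90%), and 12 points over the single-split tree (78%).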

### SVM

In [47]:
# Support-vector classifier with an RBF kernel and C=1.0, trained on the
# standardised training split; accuracy is measured on the test split.
svc = svm.SVC(kernel='rbf', C=1.0)
svc.fit(X_train_scaled, Y_train)
svc_test_score = svc.score(X_test_scaled, Y_test)
accuracy = round(svc_test_score * 100)
print(f'Accuracy of model: {accuracy}%')

Accuracy of model: 91%


The SVM model has upped accuracy by 1 percentage point, impressive.

### KNN (K-Nearest Neighbours)

In [50]:
# K-nearest-neighbours classifier using the 10 closest training points,
# trained on the standardised training split and scored on the test split.
knn = neighbors.KNeighborsClassifier(n_neighbors=10).fit(X_train_scaled, Y_train)
accuracy = round(100 * knn.score(X_test_scaled, Y_test))
print(f'Accuracy of model: {accuracy}%')

Accuracy of model: 90%


When working with **KNNs**, choosing the K value is a bit tricky, so it's best to create a loop that trains the model over different values of K. <br>
Then we can go ahead and select the K value with the highest accuracy.

In [23]:
# Try every K from 1 to 50 and record the test accuracy obtained with each.
# The candidate model gets its own name so the sweep does not overwrite the
# `knn` model fitted in the previous cell; otherwise later cells would
# silently use whichever K happened to be tried last (K=50).
# NOTE: these accuracies are measured on the test split, so a K chosen from
# them is tuned to that split and its reported accuracy is optimistic.
k_accuracies = {}
for k in range(1, 51):
    knn_k = neighbors.KNeighborsClassifier(n_neighbors=k)
    knn_k.fit(X_train_scaled, Y_train)
    k_accuracies[k] = round(knn_k.score(X_test_scaled, Y_test) * 100)
    print(f'Accuracy of model using K value of {k}: {k_accuracies[k]}%')

Accuracy of model using K value of 1: 83%
Accuracy of model using K value of 2: 80%
Accuracy of model using K value of 3: 89%
Accuracy of model using K value of 4: 88%
Accuracy of model using K value of 5: 89%
Accuracy of model using K value of 6: 89%
Accuracy of model using K value of 7: 89%
Accuracy of model using K value of 8: 89%
Accuracy of model using K value of 9: 90%
Accuracy of model using K value of 10: 90%
Accuracy of model using K value of 11: 90%
Accuracy of model using K value of 12: 91%
Accuracy of model using K value of 13: 91%
Accuracy of model using K value of 14: 90%
Accuracy of model using K value of 15: 90%
Accuracy of model using K value of 16: 90%
Accuracy of model using K value of 17: 91%
Accuracy of model using K value of 18: 90%
Accuracy of model using K value of 19: 90%
Accuracy of model using K value of 20: 89%
Accuracy of model using K value of 21: 90%
Accuracy of model using K value of 22: 90%
Accuracy of model using K value of 23: 91%
Accuracy of model us

So we can see that some K values give a better result, now we can select the K value with the highest accuracy. <br>
(K of 12, 13, 17, 23, 27-32)

### Naive Bayes
It's important to note that sklearn's MultinomialNB model doesn't accept negative feature values, which our StandardScaler-scaled features contain. <br>
So we'll have to use the MinMaxScaler instead of the StandardScaler.

In [53]:
# Rescale the features with a MinMaxScaler fitted on the training rows only,
# then apply that fitted scaler to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train_minmax = scaler.transform(X_train)
X_test_minmax = scaler.transform(X_test)

# Train the multinomial Naive Bayes classifier on the min-max scaled training
# split and report its accuracy on the matching test split.
nb = MultinomialNB().fit(X_train_minmax, Y_train)
accuracy = round(100 * nb.score(X_test_minmax, Y_test))
print(f'Accuracy of the model: {accuracy}%')

Accuracy of the model: 89%


Naive Bayes is 2 points back from the accuracy score to beat. <br>
But the NB model did alright.

### LogisticRegression
We've seen the performance of all these fancy models.<br>
But this is a basic binary classification, so let's try the LogisticRegression model, which is a rather simple approach to this type of classification.

In [55]:
# Logistic regression with library-default settings, trained on the
# standardised training split and scored on the held-out test split.
lr = LogisticRegression().fit(X_train_scaled, Y_train)
lr_test_score = lr.score(X_test_scaled, Y_test)
accuracy = round(lr_test_score * 100)
print(f'Accuracy of the model: {accuracy}%')

Accuracy of the model: 88%


I've made up two samples of data:<br>
the first sample is a heart-disease-positive patient and <br>
the second is a heart-disease-negative patient.<br>
Let's see how these models do at predicting that.

In [57]:
# Two made-up patients, one row each, already standardised by the author
# (presumably with the StandardScaler `sc` fitted above; confirm).
patient_data = np.array([[-0.5789,  0.5220,  0.6349, -0.9879, -1.7750,  1.8279, -0.7600, -0.3676,
         -0.8088, -0.8119,  0.6583], [-1.3316,  0.5216,  0.6352, -1.1563,  0.4743, -0.5473, -0.7599,  0.1890,
         -0.8082, -0.8111, -1.0494]])

# The Naive Bayes model was trained on MinMax-scaled features, not standardised
# ones, so undo the standardisation and re-scale the samples with the same
# MinMaxScaler (`scaler`) before giving them to `nb`.
patient_data_minmax = scaler.transform(sc.inverse_transform(patient_data))


def report_predictions(title, model, samples):
    """Print a heading followed by one verdict line per sample.

    Parameters
    ----------
    title : str
        Heading printed before the verdicts.
    model : fitted classifier
        Any object whose ``predict`` returns one label per row of ``samples``.
    samples : array of shape (n_samples, n_features)
        Inputs in the same scale the model was trained on.
    """
    print(title)
    # Patients are numbered from 1 in the printed output.
    for number, label in enumerate(model.predict(samples), start=1):
        if label == 1:
            print(f'Patient {number} has a heart disease')
        else:
            print(f'Patient {number} does not have a heart disease')


# (heading, fitted model, samples in the scale that model was trained on).
# Predicting straight from these arrays also avoids overwriting the
# module-level `data` array with a single-row sample.
model_reports = [
    ('**Decision Tree Classifier**', dtc, patient_data),
    ('\n**Random Forest Classifier**', rfc, patient_data),
    ('\n**SVM**', svc, patient_data),
    ('\n**KNN**', knn, patient_data),
    ('\n**Naive Bayes**', nb, patient_data_minmax),
    ('\n**Logistic Regression**', lr, patient_data),
]
for title, model, samples in model_reports:
    report_predictions(title, model, samples)

**Decision Tree Classifier**
Patient 1 has a heart disease
Patient 2 does not have a heart disease

**Random Forest Classifier**
Patient 1 has a heart disease
Patient 2 does not have a heart disease

**SVM**
Patient 1 has a heart disease
Patient 2 does not have a heart disease

**KNN**
Patient 1 has a heart disease
Patient 2 does not have a heart disease

**Naive Bayes**
Patient 1 has a heart disease
Patient 2 does not have a heart disease

**Logistic Regression**
Patient 1 has a heart disease
Patient 2 does not have a heart disease


All the models did well in predicting the two samples correctly, even the <br>
Decision Tree Classifier with 78% accuracy, still managed to come through.

In [None]:
# ifunanyaScript