<a href="https://colab.research.google.com/github/isabellecagorol/HTML-CSS-Responsive/blob/master/F_KNeighbors_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.impute import SimpleImputer

**Loading Data set**

In [None]:
df = pd.read_csv('healthcare-dataset-stroke-data.csv')

In [None]:
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [None]:
#We drop id columns since it's a unique identifier number
df.drop(columns='id', inplace=True)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [None]:
df.shape

(5110, 12)

Handling Missing Values

In [None]:
df.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

We have 201 samples with missing values in the BMI column, which is about 4% of the data. To improve our results, we are choosing to remove these samples.

In [None]:
#Missing values in BMI columns is about 4% , we drop them.
df.dropna(how='any', inplace=True)

In [None]:
cols = df.columns[:-1]
cols

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status'],
      dtype='object')

In [None]:
numeric_columns = ['age', 'bmi', 'avg_glucose_level']
categorical_columns = ['gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'smoking_status', 'stroke']

We count number of unique values in each categorical column, to change them with integer values. Here we use .unique() command.


In [None]:
columns_temp = ['gender', 'ever_married', 'work_type', 'smoking_status', 'Residence_type']

for col in columns_temp :
    print('column :', col)
    for index, unique in enumerate(df[col].unique()) :
        print(unique, ':', index)
    print('_'*45)

column : gender
Male : 0
Female : 1
Other : 2
_____________________________________________
column : ever_married
Yes : 0
No : 1
_____________________________________________
column : work_type
Private : 0
Self-employed : 1
Govt_job : 2
children : 3
Never_worked : 4
_____________________________________________
column : smoking_status
formerly smoked : 0
never smoked : 1
smokes : 2
Unknown : 3
_____________________________________________
column : Residence_type
Urban : 0
Rural : 1
_____________________________________________


In [None]:
# gender
data_2 = df.replace(
    {'gender' : {'Male' : 0, 'Female' : 1, 'Other' : 2}}
)

# ever_married
data_2 =  data_2.replace(
    {'ever_married' : {'Yes' : 0, 'No' : 1}}
)

# work_type
data_2 =  data_2.replace(
    {'work_type' : {'Private' : 0, 'Self-employed' : 1, 'Govt_job' : 2, 'children' : 3, 'Never_worked' : 4}}
)

# smoking_status
data_2 =  data_2.replace(
    {'smoking_status' : {'formerly smoked' : 0, 'never smoked' : 1, 'smokes' : 2, 'Unknown' : 3}}
)

# Residence_type
data_2 =  data_2.replace(
    {'Residence_type' : {'Urban' : 0, 'Rural' : 1}}
)

In [None]:
data_2.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,0,67.0,0,1,0,0,0,228.69,36.6,0,1
2,31112,0,80.0,0,1,0,0,1,105.92,32.5,1,1
3,60182,1,49.0,0,0,0,0,0,171.23,34.4,2,1
4,1665,1,79.0,1,0,0,1,1,174.12,24.0,1,1
5,56669,0,81.0,0,0,0,0,0,186.21,29.0,0,1


Define features (X) and the target variable (y)

In [None]:
X_temp = data_2.drop(columns='stroke')
y = data_2.stroke

In [None]:
scaler = MinMaxScaler().fit_transform(X_temp)
X = pd.DataFrame(scaler, columns=X_temp.columns)
X.describe()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
count,4909.0,4909.0,4909.0,4909.0,4909.0,4909.0,4909.0,4909.0,4909.0,4909.0,4909.0
mean,0.507628,0.295274,0.522282,0.091872,0.049501,0.347321,0.210634,0.492768,0.231674,0.212981,0.527942
std,0.288145,0.246098,0.275331,0.288875,0.216934,0.476167,0.281014,0.499999,0.20508,0.089966,0.363382
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.254285,0.0,0.304199,0.0,0.0,0.0,0.0,0.0,0.10133,0.151203,0.333333
50%,0.51509,0.5,0.536133,0.0,0.0,0.0,0.0,0.0,0.168775,0.203895,0.333333
75%,0.756804,0.5,0.731445,0.0,0.0,1.0,0.5,1.0,0.269827,0.261168,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# Splitting X and y to train & test dataset. 70/30 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.3)

Fixing Imbalanced dataset using SMOTE

In [None]:
import imblearn
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

over = SMOTE(sampling_strategy=0.3)
under = RandomUnderSampler(sampling_strategy=0.7)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
X, y = pipeline.fit_resample(X, y)

In [None]:
print(f"ٌRecords Duplicates: {df.duplicated().sum()}")
print(f"No. of records with stroke: {y.value_counts()[1]}")
print(f"No. of records without stroke: {y.value_counts()[0]}")

ٌRecords Duplicates: 0
No. of records with stroke: 1410
No. of records without stroke: 2014


KNeighbors Classifier

In [None]:
# Create a random forest classifier object
knn = KNeighborsClassifier(n_neighbors = 1)

In [None]:
# Train the model
knn.fit(X_train, y_train)

In [None]:
# Predict on the test data
y_pred_knn = knn.predict(X_test)

In [None]:
# Calculate accuracy
accuracy_using_knn = round(accuracy_score(y_test, y_pred_knn)*100, 2)
print("Model accuracy using KNN classifier: ", accuracy_using_knn, "%")

Model accuracy using KNN classifier:  92.67 %


Evaluate using 10 fold cross validation

In [None]:
from sklearn.model_selection import cross_val_score, cross_val_predict

#Perform 10-fold cross-validation
cv_scores_knn = cross_val_score(knn, X, y, cv=10)

#Print the cross-validation scores
print("Cross-validation scores:", cv_scores_knn)

Cross-validation scores: [0.93877551 0.88338192 0.90379009 0.9212828  0.90350877 0.91520468
 0.90643275 0.9122807  0.93567251 0.90935673]


In [None]:
# Make predictions using cross-validated models
y_pred_cv = cross_val_predict(knn, X, y, cv=10)

# Calculate performance metrics
accuracy = accuracy_score(y, y_pred_cv)
precision = precision_score(y, y_pred_cv)
recall = recall_score(y, y_pred_cv)
conf_matrix = confusion_matrix(y, y_pred_cv)
tn, fp, fn, tp = conf_matrix.ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
f1 = f1_score(y, y_pred_cv)

In [None]:
from sklearn import metrics

# Print the results
print("Cross-Validation Mean Accuracy:", round(cv_scores_knn.mean()*100,3),'%')
print("Overall Accuracy:", round(accuracy*100,3),'%')
print("Precision:", round(precision*100,3),'%')
print("Recall:", round(recall*100,3),'%')
print("Specificity:", round(specificity*100,3),'%')
print("Sensitivity:", round(sensitivity*100,3),'%')
print("F1 Score:", round(f1*100,3),'%')

# Print the confusion matrix
print("\nConfusion Matrix:")
print(conf_matrix)

cr = metrics.classification_report(y_test, y_pred_knn)
print(cr)

Cross-Validation Mean Accuracy: 91.297 %
Overall Accuracy: 91.998 %
Precision: 85.236 %
Recall: 97.447 %
Specificity: 88.183 %
Sensitivity: 97.447 %
F1 Score: 90.933 %

Confusion Matrix:
[[1776  238]
 [  36 1374]]
              precision    recall  f1-score   support

           0       0.97      0.96      0.96      1418
           1       0.12      0.15      0.13        55

    accuracy                           0.93      1473
   macro avg       0.54      0.55      0.55      1473
weighted avg       0.93      0.93      0.93      1473

