# Machine Learning | Project 1 | 2021/22

- João Santos, 76912
- João Carvalho, 106310

[Dataset](https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset?resource=download)

Useful code:
- https://www.kaggle.com/code/noahdegunst/stroke-analysis

## Model Implementation

## Imports and Data Load

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split 

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
data = pd.read_csv('healthcare-dataset-stroke-data.csv')

## Cleaning the data

In [3]:
# remove 'id' column
data = data.drop(columns=['id'])

# convert age to integer
data['age'] = data['age'].astype(np.int64)

# make the unknown BMI's equal the mean of the respective genders
male_data = data[data['gender'] == 'Male']
female_data = data[data['gender'] == 'Female']

male_data['bmi'].fillna(male_data['bmi'].mean(), inplace=True)
female_data['bmi'].fillna(female_data['bmi'].mean(), inplace=True)

data = pd.concat([male_data, female_data])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


Shuffle the data

In [4]:
data = data.sample(frac=1).reset_index(drop=True)

In [5]:
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,41,0,0,No,Govt_job,Rural,74.81,39.7,smokes,0
1,Male,47,0,0,Yes,Private,Rural,75.3,25.0,formerly smoked,0
2,Female,37,0,0,Yes,Private,Urban,247.87,42.6,never smoked,0
3,Female,71,1,1,Yes,Private,Rural,221.24,24.2,Unknown,0
4,Female,48,0,0,Yes,Private,Rural,134.59,28.2,smokes,0


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5109 entries, 0 to 5108
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5109 non-null   object 
 1   age                5109 non-null   int64  
 2   hypertension       5109 non-null   int64  
 3   heart_disease      5109 non-null   int64  
 4   ever_married       5109 non-null   object 
 5   work_type          5109 non-null   object 
 6   Residence_type     5109 non-null   object 
 7   avg_glucose_level  5109 non-null   float64
 8   bmi                5109 non-null   float64
 9   smoking_status     5109 non-null   object 
 10  stroke             5109 non-null   int64  
dtypes: float64(2), int64(4), object(5)
memory usage: 439.2+ KB


## Preparing the data

Remove all *object* data types

In [7]:
from sklearn.preprocessing import LabelEncoder

# encode categorical variables
for col in ['gender','age', 'ever_married', 'Residence_type']:
    LE = LabelEncoder()
    data[col] = LE.fit_transform(data[col])

data = pd.get_dummies(data, columns=['work_type', 'smoking_status'], prefix=['work_type', 'smoking_status'])

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5109 entries, 0 to 5108
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   gender                          5109 non-null   int64  
 1   age                             5109 non-null   int64  
 2   hypertension                    5109 non-null   int64  
 3   heart_disease                   5109 non-null   int64  
 4   ever_married                    5109 non-null   int64  
 5   Residence_type                  5109 non-null   int64  
 6   avg_glucose_level               5109 non-null   float64
 7   bmi                             5109 non-null   float64
 8   stroke                          5109 non-null   int64  
 9   work_type_Govt_job              5109 non-null   uint8  
 10  work_type_Never_worked          5109 non-null   uint8  
 11  work_type_Private               5109 non-null   uint8  
 12  work_type_Self-employed         51

Normalization

In [8]:
for col in ['age', 'avg_glucose_level', 'bmi']:
    data[col] = data[col]/data[col].max()

Correlation

In [9]:
fig = px.imshow(data[['gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'Residence_type', 'avg_glucose_level', 'bmi', 'stroke']].corr(), color_continuous_scale='Blues')

fig.update_layout(height=800)
fig.show()

Separate the data that has already been shuffled

In [11]:
values = data.drop(columns = ['stroke'])
labels = data['stroke']

train_data = data[:int(len(data)*0.8)]
test_data = data[int(len(data)*0.8):]

Since there are way more patients with stroke than without, let's sample the train data so there are equal amounts

In [12]:
train_0 = train_data[train_data['stroke'] == 0]
train_1 = train_data[train_data['stroke'] == 1]

train_1 = train_1.sample(len(train_0), replace=True)

train_data = pd.concat([train_0, train_1], axis = 0)

Now drop bmi and gender from the data since they are not very influential on the stroke attribute

In [17]:
x_train = np.array(train_data.drop(columns=['bmi', 'gender', 'stroke']))
y_train = np.array(train_data['stroke'])

x_test = np.array(test_data.drop(columns=['bmi', 'gender', 'stroke']))
y_test = np.array(test_data['stroke'])

## Models

In [18]:
from sklearn.metrics import *

Random Forest Classifier

In [19]:
from sklearn.ensemble import RandomForestClassifier


Rclf= RandomForestClassifier()
Rclf.fit(x_train, y_train)

score = Rclf.predict(x_test)

print(classification_report(y_test, score))

              precision    recall  f1-score   support

           0       0.95      0.98      0.96       968
           1       0.17      0.07      0.10        54

    accuracy                           0.93      1022
   macro avg       0.56      0.53      0.53      1022
weighted avg       0.91      0.93      0.92      1022



Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression

log_clf = LogisticRegression(solver="lbfgs", max_iter=1000)
log_clf.fit(x_train,y_train)

score = log_clf.predict(x_test)
print(classification_report(y_test,score))

              precision    recall  f1-score   support

           0       0.99      0.71      0.83       968
           1       0.14      0.83      0.24        54

    accuracy                           0.72      1022
   macro avg       0.56      0.77      0.53      1022
weighted avg       0.94      0.72      0.79      1022



K Neighbors Classifier

In [21]:
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier(weights='distance', n_neighbors=4)
knn_clf.fit(x_train, y_train)

score = knn_clf.predict(x_test)
print(classification_report(y_test, score))

              precision    recall  f1-score   support

           0       0.96      0.92      0.94       968
           1       0.14      0.22      0.17        54

    accuracy                           0.89      1022
   macro avg       0.55      0.57      0.56      1022
weighted avg       0.91      0.89      0.90      1022



SVC

In [22]:
from sklearn.svm import SVC

svm_clf = SVC(gamma='auto')

svm_clf.fit(x_train, y_train)

score = svm_clf.predict(x_test)

print(classification_report(y_test,score))

              precision    recall  f1-score   support

           0       0.99      0.69      0.82       968
           1       0.14      0.87      0.24        54

    accuracy                           0.70      1022
   macro avg       0.56      0.78      0.53      1022
weighted avg       0.94      0.70      0.78      1022

