# Machine Learning | Project 1 | 2021/22

- João Santos, 76912
- João Carvalho, 106310

## Dataset

https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset?resource=download

## Useful code

https://www.kaggle.com/code/kaanboke/beginner-friendly-end-to-end-ml-project-enjoy

## Imports and Data Load

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

import plotly 
import plotly.express as px

from sklearn.metrics import mutual_info_score

In [2]:
# Read the stroke dataset and discard the `id` column in a single expression,
# so the frame is never mutated in place after it has been created.
data = pd.read_csv('healthcare-dataset-stroke-data.csv').drop(columns=['id'])

# Sanity check: dimensions first, then a preview of the first rows.
print(data.shape)
data.head(5)

(5110, 11)


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


## Data Analysis

In [3]:
# Feature groups reused by the analysis cells below.
# Columns treated as categorical (includes the 0/1 flags):
categorical = [
    'gender',
    'hypertension',
    'heart_disease',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]

# Columns treated as continuous numerical measurements:
numerical = [
    'age',
    'avg_glucose_level',
    'bmi',
]

In [4]:
# Check the class balance of the target: fraction of records per stroke label
stroke = data['stroke']
had_stroke = stroke.value_counts(normalize=True)

# The value_counts index holds the target labels; look them up explicitly by
# label (0 is printed as "No Stroke", 1 as "Stroke").
print(f'No Stroke: {had_stroke.loc[0]*100:.2f}%')
print(f'Stroke: {had_stroke.loc[1]*100:.2f}%')

No Stoke: 95.13%
Stoke: 4.87%


In [5]:
# Count of records per value of the target column, to visualise the class balance
fig = px.histogram(
    data,
    x="stroke",
    title='Stroke',
    width=400,
    height=400,
)
fig.show()

These ratios show that the dataset is heavily imbalanced: fewer than 5% of the records are stroke cases.

In [6]:
# Check for missing data
def missing(df):
    """Summarise missing values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with two columns: ``MissingAmount``
        (number of null entries) and ``MissingRatio`` (null entries divided
        by the number of rows). Rows are sorted by ``MissingAmount`` in
        descending order; tied columns keep their original column order.
    """
    # Build the null mask once and reuse it for both summary columns.
    null_mask = df.isnull()
    missing_number = null_mask.sum()
    # Every column of the mask has exactly len(df) entries, so the ratio is
    # simply amount / number of rows (0/0 yields NaN for an empty frame).
    missing_percent = missing_number / len(df)
    missing_values = pd.DataFrame({
        'MissingAmount': missing_number,
        'MissingRatio': missing_percent,
    })
    # The ratio is a monotonic function of the amount, so a single stable sort
    # orders both columns consistently without relying on index realignment.
    return missing_values.sort_values('MissingAmount', ascending=False, kind='mergesort')

# Display the per-column missing-value summary for the dataset
missing(data)

# TODO: deal with missing data (the summary shows `bmi` as the only column with nulls)

Unnamed: 0,MissingAmount,MissingRatio
bmi,201,0.039335
gender,0,0.0
age,0,0.0
hypertension,0,0.0
heart_disease,0,0.0
ever_married,0,0.0
work_type,0,0.0
Residence_type,0,0.0
avg_glucose_level,0,0.0
smoking_status,0,0.0


In [7]:
# Summary statistics (count, mean, std, quartiles, extremes) of the numerical features
data.loc[:, numerical].describe()

Unnamed: 0,age,avg_glucose_level,bmi
count,5110.0,5110.0,4909.0
mean,43.226614,106.147677,28.893237
std,22.612647,45.28356,7.854067
min,0.08,55.12,10.3
25%,25.0,77.245,23.5
50%,45.0,91.885,28.1
75%,61.0,114.09,33.1
max,82.0,271.74,97.6


In [8]:
# Skewness of each numerical feature (positive values indicate a longer right tail)
data.loc[:, numerical].skew()

age                 -0.137059
avg_glucose_level    1.572284
bmi                  1.055340
dtype: float64

In [9]:
# Distribution of every categorical feature, with bars split by stroke outcome.
# One figure is rendered per entry of `categorical` instead of a single
# hard-coded column.
for cat in categorical:
    fig = px.histogram(data, x=cat, title=cat, color="stroke")
    fig.show()

## Metric of importance

In [10]:
def cat_mut_inf(series):
    """Return the mutual information score between ``series`` and the target.

    The target is read from the notebook-level ``data['stroke']`` column, so
    ``series`` is expected to be aligned row-for-row with ``data``.
    """
    target = data['stroke']
    return mutual_info_score(series, target)

# Score every categorical feature against the target and rank them,
# highest mutual information first.
df_cat = (
    data[categorical]
    .apply(cat_mut_inf)
    .sort_values(ascending=False)
    .to_frame(name='mutual_info_score')
)
df_cat

Unnamed: 0,mutual_info_score
ever_married,0.00695
work_type,0.006825
hypertension,0.005976
heart_disease,0.005897
smoking_status,0.00275
Residence_type,0.00012
gender,5.1e-05


In [11]:
# Pairwise correlation between the numerical features
data.loc[:, numerical].corr()

Unnamed: 0,age,avg_glucose_level,bmi
age,1.0,0.238171,0.333398
avg_glucose_level,0.238171,1.0,0.175502
bmi,0.333398,0.175502,1.0


In [12]:
# Mean of each numerical feature, computed separately per stroke outcome
(
    data
    .groupby('stroke')[numerical]
    .mean()
)

Unnamed: 0_level_0,age,avg_glucose_level,bmi
stroke,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,41.971545,104.795513,28.823064
1,67.728193,132.544739,30.471292


In [13]:
# Correlation matrix of the numerical features together with the target column;
# the column list is built from `numerical` so it stays in sync with that definition.
data[numerical + ['stroke']].corr()

Unnamed: 0,age,avg_glucose_level,bmi,stroke
age,1.0,0.238171,0.333398,0.245257
avg_glucose_level,0.238171,1.0,0.175502,0.131945
bmi,0.333398,0.175502,1.0,0.042374
stroke,0.245257,0.131945,0.042374,1.0


### Scatter plots

In [16]:
# Columns plotted on each axis of the scatter plot
x = 'age'
y = 'avg_glucose_level'

# One point per record, coloured by stroke outcome.
fig = px.scatter(
    data,
    x=x,
    y=y,
    title=f'{x} and {y}',
    color='stroke',
    # hover_data takes column names; a DataFrame slice only worked because
    # iterating a DataFrame yields its column labels.
    hover_data=['stroke'],
)
fig.show()