In [34]:
import pandas as pd

In [35]:
# Read the raw stroke dataset from disk into a DataFrame.
stroke_df = pd.read_csv(filepath_or_buffer='healthcare_dataset_stroke_data.csv')

# Leave the frame as the cell's last expression so the notebook renders it.
stroke_df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


# Duplicates
There are no repeated ID values, so no duplicate patient records need to be removed.

In [36]:
# Confirm that every patient appears only once: the number of distinct IDs
# should equal the number of rows.
unique_id_count = stroke_df["id"].nunique()
row_count = stroke_df.shape[0]

print("# of unique IDs: ", unique_id_count)
print("# of rows: ", row_count)

# of unique IDs:  5110
# of rows:  5110


# Categorical Variables
## Gender
Most gender values are either 'Male' or 'Female'; however, there is one sample with a gender value of 'Other'.  A single sample is not sufficient for training a robust model, so I will remove the single 'Other' row.

## Smoking Status
About 30% of the patient records have an 'Unknown' smoking status.  This is too large a portion of the data to simply remove, so I will keep the 'Unknown' category as is for now.

## Stroke
This is the value we are trying to predict.  There is a high imbalance of the data.  Only a small percentage of the samples (~5%) have a positive stroke status.

In [37]:
# Show the frequency table of every categorical column.
# Columns stored as int64 or object are treated as categorical here; 'id' is
# only a row identifier, so it is left out.
categorical_columns = [
    name
    for name in stroke_df
    if name != 'id'
    and (stroke_df[name].dtype == 'int64' or stroke_df[name].dtype == 'object')
]

for name in categorical_columns:
    counts = stroke_df[name].value_counts()
    print(counts)
    print('\n')

Female    2994
Male      2115
Other        1
Name: gender, dtype: int64


0    4612
1     498
Name: hypertension, dtype: int64


0    4834
1     276
Name: heart_disease, dtype: int64


Yes    3353
No     1757
Name: ever_married, dtype: int64


Private          2925
Self-employed     819
children          687
Govt_job          657
Never_worked       22
Name: work_type, dtype: int64


Urban    2596
Rural    2514
Name: Residence_type, dtype: int64


never smoked       1892
Unknown            1544
formerly smoked     885
smokes              789
Name: smoking_status, dtype: int64


0    4861
1     249
Name: stroke, dtype: int64




# Continuous Variables
## BMI
BMI is the only continuous variable with missing values.  Since only about 4% of the samples have missing BMI values, I will remove these rows from the dataset.

In [38]:
# How much data is missing? Show the number of missing values in each column
# to judge whether the affected rows can simply be removed.
print(stroke_df.isna().sum())

# Proportion of rows with one or more missing values.
# Rows are counted (rather than individual missing cells) so the figure stays
# a true proportion even when a single row has gaps in several columns.
print('\n')
rows_with_missing = stroke_df.isna().any(axis=1).sum()
print("Proportion of missing data: ", rows_with_missing / stroke_df.shape[0])

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64


Proportion of missing data:  0.03933463796477495


In [39]:
# Remove all rows where BMI is missing.
# The drop is restricted to the 'bmi' column so that this step does exactly
# what it says; a bare dropna() would also discard rows with a gap in any
# other column.
bmi_present_df = stroke_df.dropna(subset=['bmi'])

# Remove the 'Other' gender sample, as stated in the Gender notes: a single
# sample is not enough to train on, so only the 'Male' and 'Female' rows are
# kept. Boolean indexing returns a new frame; stroke_df itself is unchanged.
stroked_df_clean = bmi_present_df[bmi_present_df['gender'] != 'Other']

# Save cleaned up dataset to new csv file (index=False leaves the row index
# out of the file).
save_path = 'stroke_data_cleaned.csv'
stroked_df_clean.to_csv(save_path, index=False)