In [32]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler,OrdinalEncoder
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import matplotlib.pyplot as plt
from sklearn import tree

In [33]:
stroke_df = pd.read_csv("healthcare-dataset-stroke-data.csv")
stroke_df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [34]:
stroke_copy = stroke_df.copy()

In [35]:
stroke_copy.dropna()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5104,14180,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [36]:
X = stroke_copy.drop(columns = 'stroke')
y = stroke_copy['stroke']

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [37]:
X_train.isna().sum()/len(stroke_copy)

id                   0.000000
gender               0.000000
age                  0.000000
hypertension         0.000000
heart_disease        0.000000
ever_married         0.000000
work_type            0.000000
Residence_type       0.000000
avg_glucose_level    0.000000
bmi                  0.029354
smoking_status       0.000000
dtype: float64

In [38]:
# check out all of the unique values in work type, residence type, and smoking status columns
stroke_copy['work_type'].value_counts()

work_type
Private          2925
Self-employed     819
children          687
Govt_job          657
Never_worked       22
Name: count, dtype: int64

In [39]:
# check out all of the unique values in work type, residence type, and smoking status columns
stroke_copy['smoking_status'].value_counts()

smoking_status
never smoked       1892
Unknown            1544
formerly smoked     885
smokes              789
Name: count, dtype: int64

In [40]:
# Get dummies on work_type
stroke_copy_dummies = pd.get_dummies(stroke_copy, columns=['work_type', 'smoking_status'], dtype=int)
stroke_copy_dummies

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046,Male,67.0,0,1,Yes,Urban,228.69,36.6,1,0,0,1,0,0,0,1,0,0
1,51676,Female,61.0,0,0,Yes,Rural,202.21,,1,0,0,0,1,0,0,0,1,0
2,31112,Male,80.0,0,1,Yes,Rural,105.92,32.5,1,0,0,1,0,0,0,0,1,0
3,60182,Female,49.0,0,0,Yes,Urban,171.23,34.4,1,0,0,1,0,0,0,0,0,1
4,1665,Female,79.0,1,0,Yes,Rural,174.12,24.0,1,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Urban,83.75,,0,0,0,1,0,0,0,0,1,0
5106,44873,Female,81.0,0,0,Yes,Urban,125.20,40.0,0,0,0,0,1,0,0,0,1,0
5107,19723,Female,35.0,0,0,Yes,Rural,82.99,30.6,0,0,0,0,1,0,0,0,1,0
5108,37544,Male,51.0,0,0,Yes,Rural,166.29,25.6,0,0,0,1,0,0,0,1,0,0


In [41]:
# check out all of the unique values in work type, residence type, and smoking status columns
stroke_copy['Residence_type'].value_counts()

Residence_type
Urban    2596
Rural    2514
Name: count, dtype: int64

In [44]:
# check out all of the unique values in work type, residence type, and smoking status columns
stroke_copy['gender'].value_counts()

gender
Female    2994
Male      2115
Other        1
Name: count, dtype: int64

In [45]:
# Use OrdinalEncoder on the gender and Residence_type columns
encoder = OrdinalEncoder(categories=[['Male', 'Female', 'Other'], ['Urban', 'Rural']])
stroke_copy_dummies[['gender','Residence_type']] = encoder.fit_transform(stroke_copy_dummies[['gender','Residence_type']])
stroke_copy_dummies

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046,0.0,67.0,0,1,Yes,0.0,228.69,36.6,1,0,0,1,0,0,0,1,0,0
1,51676,1.0,61.0,0,0,Yes,1.0,202.21,,1,0,0,0,1,0,0,0,1,0
2,31112,0.0,80.0,0,1,Yes,1.0,105.92,32.5,1,0,0,1,0,0,0,0,1,0
3,60182,1.0,49.0,0,0,Yes,0.0,171.23,34.4,1,0,0,1,0,0,0,0,0,1
4,1665,1.0,79.0,1,0,Yes,1.0,174.12,24.0,1,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,1.0,80.0,1,0,Yes,0.0,83.75,,0,0,0,1,0,0,0,0,1,0
5106,44873,1.0,81.0,0,0,Yes,0.0,125.20,40.0,0,0,0,0,1,0,0,0,1,0
5107,19723,1.0,35.0,0,0,Yes,1.0,82.99,30.6,0,0,0,0,1,0,0,0,1,0
5108,37544,0.0,51.0,0,0,Yes,1.0,166.29,25.6,0,0,0,1,0,0,0,1,0,0


In [None]:
df.drop(['A', 'C'], axis=1, inplace=True)