In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv


In [2]:
#import dataset
data = pd.read_csv("/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv")

In [3]:
data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
#declare X and y
X = data.drop(columns='stroke')
y = data['stroke']

In [5]:
data.duplicated().sum()

0

no duplicates, good

In [6]:
data.isnull().sum().sort_values(ascending=False)

bmi                  201
id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
smoking_status         0
stroke                 0
dtype: int64

In [7]:
data.isnull().sum().sort_values(ascending=False)/len(data)

bmi                  0.039335
id                   0.000000
gender               0.000000
age                  0.000000
hypertension         0.000000
heart_disease        0.000000
ever_married         0.000000
work_type            0.000000
Residence_type       0.000000
avg_glucose_level    0.000000
smoking_status       0.000000
stroke               0.000000
dtype: float64

NaNs are only 3% of data, this is fine. bmi being NaN means data was not retrieved for random patient listed.

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30)

In [9]:
data.rename(columns={"hypertension": "ATP2B1"})

Unnamed: 0,id,gender,age,ATP2B1,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [10]:
from sklearn.dummy import DummyRegressor #Regressor that makes predictions using simple rules.

baseline_model = DummyRegressor(strategy="mean") # Baseline
baseline_model.fit(X_train, y_train) # Calculate value for stratgy
baseline_model.score(X_train, y_train) # Score model based on consistently predicting the strategy

0.0

In [11]:
from sklearn.model_selection import cross_validate

In [12]:
cv_results = cross_validate(baseline_model, X, y, cv=5, 
                            scoring=['max_error',
                                     'r2'])

In [13]:
pd.DataFrame(cv_results)

Unnamed: 0,fit_time,score_time,test_max_error,test_r2
0,0.002644,0.001584,-1.0,-0.322122
1,0.001483,0.000934,-0.06091,0.0
2,0.00139,0.000836,-0.06091,0.0
3,0.001398,0.000864,-0.06091,0.0
4,0.001914,0.001158,-0.06091,0.0


In [14]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer

num_transformer = make_pipeline(SimpleImputer(), RobustScaler())
cat_transformer = OneHotEncoder()

prepoc = make_column_transformer(
    (num_transformer, ['age', 'bmi','avg_glucose_level']),
    (cat_transformer, ['gender', 'ever_married','work_type','Residence_type','smoking_status']),
    remainder='passthrough'
)

In [15]:
prepoc

In [16]:
from sklearn.linear_model import Ridge

pipeline = make_pipeline(prepoc, Ridge())
pipeline

In [17]:
# Train Pipeline
pipeline.fit(X_train,y_train)

# Make predictions
pipeline.predict(X_test.iloc[0:1])

# Score model
pipeline.score(X_train,y_train)

0.0852904789276605