In [1]:
import pandas as pd
import numpy as np

In [2]:
stroke = pd.read_csv('stroke.csv')
stroke.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [3]:
stroke = stroke.drop('id',axis=1)

In [4]:
X = stroke.iloc[:,:-1]
Y = stroke.iloc[:,-1]

from sklearn.model_selection import train_test_split

trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.2)

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer

num_cols = trainX.columns[(trainX.dtypes == np.int64) | (trainX.dtypes == np.float64)]

num_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('standardize', StandardScaler())
])

from sklearn.preprocessing import OneHotEncoder

#get a list of class columns
cat_cols = trainX.columns[trainX.dtypes==object]

cat_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='constant',fill_value='missing')),
    ('encode', OneHotEncoder())
])

from sklearn.compose import ColumnTransformer

full_pipeline = ColumnTransformer([
    ('numeric', num_pipeline, num_cols),
    ('class', cat_pipeline, cat_cols)
])

trainX_prc = full_pipeline.fit_transform(trainX)
testX_prc = full_pipeline.transform(testX)  

trainX_prc.shape, testX_prc.shape

((4088, 21), (1022, 21))

In [5]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(class_weight='balanced')

lr.fit(trainX_prc, trainY)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [6]:
lr.score(trainX_prc, trainY)

0.7419275929549902

In [7]:
from sklearn.metrics import f1_score

f1_score(trainY, lr.predict(trainX_prc))

0.23936553713049744

In [8]:
lr.score(testX_prc, testY)

0.7299412915851272

In [9]:
f1_score(testY, lr.predict(testX_prc))

0.21142857142857144

In [11]:
from sklearn.svm import SVC

svc = SVC(class_weight='balanced')
svc.fit(trainX_prc, trainY)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [12]:
svc.score(trainX_prc, trainY)

0.7871819960861057

In [15]:
f1_score(trainY, svc.predict(trainX_prc))

0.2926829268292683

In [14]:
svc.score(testX_prc, testY)

0.7729941291585127

In [16]:
f1_score(testY, svc.predict(testX_prc))

0.21088435374149658