1) Przyjrzyj się danym - zrób prosty EDA.
2) Postaw sobie cel - co właściwie chcemy zamodelować?
3) Wykonaj feature engineering - scaling, encoding, itp.
    a) Jakie operacje powinniśmy wykonać na zmiennych?
    b) Spróbuj wykorzystać ColumnTransformer i pipeliny z biblioteki scikit-learn.
4) Zaproponuj pierwszy prosty model (koncepcja 'minimum viable product').
5) Może powinniśmy wykorzystać regularyzację?
6) Spróbuj przetestować kilka modeli (np. logistic regression, decision tree). Wykorzystaj do tego crosswalidację.
7) Z przetestowanych modeli wybierz najlepszą klasę modeli.
8) Zoptymalizuj hiperparametry wykorzystując RandomSearch/GridSearch oraz crosswalidację.
9) Może warto dokonać jakiegoś ensemblingu?

In [2]:
import pandas as pd

df = pd.read_csv("healthcare-dataset-stroke-data.csv")

df.head(10)


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
6,53882,Male,74.0,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1
7,10434,Female,69.0,0,0,No,Private,Urban,94.39,22.8,never smoked,1
8,27419,Female,59.0,0,0,Yes,Private,Rural,76.15,,Unknown,1
9,60491,Female,78.0,0,0,Yes,Private,Urban,58.57,24.2,Unknown,1


In [14]:
df.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [28]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline 
import numpy as np 
from sklearn.compose import ColumnTransformer

numerical_features = ["age", "avg_glucose_level", "bmi"]
categorical_features = ["work_type", "smoking_status", "ever_married", "gender", "Residence_type"]
binary_features = ["hypertension", "heart_disease", "stroke",]


numerical_pipeline = Pipeline(steps= [('impute', SimpleImputer(strategy='mean')), ('scale', StandardScaler())])
categorical_pipeline = Pipeline(steps =[('impute', SimpleImputer(strategy='most_frequent')), ('encode', OneHotEncoder()) ])
binary_pipeline = Pipeline(steps =[('impute', SimpleImputer(strategy='most_frequent'))])


full_processor = ColumnTransformer(transformers= [
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
    ('bin', binary_pipeline, binary_features)
],
remainder='passthrough')

fully_processed = full_processor.fit_transform(df)


df_modified = pd.DataFrame(fully_processed, columns=full_processor.get_feature_names_out())




In [38]:
df_modified

Unnamed: 0,num__age,num__avg_glucose_level,num__bmi,cat__work_type_Govt_job,cat__work_type_Never_worked,cat__work_type_Private,cat__work_type_Self-employed,cat__work_type_children,cat__smoking_status_Unknown,cat__smoking_status_formerly smoked,...,cat__ever_married_Yes,cat__gender_Female,cat__gender_Male,cat__gender_Other,cat__Residence_type_Rural,cat__Residence_type_Urban,bin__hypertension,bin__heart_disease,bin__stroke,remainder__id
0,1.051434,2.706375,1.001234e+00,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,9046.0
1,0.786070,2.121559,4.615554e-16,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,51676.0
2,1.626390,-0.005028,4.685773e-01,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,31112.0
3,0.255342,1.437358,7.154182e-01,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,60182.0
4,1.582163,1.501184,-6.357112e-01,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1665.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,1.626390,-0.494658,4.615554e-16,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,18234.0
5106,1.670617,0.420775,1.442949e+00,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,44873.0
5107,-0.363842,-0.511443,2.217363e-01,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,19723.0
5108,0.343796,1.328257,-4.278451e-01,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,37544.0


In [46]:

# there are 4861 patients who have not suffered from a stroke
df[df_modified["bin__stroke"] == 0].count()

# there are 249 patients who have experienced a stroke
df[df_modified["bin__stroke"] == 1].count()

id                   249
gender               249
age                  249
hypertension         249
heart_disease        249
ever_married         249
work_type            249
Residence_type       249
avg_glucose_level    249
bmi                  209
smoking_status       249
stroke               249
dtype: int64

In [48]:
#preparing data for modelling

from sklearn.model_selection import train_test_split

X = df_modified.drop("bin__stroke", axis =1)
y = df_modified["bin__stroke"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3, random_state=42)

In [79]:
#Logistic Regression

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

log_reg_pred =log_reg.predict(X_test)

log_reg_accuracy = accuracy_score(y_test, log_reg_pred)

print(f"Accuracy for a Logistic Regression model amounts to: {log_reg_accuracy}.")



Accuracy for a Logistic Regression model amounts to: 0.9419439008480104.


In [77]:
# cross validation for Logistic Regression

from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, precision_score, f1_score, recall_score

cross_val_sc = cross_val_score(log_reg, X_train, y_train, cv=10, scoring=make_scorer(accuracy_score))

print(f"Cross-validation accuracy scores are: {cross_val_sc}")

precision_scorer = make_scorer(precision_score, zero_division = 'warn')
precision_val_sc = cross_val_score(log_reg, X_train, y_train, cv =10, scoring=make_scorer(precision_score))
print(f"Cross-validation precision scores are: {precision_val_sc}")

Cross-validation accuracy scores are: [0.95530726 0.95530726 0.95530726 0.95530726 0.95530726 0.95530726
 0.95530726 0.95518207 0.95518207 0.95518207]
Cross-validation precision scores are: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [82]:
# Leave One Out Cross Validation

from sklearn.model_selection import LeaveOneOut

loocv = LeaveOneOut()
loocv_accuracy_sc = cross_val_score(log_reg, X_train, y_train, cv=loocv, scoring=make_scorer(accuracy_score))



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [98]:
#As compiling the above code results in a warning "ConvergenceWarning: lbfgs failed to converge (status=1):
# STOP: TOTAL NO. of ITERATIONS REACHED LIMIT." I will change the maximum number of iterations for logistic regression.

log_reg_higher_limit = LogisticRegression(max_iter=1000)
log_reg_higher_limit.fit(X_train,y_train)
log_reg_hl_pred= log_reg_higher_limit.predict(X_test)

loocv = LeaveOneOut()
loocv_accuracy_sc_reg_higher = cross_val_score(log_reg_higher_limit, X_train, y_train, cv=loocv, scoring=make_scorer(accuracy_score))

print(f"Accuracy for a Logistic Regression (with increased max_iter to 1000) model (with Leave One Out cross validation) amounts to: {loocv_accuracy_sc_reg_higher}. Mean accuracy of the model is {loocv_accuracy_sc_reg_higher.mean()}.")

Accuracy for a Logistic Regression (with increased max_iter to 1000) model (with Leave One Out cross validation) amounts to: [1. 1. 1. ... 1. 1. 1.]. Mean accuracy of the model is 0.9552697791445345


In [92]:
loocv_accuracy_sc_reg_higher.mean()

0.9552697791445345

In [96]:
len(loocv_accuracy_sc_reg_higher)

3577

In [56]:
# Decision Tree

from sklearn.tree import DecisionTreeClassifier

tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)

tree_model_predict = tree_model.predict(X_test)

tree_model_accuracy = accuracy_score(y_test, tree_model_predict)

print(f"Accuracy for a Decision Tree model amounts to: {tree_model_accuracy}.")

Accuracy for a Decision Tree model amounts to: 0.903457273320287.


In [99]:
# Bagging model 

from sklearn.ensemble import BaggingClassifier

base_classifier = DecisionTreeClassifier(random_state=42)

bagging_model = BaggingClassifier(base_estimator=base_classifier, n_estimators=100,random_state=42)

bagging_model.fit(X_train, y_train)

bagging_model_pred = bagging_model.predict(X_test)

bagging_model_accuracy = accuracy_score(y_test, bagging_model_pred)

print(f"Accuracy for a Bagging Classifier (base estimator: Decision Tree) amounts to: {bagging_model_accuracy}.")



Accuracy for a Bagging Classifier (base estimator: Decision Tree) amounts to: 0.9399869536855838.


In [106]:
# Implementing Adaptive Boosting with Decision Tree as an estimator and learning rate equalling 1

from sklearn.ensemble import AdaBoostClassifier


ada_boost = AdaBoostClassifier(n_estimators=100, random_state=42)
ada_boost.fit(X_train, y_train)
ada_boost_predict = ada_boost.predict(X_test)

ada_boost_accuracy = accuracy_score(y_test,ada_boost_predict)

print(f"Accuracy for an Adaptive Boosting model (base estimator: Decision Tree) amounts to: {ada_boost_accuracy}.")

Accuracy for an Adaptive Boosting model (base estimator: Decision Tree) amounts to: 0.9393346379647749.


In [111]:
from sklearn.decomposition import PCA


num_com = 3

pca = PCA(n_components=num_com)
X_pca = pca.fit(X)

exp_var_ratio = pca.explained_variance_ratio_

exp_var_ratio

array([9.99999987e-01, 4.05923285e-09, 1.98824632e-09])