In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.pipeline import Pipeline as imb_pipeline
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTENC, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

data = pd.read_csv("/workspaces/4900/healthcare-dataset-stroke-data.csv")

balanced_data = data.dropna()
balanced_data = balanced_data.drop(columns= "id")
balanced_data = balanced_data[balanced_data.age > 20]
balanced_data = balanced_data.drop(balanced_data.index[balanced_data["gender"] == "Other"])
balanced_data = balanced_data.drop(balanced_data.index[balanced_data["work_type"] == "Never_worked"])
features = ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'avg_glucose_level', "bmi", "smoking_status"]
cat_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status', 'hypertension', 'heart_disease']
num_columns = ['age', 'avg_glucose_level', 'bmi']

bal_data_train, bal_data_test = train_test_split(balanced_data, train_size = 0.8, random_state=1)
bal_data_train, bal_data_val = train_test_split(bal_data_train, train_size = 0.8, random_state=1)

X_train = bal_data_train.drop('stroke', axis = 1)
y_train = bal_data_train['stroke']

X_val = bal_data_val.drop('stroke', axis = 1)
y_val = bal_data_val['stroke']

X_test = bal_data_test.drop('stroke', axis = 1)
y_test = bal_data_test['stroke']

In [6]:
preprocessing = ColumnTransformer([
    ('one-hot-encoder', OneHotEncoder(sparse_output=False), cat_columns),
    ('scaler', StandardScaler(), num_columns)
])

In [29]:
balanced_data["work_type"].value_counts()

Private          2532
Self-employed     758
Govt_job          617
Name: work_type, dtype: int64

In [30]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression

gs_pipeline = imb_pipeline(steps= [
        ("smotenc", SMOTENC(cat_columns, random_state = 1)),
        ('preprocessing', preprocessing),
        ('classifier', DecisionTreeClassifier(random_state = 1))
       ])

param_grid = {
                "smotenc__sampling_strategy": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
                "classifier":[SVC(random_state= 1)]
}

# param_grid = [
#                 {
#                 "smotenc__sampling_strategy": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
#                 "classifier": [DecisionTreeClassifier()],
#                 "classifier__max_depth": [1,2,3,4,5,6,7,8,9,10],
#                 "classifier__max_leaf_nodes":  [1,2,3,4,5,6,7,8,9,10]
#                 }
# ]

gs = GridSearchCV(
    gs_pipeline,
    param_grid,
    cv = 3,
    scoring= "recall"
)

gs.fit(X_train, y_train)

In [31]:
print(gs.best_params_)
print(gs.best_score_)
print(gs.best_estimator_)

{'classifier': SVC(random_state=1), 'smotenc__sampling_strategy': 0.9}
0.30693983019564414
Pipeline(steps=[('smotenc',
                 SMOTENC(categorical_features=['gender', 'ever_married',
                                               'work_type', 'Residence_type',
                                               'smoking_status', 'hypertension',
                                               'heart_disease'],
                         random_state=1, sampling_strategy=0.9)),
                ('preprocessing',
                 ColumnTransformer(transformers=[('one-hot-encoder',
                                                  OneHotEncoder(sparse_output=False),
                                                  ['gender', 'ever_married',
                                                   'work_type',
                                                   'Residence_type',
                                                   'smoking_status',
                                                  

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html 

Look at stratify to set data balance before smotenc

Look into different hyper parameters of specific models, start with examples from class