Take the SVC which was the best in our last grid search and compare it to ensemble methods

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.pipeline import Pipeline as imb_pipeline
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTENC, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier

data = pd.read_csv("healthcare-dataset-stroke-data.csv")

balanced_data = data.dropna()
balanced_data = balanced_data.drop(columns= "id")
balanced_data = balanced_data[balanced_data.age > 20]
balanced_data = balanced_data.drop(balanced_data.index[balanced_data["gender"] == "Other"])
balanced_data = balanced_data.drop(balanced_data.index[balanced_data["work_type"] == "Never_worked"])

features = ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'avg_glucose_level', "bmi", "smoking_status"]
cat_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status', 'hypertension', 'heart_disease']
num_columns = ['age', 'avg_glucose_level', 'bmi']

bal_data_train, bal_data_test = train_test_split(balanced_data, train_size = 0.8, random_state=1, stratify= balanced_data["stroke"])
bal_data_train, bal_data_val = train_test_split(bal_data_train, train_size = 0.8, random_state=1)

X_train = bal_data_train.drop('stroke', axis = 1)
y_train = bal_data_train['stroke']

X_val = bal_data_val.drop('stroke', axis = 1)
y_val = bal_data_val['stroke']

X_test = bal_data_test.drop('stroke', axis = 1)
y_test = bal_data_test['stroke']

In [2]:
preprocessing = ColumnTransformer([
    ('one-hot-encoder', OneHotEncoder(sparse_output=False), cat_columns),
    ('scaler', StandardScaler(), num_columns)
])

In [3]:
gs_pipeline = imb_pipeline(steps= [
        ("smotenc", SMOTENC(cat_columns, random_state = 1)),
        ('preprocessing', preprocessing),
        ('classifier', DummyClassifier(random_state = 1, strategy="stratified")) #Baseline comparator
       ])


param_grid = [
                {   #SVC is best one out of previous grid search
                    "smotenc__sampling_strategy": [0.7],
                    "classifier": [SVC(random_state=1, probability= True)],
                    'classifier__C': [1], 
                    'classifier__kernel': ["rbf"],
                    'classifier__gamma': [0.001]
                },

                # {
                #     "smotenc__sampling_strategy": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
                #     "classifier": [RandomForestClassifier(random_state=1)],
                #     "classifier__bootstrap": [True, False], 
                #     'classifier__max_depth': [80, 90, 100, 110],
                #     'classifier__max_features': [2, 3],
                #     'classifier__min_samples_leaf': [3, 4, 5],
                #     'classifier__min_samples_split': [8, 10, 12],
                #     'classifier__n_estimators': [100, 200, 300, 1000]
                # }

                {   
                    "smotenc__sampling_strategy": [0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
                    "classifier": [RandomForestClassifier(random_state=1)],
                    'classifier__max_depth':[3,5,10,None],
                    'classifier__n_estimators':[10,100,200],
                    'classifier__max_features':[1,3,5,7],
                    'classifier__min_samples_leaf':[1,2,3],
                    'classifier__min_samples_split':[2,3]
                }
]

gs = GridSearchCV(
    gs_pipeline,
    param_grid,
    cv = 3,
    scoring= "f1",
    n_jobs= -1
)

gs.fit(X_train, y_train)

In [4]:
print("Best params:", gs.best_params_)
print("Score:", gs.best_score_)

Best params: {'classifier': SVC(probability=True, random_state=1), 'classifier__C': 1, 'classifier__gamma': 0.001, 'classifier__kernel': 'rbf', 'smotenc__sampling_strategy': 0.7}
Score: 0.2314838144129674


In [5]:
import joblib

best2 = gs.best_estimator_
joblib.dump(best2, "best_estimator_2.pkl")

['best_estimator_2.pkl']

In [6]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

print("Training set")
print("F1", f1_score(y_train, best2.predict(X_train)))
print("Recall",recall_score(y_train, best2.predict(X_train)))
print("Precision",precision_score(y_train, best2.predict(X_train)))
print(confusion_matrix(y_train, best2.predict(X_train)))

print()
print("Val set")
print("F1", f1_score(y_val, best2.predict(X_val)))
print("Recall",recall_score(y_val, best2.predict(X_val)))
print("Precision",precision_score(y_val, best2.predict(X_val)))
print(confusion_matrix(y_val, best2.predict(X_val)))

print()
print("Test set")
print("F1", f1_score(y_test, best2.predict(X_test)))
print("Recall", recall_score(y_test, best2.predict(X_test)))
print("Precision",precision_score(y_test, best2.predict(X_test)))
print(confusion_matrix(y_test, best2.predict(X_test)))

Training set
F1 0.22429906542056074
Recall 0.65625
Precision 0.13526570048309178
[[1835  537]
 [  44   84]]

Val set
F1 0.2736842105263158
Recall 0.6842105263157895
Precision 0.17105263157894737
[[461 126]
 [ 12  26]]

Test set
F1 0.23529411764705888
Recall 0.6190476190476191
Precision 0.1452513966480447
[[587 153]
 [ 16  26]]
