In [1]:
import pandas as pd
data = pd.read_csv("/workspaces/4900/healthcare-dataset-stroke-data.csv")

# Build the modelling frame in a single chain:
#   1. discard rows containing any missing value,
#   2. remove the `id` column,
#   3. keep only rows with age above 20,
#   4. exclude rows whose gender is recorded as "Other".
balanced_data = (
    data
    .dropna()
    .drop(columns="id")
    .loc[lambda frame: frame["age"] > 20]
    .loc[lambda frame: frame["gender"] != "Other"]
)

# Every predictor column.
features = [
    'gender',
    'age',
    'hypertension',
    'heart_disease',
    'ever_married',
    'work_type',
    'Residence_type',
    'avg_glucose_level',
    "bmi",
    "smoking_status",
]

# Columns treated as categorical (one-hot encoded downstream).
cat_columns = [
    'gender',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
    'hypertension',
    'heart_disease',
]

# Columns treated as numeric (scaled downstream).
num_columns = ['age', 'avg_glucose_level', 'bmi']

In [2]:
from sklearn.model_selection import train_test_split

# 80/20 split of the cleaned frame; the fixed seed keeps the partition reproducible.
bal_data_train, bal_data_test = train_test_split(
    balanced_data,
    train_size=0.8,
    random_state=1,
)

# Separate the predictors from the `stroke` target in each partition.
X_train, y_train = bal_data_train.drop(columns='stroke'), bal_data_train['stroke']
X_test, y_test = bal_data_test.drop(columns='stroke'), bal_data_test['stroke']


In [6]:
from imblearn.pipeline import Pipeline as imb_pipeline
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTENC, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import joblib

from sklearn.metrics import recall_score

# NOTE: joblib.load unpickles the file, so only load artefacts from a trusted source.
smotenc_pipeline = joblib.load("/workspaces/4900/data_cleaning_3/pipeline_for_app_3.pkl")

# Fit the loaded pipeline on the training split, then predict the held-out rows.
smotenc_pipeline.fit(X_train, y_train)
smotenc_pipeline_prediction = smotenc_pipeline.predict(X_test)

# Recall of the pipeline's predictions on the test split.
smotenc_test_recall = recall_score(y_test, smotenc_pipeline_prediction)
print("Recall score for smotenc test set:", smotenc_test_recall)

Recall score for smotenc test set: 0.2765957446808511


What if we use different resampling strategy percentages?

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Shared preprocessing: one-hot encode the categorical columns and
# standardise the numeric ones.
categorical_encoder = ('one-hot-encoder', OneHotEncoder(sparse_output=False), cat_columns)
numeric_scaler = ('scaler', StandardScaler(), num_columns)
preprocessing = ColumnTransformer([categorical_encoder, numeric_scaler])

# Candidate values passed to SMOTENC as `sampling_strategy`
# (per the imbalanced-learn docs, a float is the minority/majority ratio
# requested after resampling).
percentages = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]

# Fit one resample -> preprocess -> decision-tree pipeline per candidate and
# report its recall on the test split.
for x in percentages:
    resampler = SMOTENC(cat_columns, random_state=1, sampling_strategy=x)
    classifier = DecisionTreeClassifier(random_state=1)

    test_pipeline = imb_pipeline(steps=[
        ("smotenc", resampler),
        ('preprocessing', preprocessing),
        ('model', classifier),
    ])
    test_pipeline.fit(X_train, y_train)

    test_pipeline_prediction = test_pipeline.predict(X_test)
    test_recall = recall_score(y_test, test_pipeline_prediction)
    print(f"Recall score for {x*100}% (Test):", test_recall)

Recall score for 10.0% (Test): 0.2553191489361702
Recall score for 20.0% (Test): 0.23404255319148937
Recall score for 30.0% (Test): 0.1702127659574468
Recall score for 40.0% (Test): 0.23404255319148937
Recall score for 50.0% (Test): 0.2978723404255319
Recall score for 60.0% (Test): 0.19148936170212766
Recall score for 70.0% (Test): 0.2553191489361702
Recall score for 80.0% (Test): 0.2765957446808511
Recall score for 90.0% (Test): 0.23404255319148937
Recall score for 100% (Test): 0.2765957446808511


Grid search

In [None]:
from sklearn.model_selection import GridSearchCV

# GridSearchCV requires both an estimator and a parameter grid; calling it
# with no arguments raises TypeError. Search over the same SMOTENC sampling
# strategies explored above, but score them with cross-validation on the
# training split so the test set is not used for model selection.
grid_pipeline = imb_pipeline(steps=[
    ("smotenc", SMOTENC(cat_columns, random_state=1)),
    ('preprocessing', preprocessing),
    ('model', DecisionTreeClassifier(random_state=1)),
])

gs = GridSearchCV(
    estimator=grid_pipeline,
    # `<step name>__<parameter>` addresses a parameter of a pipeline step.
    param_grid={"smotenc__sampling_strategy": percentages},
    scoring="recall",
    cv=5,
)
gs.fit(X_train, y_train)

print("Best sampling strategy:", gs.best_params_)
print("Best cross-validated recall:", gs.best_score_)