In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Load the healthcare dataset.
# The path is relative to the current working directory; pd.read_csv raises
# FileNotFoundError if the file is not there, so point DATA_PATH at the real
# location of the CSV before running.
DATA_PATH = 'healthcare_dataset.csv'
df = pd.read_csv(DATA_PATH)

# Split the data into train and test sets (80% / 20%, fixed seed).
# NOTE(review): assumes every feature column is numeric with no missing
# values, which the estimators and samplers below require - TODO confirm, or
# add encoding/imputation first.
# NOTE(review): the split is not stratified; if 'stroke' is heavily imbalanced
# consider passing stratify=y so both sets keep the same class ratio.
X = df.drop('stroke', axis=1)
y = df['stroke']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the classifiers and their parameter grids for GridSearchCV
rf = RandomForestClassifier(random_state=42)
rf_params = {'n_estimators': [100, 200, 300], 'max_depth': [None, 5, 10], 'min_samples_split': [2, 5, 10]}
dt = DecisionTreeClassifier(random_state=42)
dt_params = {'max_depth': [None, 5, 10, 15, 20], 'min_samples_split': [2, 5, 10]}
# The grid searches both 'l1' and 'l2' penalties. The default 'lbfgs' solver
# does not support 'l1', so every l1 candidate would fail to fit; 'liblinear'
# supports both penalties.
lr = LogisticRegression(solver='liblinear', random_state=42)
lr_params = {'C': [0.1, 1, 10], 'penalty': ['l1', 'l2']}
knn = KNeighborsClassifier()
knn_params = {'n_neighbors': [3, 5, 7, 9]}

# Define the data techniques
oversampler = SMOTE(random_state=42)
undersampler = RandomUnderSampler(random_state=42)

# Lookup tables driving the two nested loops below. A sampler of None means
# "train on the data as-is". Built once here instead of inside the loop headers.
samplers = {
    'Original Data': None,
    'SMOTE Data': oversampler,
    'Undersampled Data': undersampler,
}
classifiers = {
    'Random Forest': (rf, rf_params),
    'Decision Tree': (dt, dt_params),
    'Logistic Regression': (lr, lr_params),
    'KNN': (knn, knn_params),
}

# List to store the results (one dict per sampler/classifier combination)
results = []

# Loop through different data techniques
for name, sampler in samplers.items():
    # Only the training split is resampled; the test split is left untouched.
    # NOTE(review): resampling happens before GridSearchCV splits the folds, so
    # validation folds contain resampled data and 'Validation Accuracy' may be
    # optimistic - consider an imblearn Pipeline inside the grid search.
    if sampler is None:
        X_train_res, y_train_res = X_train, y_train
    else:
        X_train_res, y_train_res = sampler.fit_resample(X_train, y_train)

    # Loop through different classifiers.
    # items() yields (key, (estimator, grid)) pairs, so the value tuple must be
    # unpacked with nested parentheses.
    for clf_name, (clf, params) in classifiers.items():
        # Perform GridSearchCV with 5-fold cross validation
        grid_clf = GridSearchCV(clf, params, cv=5)
        grid_clf.fit(X_train_res, y_train_res)
        best_clf = grid_clf.best_estimator_

        # Make predictions on test data
        y_pred = best_clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        classification_rep = classification_report(y_test, y_pred)

        # Store the results
        results.append({
            'Data Technique': name,
            'Classifier': clf_name,
            'Best Params': grid_clf.best_params_,
            'Validation Accuracy': grid_clf.best_score_,
            'Test Accuracy': accuracy,
            'Classification Report': classification_rep,
        })

# Convert results to dataframe
results_df = pd.DataFrame(results)

# Print the results
print(results_df)


FileNotFoundError: [Errno 2] No such file or directory: 'healthcare_dataset.csv'