### Without GridSearchCV

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
import warnings
warnings.filterwarnings("ignore")


# Load dataset
df = pd.read_csv("ready_for_ML.csv")
df.head()

# Split the data into training and testing sets
X = df.drop(columns="HadHeartAttack")
y = df["HadHeartAttack"]
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.3, random_state=42, stratify=y)

# Training our models on training dataset with default hyper-parameters:
estimators_nogs = [
    ('random_forest', RandomForestClassifier()),
    ('svm', SVC()),
    ('logistic_regression', LogisticRegression()),
    ('Decision_tree', DecisionTreeClassifier()),
    ('SGD', SGDClassifier())
]

param_grids_nogs = {
    'random_forest': {},
    'svm': {},
    'logistic_regression': {},
    'Decision_tree': {},
    'SGD': {}
}

best_models_nogs = {}

for name, estimator in estimators_nogs:
    grid_search = GridSearchCV(estimator, param_grids_nogs[name], scoring=['accuracy', 'precision', 'recall'], refit='precision', cv=3, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_models_nogs[name] = [grid_search.best_estimator_, grid_search.best_score_]

best_models_nogs


{'random_forest': [RandomForestClassifier(), 0.7677120847563778],
 'svm': [SVC(), 0.807542692935967],
 'logistic_regression': [LogisticRegression(), 0.7810540716209525],
 'Decision_tree': [DecisionTreeClassifier(), 0.5176839350841428],
 'SGD': [SGDClassifier(), 0.8100487075677753]}

### With GridSearchCV

In [2]:
# Constructing a grid space of hyper-parameters for our models:
estimators_gs = [
    ('random_forest', RandomForestClassifier()),
    ('svm', SVC()),
    ('logistic_regression', LogisticRegression()),
    ('Decision_tree', DecisionTreeClassifier()),
    ('SGD', SGDClassifier())
]

# Defining a parameter grid space for each estimator:
param_grids_gs = {
    'random_forest': {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20]
    },
    'svm': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf']
    },
    'logistic_regression': {
        'C': [0.1, 1, 10],
        'penalty': ['l1', 'l2']
    },
    'Decision_tree': {
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10]
    },
    'SGD': {
        'loss': ['hinge', 'log', 'perceptron'],
        'penalty': ['l1', 'l2', 'elasticnet']
    }
}

best_models_gs = {}

for name, estimator in estimators_gs:
    grid_search = GridSearchCV(estimator, param_grids_gs[name], scoring=['accuracy', 'precision', 'recall'], refit='precision', cv=3, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_models_gs[name] = [grid_search.best_estimator_, grid_search.best_score_]

best_models_gs

9 fits failed out of a total of 18.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
9 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\faran\anaconda3\envs\dev\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\faran\anaconda3\envs\dev\lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\faran\anaconda3\envs\dev\lib\site-packages\sklearn\linear_model\_logistic.py", line 1172, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\faran\anaconda3\envs\dev\lib\site-packages\sklearn\linear_model

{'random_forest': [RandomForestClassifier(max_depth=10, n_estimators=200),
  0.8003369594908306],
 'svm': [SVC(C=0.1, kernel='linear'), 0.8100487075677753],
 'logistic_regression': [LogisticRegression(C=1), 0.7810540716209525],
 'Decision_tree': [DecisionTreeClassifier(max_depth=10), 0.6889667737117656],
 'SGD': [SGDClassifier(penalty='l1'), 0.8100487075677753]}

### GridSearchCV on the models using only the selected columns

In [4]:
import warnings
warnings.filterwarnings("ignore")

In [5]:
# Columns to keep
columns_to_keep = ['HadAngina', 'BMI', 'WeightInKilograms', 'HeightInMeters', 'SleepHours', 'PhysicalHealthDays', 'MentalHealthDays']

# Filter the dataset to include only the selected columns plus the target column
filtered_df = df[columns_to_keep + ['HadHeartAttack']]

# Split the data into training and testing sets
X = filtered_df.drop(columns="HadHeartAttack")
y = filtered_df["HadHeartAttack"]
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.3, random_state=42, stratify=y)

# Constructing a grid space of hyper-parameters for our models
estimators_fs = [
    ('random_forest', RandomForestClassifier()),
    ('svm', SVC()),
    ('logistic_regression', LogisticRegression()),
    ('Decision_tree', DecisionTreeClassifier()),
    ('SGD', SGDClassifier())
]

# Defining a parameter grid space for each estimator
param_grids_fs = {
    'random_forest': {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20]
    },
    'svm': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf']
    },
    'logistic_regression': {
        'C': [0.1, 1, 10],
        'penalty': ['l1', 'l2']
    },
    'Decision_tree': {
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10]
    },
    'SGD': {
        'loss': ['hinge', 'log', 'perceptron'],
        'penalty': ['l1', 'l2', 'elasticnet']
    }
}

best_models_fs = {}

for name, estimator in estimators_fs:
    grid_search = GridSearchCV(estimator, param_grids_fs[name], scoring=['accuracy', 'precision', 'recall'], refit='precision', cv=3, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_models_fs[name] = [grid_search.best_estimator_, grid_search.best_score_]

best_models_fs


{'random_forest': [RandomForestClassifier(max_depth=10), 0.8087948439500164],
 'svm': [SVC(C=0.1, kernel='linear'), 0.8100487075677753],
 'logistic_regression': [LogisticRegression(C=0.1), 0.8100487075677753],
 'Decision_tree': [DecisionTreeClassifier(max_depth=10, min_samples_split=5),
  0.7705721239702964],
 'SGD': [SGDClassifier(penalty='l1'), 0.8100487075677753]}

In [16]:
trained_feature_names = {
    'nogs': X_train.columns.tolist(),
    'gs': X_train.columns.tolist(),
    'fs': X_train[columns_to_keep].columns.tolist()
}
trained_feature_names

{'nogs': ['HadAngina',
  'BMI',
  'WeightInKilograms',
  'HeightInMeters',
  'SleepHours',
  'PhysicalHealthDays',
  'MentalHealthDays'],
 'gs': ['HadAngina',
  'BMI',
  'WeightInKilograms',
  'HeightInMeters',
  'SleepHours',
  'PhysicalHealthDays',
  'MentalHealthDays'],
 'fs': ['HadAngina',
  'BMI',
  'WeightInKilograms',
  'HeightInMeters',
  'SleepHours',
  'PhysicalHealthDays',
  'MentalHealthDays']}

In [22]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Function to evaluate a single model
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    return {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1_score': f1_score(y_test, y_pred)
    }

# Function to evaluate all models in a dictionary
def evaluate_all_models(best_models, X_test, y_test):
    results = {}
    for name, model_info in best_models.items():
        model = model_info[0]  # The best estimator
        results[name] = evaluate_model(model, X_test, y_test)
    return pd.DataFrame(results).T


# Split the data into training and testing sets (same as the original fs models)
X_test_fs = filtered_df.drop(columns="HadHeartAttack")
y_test_fs = filtered_df["HadHeartAttack"]

# Evaluate the fs models
fs_results = evaluate_all_models(best_models_fs, X_test_fs, y_test_fs)

# Display the results
print("Results with Selected Features and Grid Search Hyperparameters:")
print(fs_results)


Results with Selected Features and Grid Search Hyperparameters:
                     accuracy  precision    recall  f1_score
random_forest        0.867810   0.809289  0.508963  0.624916
svm                  0.866401   0.805207  0.504578  0.620392
logistic_regression  0.866401   0.805207  0.504578  0.620392
Decision_tree        0.869386   0.808534  0.519280  0.632401
SGD                  0.866401   0.805207  0.504578  0.620392


In [26]:
import pandas as pd
from sklearn.metrics import confusion_matrix

# Columns to keep
columns_to_keep = ['HadAngina', 'BMI', 'WeightInKilograms', 'HeightInMeters', 'SleepHours', 'PhysicalHealthDays', 'MentalHealthDays']

# Filter the dataset to include only the selected columns plus the target column
filtered_df = df[columns_to_keep + ['HadHeartAttack']]

# Split the data into training and testing sets (same as the original fs models)
X_test_fs = filtered_df.drop(columns="HadHeartAttack")
y_test_fs = filtered_df["HadHeartAttack"]

# Print confusion matrices for each fs model
for name, model_info in best_models_fs.items():
    model = model_info[0]  # The best estimator
    predictions = model.predict(X_test_fs)
    cm = confusion_matrix(y_test_fs, predictions)
    print(f"Confusion Matrix for {name}:\n{cm}\n")


Confusion Matrix for random_forest:
[[54309  1860]
 [ 7615  7893]]

Confusion Matrix for svm:
[[54276  1893]
 [ 7683  7825]]

Confusion Matrix for logistic_regression:
[[54276  1893]
 [ 7683  7825]]

Confusion Matrix for Decision_tree:
[[54262  1907]
 [ 7455  8053]]

Confusion Matrix for SGD:
[[54276  1893]
 [ 7683  7825]]

