In [2]:
params = {
                "LogisticRegression": {
                    'penalty': ['l1', 'l2', 'elasticnet', None],  # Regularization type
                    'C': [0.01, 0.1, 1, 10],                    # Inverse of regularization strength
                    'solver': ['liblinear', 'saga'],             # Optimization algorithms
                    'max_iter': [500, 1000, 1500, 2000]                  # Maximum iterations
                },
                "DecisionTreeClassifier": {
                    'criterion': ['gini', 'entropy', 'log_loss'],  # Split quality criterion
                    'splitter': ['best', 'random'],               # Split strategy
                    'max_depth': [None, 10, 20, 30, 50],          # Maximum tree depth
                    'min_samples_split': [2, 5, 10],              # Minimum samples for a split
                    'min_samples_leaf': [1, 2, 4],                # Minimum samples per leaf
                    'max_features': [None, 'sqrt', 'log2'],       # Number of features to consider
                },
                "RandomForestClassifier": {
                    'n_estimators': [100, 200, 500, 1000],         # Number of trees
                    'max_depth': [None, 10, 20, 30, 50],           # Maximum tree depth
                    'min_samples_split': [2, 5, 10],               # Minimum samples for a split
                    'min_samples_leaf': [1, 2, 4],                 # Minimum samples per leaf
                    'max_features': ['sqrt', 'log2', None],        # Features per split
                    'bootstrap': [True, False],                    # Bootstrapping strategy
                },
                "GradientBoostingClassifier": {
                    'loss': ['log_loss', 'deviance', 'exponential'],  # Loss function
                    'learning_rate': [0.01, 0.05, 0.1, 0.2],          # Step size
                    'n_estimators': [100, 200, 500],                  # Boosting stages
                    'max_depth': [3, 5, 7],                           # Maximum depth of individual estimators
                    'min_samples_split': [2, 5, 10],                  # Minimum samples for a split
                    'max_features': ['auto', 'sqrt', 'log2'],         # Features for splitting
                    'subsample': [0.8, 0.9, 1.0],                     # Sampling fraction
                },
                "KNeighborsClassifier": {
                    'n_neighbors': [3, 5, 7, 9, 11],                 # Number of neighbors
                    'weights': ['uniform', 'distance'],              # Prediction weighting
                    'metric': ['euclidean', 'manhattan', 'minkowski'], # Distance metric
                },
                "SVC": {
                    'C': [0.1, 1, 10, 100],                          # Regularization parameter
                    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # Kernel type
                    'gamma': ['scale', 'auto'],                      # Kernel coefficient
                },
                "XGBClassifier": {
                    'learning_rate': [0.01, 0.1, 0.2],               # Step size
                    'n_estimators': [100, 200, 500],                 # Boosting rounds
                    'max_depth': [3, 6, 10],                         # Tree depth
                    'subsample': [0.7, 0.8, 0.9, 1.0],               # Sampling fraction
                    'colsample_bytree': [0.7, 0.8, 1.0],             # Feature fraction
                    'min_child_weight': [1, 3, 5],                   # Minimum child weight
                },
                "CatBoostClassifier": {
                    'depth': [6, 8, 10],                             # Tree depth
                    'learning_rate': [0.01, 0.1, 0.2],               # Step size
                    'iterations': [100, 200, 500],                   # Boosting rounds
                    'l2_leaf_reg': [1, 3, 5],                        # Regularization term
                    'bagging_temperature': [0.0, 0.5, 1.0],         # Bagging randomness
                },
                "AdaBoostClassifier": {
                    'n_estimators': [50, 100, 200, 500],             # Boosting stages
                    'learning_rate': [0.01, 0.1, 0.5, 1.0],          # Step size
                    'algorithm': ['SAMME', 'SAMME.R'],               # Boosting algorithm
                }
            }

from catboost import CatBoostClassifier
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier


models = {
                'LogisticRegression' : LogisticRegression(),
                'DecisionTreeClassifier' : DecisionTreeClassifier(),
                'RandomForestClassifier': RandomForestClassifier(),
                'AdaBoostClassifier': AdaBoostClassifier(),
                'GradientBoostingClassifier' : GradientBoostingClassifier(),
                'KNeighborsClassifier' : KNeighborsClassifier(),
                'Support Vector Classifier' : SVC(),
                'CatBoostClassifier' : CatBoostClassifier(task_type='GPU', devices='0'),
                'XGBClassifier' : XGBClassifier(tree_method='gpu_hist',gpu_id= 0,  max_depth=6, max_bin=256)
            }

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder


pipeline = Pipeline(
                steps=[
                    ('Imputer', SimpleImputer(strategy='most_frequent')),
                    ('OneHotEncoder', OneHotEncoder(sparse_output=True)),
                    # ('Standard Scaler', StandardScaler(with_mean=False))
                ]
            )

            # Define the list of features to transform
features = [
                'HighBP', 'HighChol', 'BMI', 'Smoker',
                'Stroke', 'Diabetes', 'PhysActivity', 'Fruits', 'Veggies',
                'HvyAlcoholConsump', 'AnyHealthcare', 'GenHlth', 'DiffWalk', 'Sex',
                'Mental_Health_Category', 'Physical_Health_Category'
            ]

            # Create a ColumnTransformer with the defined pipeline
preprocessor = ColumnTransformer([('pipeline', pipeline, features)])

train_df = pd.read_csv('artifacts/train.csv')
test_df = pd.read_csv('artifacts/test.csv')

target_column = 'HeartDiseaseorAttack'
input_feature_train_df = train_df.drop(columns=[target_column], axis=1)
target_feature_train_df = train_df[target_column]

input_feature_test_df = test_df.drop(columns=[target_column], axis=1)
target_feature_test_df = test_df[target_column]     

input_feature_train_arr = preprocessor.fit_transform(input_feature_train_df)
input_feature_test_arr = preprocessor.transform(input_feature_test_df)

train_arr = np.c_[input_feature_train_arr, np.array(target_feature_train_df)]
test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]

X_train, y_train = train_arr[:, :-1], train_arr[:, -1]


from sklearn.model_selection import GridSearchCV, cross_val_score

print('starting model training')
def evaluate_model(X_train, y_train, models, params):
    for model_name, model_instance in models.items():
      print(model_name)
      model_params = params.get(model_name, {})
              
      gs = GridSearchCV(model_instance, model_params, cv=3, scoring='accuracy', n_jobs= -1)
      gs.fit(X_train, y_train)

      print(gs.best_params_)


starting model training


In [3]:
evaluate_model(X_train,y_train,models,params)

LogisticRegression


144 fits failed out of a total of 384.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
48 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\kesha\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\kesha\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\kesha\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1169, in fit
    solver = _check_solver(s

{'C': 0.01, 'max_iter': 500, 'penalty': 'l2', 'solver': 'saga'}
DecisionTreeClassifier


KeyboardInterrupt: 