# Exploratory Data Analysis, Data Preprocessing and Feature Selection

In [49]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import RobustScaler



from sklearn.feature_selection import RFE

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LassoCV

from sklearn.metrics import f1_score, confusion_matrix, recall_score

In [50]:
# Load the training data from train.csv and index it by encounter_id
df = pd.read_csv('train.csv').set_index('encounter_id')

In [51]:
# Load the test data from test.csv and index it by encounter_id
X_test = pd.read_csv('test.csv').set_index('encounter_id')

In [52]:
def prepare_data(df):
    """Separate the feature columns from the two readmission targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'readmitted_binary' and 'readmitted_multiclass'.

    Returns
    -------
    tuple
        (features, target_multiclass, target_binary) where `features` is
        `df` without the two target columns. `df` itself is not modified.
    """
    target_columns = ['readmitted_binary', 'readmitted_multiclass']
    binary_target = df['readmitted_binary']
    multiclass_target = df['readmitted_multiclass']
    features = df.drop(columns=target_columns)
    return features, multiclass_target, binary_target

In [53]:
def process_data(train, X_test):
    """Add the engineered visit-count and ratio columns to both frames.

    Three columns are written into each frame in place:

    * 'hospital_visits' - number of rows sharing the row's patient_id
      (counted separately within each frame).
    * 'Emergency_visits/total_visits' - emergency visits divided by the sum
      of inpatient, outpatient and emergency visits in the previous year.
      A row with no visits at all yields 0/0, i.e. NaN.
    * 'n_medications/length_of_stay' - number_of_medications divided by
      length_of_stay_in_hospital.

    Returns the same two (mutated) frames as ``(train, X_test)``.
    """
    inpatient = 'inpatient_visits_in_previous_year'
    outpatient = 'outpatient_visits_in_previous_year'
    emergency = 'emergency_visits_in_previous_year'

    for frame in (train, X_test):
        # Encounters per patient within this frame
        frame['hospital_visits'] = frame.groupby('patient_id')['patient_id'].transform('count')

        # Share of last year's visits that were emergency visits
        total_visits = frame[inpatient] + frame[outpatient] + frame[emergency]
        frame['Emergency_visits/total_visits'] = frame[emergency] / total_visits

        # Medications per unit of length of stay
        frame['n_medications/length_of_stay'] = (
            frame['number_of_medications'] / frame['length_of_stay_in_hospital']
        )

    return train, X_test


In [54]:
def feature_engineering(train, X_test, target_binary_col='readmitted_binary', target_multiclass_col='readmitted_multiclass'):
    """Split off the targets and add the engineered features in one call.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame that still contains both target columns.
    X_test : pandas.DataFrame
        Test frame; the engineered columns are added to it in place.
    target_binary_col, target_multiclass_col : str
        Names of the target columns to pull out of `train`.

    Returns
    -------
    tuple
        (train_features, X_test, target_multiclass, target_binary).
        `train_features` is a new frame; the caller's `train` is untouched.

    Notes
    -----
    The targets must be read *before* the columns are dropped; the previous
    version dropped first and therefore always raised KeyError.
    The medication ratio column is named 'n_medications/length_of_stay',
    matching `process_data` (it used to be misspelled 'lenght').
    """
    # Step 1: capture the targets first, then remove them from the features
    target_multiclass = train[target_multiclass_col]
    target_binary = train[target_binary_col]
    train = train.drop(columns=[target_binary_col, target_multiclass_col])

    inpatient = 'inpatient_visits_in_previous_year'
    outpatient = 'outpatient_visits_in_previous_year'
    emergency = 'emergency_visits_in_previous_year'

    for frame in (train, X_test):
        # Step 2: number of encounters per patient within this frame
        frame['hospital_visits'] = frame.groupby('patient_id')['patient_id'].transform('count')

        # Step 3: emergency visits as a share of all visits (0/0 -> NaN)
        total_visits = frame[inpatient] + frame[outpatient] + frame[emergency]
        frame['Emergency_visits/total_visits'] = frame[emergency] / total_visits

        # Step 4: medications per unit of length of stay
        frame['n_medications/length_of_stay'] = (
            frame['number_of_medications'] / frame['length_of_stay_in_hospital']
        )

    return train, X_test, target_multiclass, target_binary

In [55]:
def clean_data(df):
    """Normalize placeholder values and a few known columns.

    Steps, in order:

    1. Strip leading/trailing whitespace from every string cell.
    2. Replace the placeholders '?', 'Unknown/Invalid', 'Not Mapped' and
       'Not Available' with NaN.
    3. Drop the 'country' column when present.
    4. Fill NaN in 'glucose_test_result' / 'a1c_test_result' with
       'Not_tested' when those columns are present.
    5. Recode 'payer_code' to 0 where it is missing and 1 otherwise.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame. The input frame is not modified.
    """
    def _strip_if_str(value):
        return value.strip() if isinstance(value, str) else value

    # Column-wise Series.map is the element-wise equivalent of
    # DataFrame.applymap, which is deprecated in recent pandas releases.
    df = df.apply(lambda column: column.map(_strip_if_str))

    placeholders = ['?', 'Unknown/Invalid', 'Not Mapped', 'Not Available']
    df = df.replace(placeholders, np.nan)

    if 'country' in df.columns:
        df = df.drop(columns='country')

    # Assign the filled column back instead of `df[col].fillna(inplace=True)`:
    # the chained in-place form operates on a temporary under copy-on-write
    # and would silently leave the frame unchanged.
    for column in ('glucose_test_result', 'a1c_test_result'):
        if column in df.columns:
            df[column] = df[column].fillna('Not_tested')

    # Presence flag: 0 when payer_code is missing, 1 otherwise
    if 'payer_code' in df.columns:
        df['payer_code'] = df['payer_code'].notna().astype(int)

    return df

def process_age_column(df):
    """Replace age brackets such as '[70-80)' with their numeric midpoint.

    The 'age' column is overwritten in place; missing values stay NaN.
    A frame without an 'age' column is returned unchanged.
    """
    def _bracket_midpoint(bracket):
        # Missing ages are passed through as NaN
        if pd.isna(bracket):
            return np.nan
        bounds = bracket.split('-')
        lower = int(bounds[0].strip('['))
        upper = int(bounds[1].strip(')'))
        return (lower + upper) / 2

    if 'age' not in df.columns:
        return df

    df['age'] = df['age'].apply(_bracket_midpoint)
    return df

def drop_newborn_outliers(df, target_df):
    """Remove rows admitted as 'Newborn' whose age is greater than 5.

    The matching index labels are dropped in place from both `df` and
    `target_df`, and the same two objects are returned.
    """
    is_newborn = df['admission_type'] == 'Newborn'
    older_than_five = df['age'] > 5
    outlier_labels = df.loc[is_newborn & older_than_five].index.to_list()

    # Same labels are removed from the features and from the target
    for frame in (df, target_df):
        frame.drop(outlier_labels, inplace=True)

    return df, target_df

def drop_newborn_outliers1(df):
    """Single-frame variant of `drop_newborn_outliers`.

    Drops, in place, every row whose admission_type is 'Newborn' and whose
    age is greater than 5, then returns the same frame.
    """
    outlier_mask = (df['admission_type'] == 'Newborn') & (df['age'] > 5)
    outlier_labels = df.loc[outlier_mask].index.to_list()
    df.drop(outlier_labels, inplace=True)
    return df

def fill_missing_age(train, test):
    """Fill missing ages from the same patient's other encounters.

    In `train`, a missing age becomes the mean age of that patient's rows.
    In `test`, a missing age is looked up from the per-patient mean computed
    on `train`; patients not present in `train` stay NaN. Both frames are
    modified in place and returned.
    """
    def _fill_with_group_mean(ages):
        return ages.fillna(ages.mean())

    train['age'] = train.groupby('patient_id')['age'].transform(_fill_with_group_mean)

    # Per-patient mean age, learned from the training frame only
    mean_age_by_patient = train.groupby('patient_id')['age'].mean()
    looked_up_age = test['patient_id'].map(mean_age_by_patient)

    # Keep known test ages; use the looked-up value only where age is missing
    test['age'] = test['age'].where(test['age'].notna(), looked_up_age)
    return train, test

def fill_missing_race(train, test):
    """Fill missing race values from the same patient's other encounters.

    In `train`, a missing race becomes the first mode of that patient's
    rows. In `test`, a missing race is looked up from the per-patient mode
    computed on `train`; patients without a known race stay NaN. Both frames
    are modified in place and returned.
    """
    def _first_mode(values):
        # First entry of the mode, or NaN when the group has no values
        modes = values.mode()
        return np.nan if modes.empty else modes.iloc[0]

    train['race'] = train.groupby('patient_id')['race'].transform(
        lambda races: races.fillna(_first_mode(races))
    )

    # Per-patient race, learned from the training frame only
    race_by_patient = train.groupby('patient_id')['race'].apply(_first_mode)
    looked_up_race = test['patient_id'].map(race_by_patient)

    # Keep known test values; use the looked-up value only where race is missing
    test['race'] = test['race'].where(test['race'].notna(), looked_up_race)
    return train, test

def impute_missing_values(train, test):
    """Impute remaining missing values, fitting the imputers on `train` only.

    Numeric columns are filled with the training mean, then object columns
    with the most frequent training value. The fitted imputers are applied
    to both frames, which are modified in place and returned.
    """
    # (column selector kwargs, SimpleImputer strategy), applied in this order
    imputation_plan = (
        ({'include': np.number}, 'mean'),
        ({'include': ['object']}, 'most_frequent'),
    )

    for selector, strategy in imputation_plan:
        columns = train.select_dtypes(**selector).columns
        imputer = SimpleImputer(strategy=strategy)
        imputer.fit(train[columns])
        train[columns] = imputer.transform(train[columns])
        test[columns] = imputer.transform(test[columns])

    return train, test

def map_icd9_to_category(df):
    """Collapse ICD-9 codes in the three diagnosis columns into categories.

    'primary_diagnosis', 'secondary_diagnosis' and 'additional_diagnosis'
    are overwritten in place with a category label and `df` is returned.

    Mapping rules for a single code:

    * missing -> NaN
    * starts with 'E' or 'V' -> 'other health factors and external causes'
    * numeric 0 -> the string 'None'
    * numeric in [1, 1000) -> the label of the range containing it
    * any other numeric value -> None (no category)

    Ranges are half-open (``lower <= code < next_lower``) so fractional
    codes such as 279.5 land in the range of their integer part instead of
    falling through the gap between 279 and 280.
    """
    # (exclusive upper bound, label), ascending; each range starts where the
    # previous one ends, the first one starts at 1.
    range_labels = (
        (140, 'infectious and parasitic diseases'),
        (240, 'neoplasms'),
        (280, 'diabetes, endocrine and metabolic disorders'),
        (290, 'other diseases/conditions'),
        (320, 'mental disorders'),
        (390, 'other diseases/conditions'),
        (460, 'circulatory system diseases'),
        (520, 'respiratory system diseases'),
        (580, 'digestive system diseases'),
        (630, 'genitourinary system diseases'),
        (680, 'other diseases/conditions'),
        (710, 'skin diseases'),
        (740, 'musculoskeletal disorders'),
        (760, 'other diseases/conditions'),
        (780, 'perinatal conditions'),
        (800, 'uncertain conditions'),
        (1000, 'injury and poisoning'),
    )

    def icd9_to_category(icd9):
        if pd.isna(icd9):
            return np.nan

        # Accept codes that were parsed as numbers as well as strings
        code_text = str(icd9).strip()
        if code_text[:1] in ('E', 'V'):
            return 'other health factors and external causes'

        code = float(code_text)
        if code == 0:
            return 'None'
        if code < 1:
            return None
        for upper_bound, label in range_labels:
            if code < upper_bound:
                return label
        return None

    columns_to_map = ['primary_diagnosis', 'secondary_diagnosis', 'additional_diagnosis']
    for col in columns_to_map:
        df[col] = df[col].apply(icd9_to_category)

    return df

def create_diagnosis_columns(train, X_test):
    """Turn the three diagnosis columns into one 0/1 flag per category.

    For every distinct non-missing value found in `train`'s
    'primary_diagnosis', 'secondary_diagnosis' and 'additional_diagnosis'
    columns, a column named 'diagnosis_<value>' is added to both frames.
    The flag is 1 when any of the three diagnosis columns equals that value.
    The three original columns are then dropped. Both frames are modified in
    place and returned.

    Notes
    -----
    * Categories are taken from `train` only, so both frames end up with
      the same flag columns.
    * Categories are sorted so that the column order is identical on every
      run (iterating a set of strings is not stable across interpreter
      sessions).
    * Exact equality is used rather than ``str.contains``: the latter does
      regex/substring matching and would also flag categories whose name
      merely contains another category's name.
    * Missing diagnoses are ignored instead of breaking the column-name
      concatenation.
    """
    diagnosis_cols = ['primary_diagnosis', 'secondary_diagnosis', 'additional_diagnosis']

    # Step 1: distinct, non-missing categories seen in train, in a stable order
    observed = set()
    for col in diagnosis_cols:
        observed.update(train[col].dropna().unique())
    diagnosis_values = sorted(observed)

    for frame in (train, X_test):
        # Step 2: one flag per category
        for value in diagnosis_values:
            frame['diagnosis_' + value] = frame[diagnosis_cols].eq(value).any(axis=1).astype(int)

        # Step 3: the raw diagnosis columns are no longer needed
        frame.drop(columns=diagnosis_cols, inplace=True)

    return train, X_test


In [56]:
def map_categorical_to_numeric(train, X_test, target_binary=None):
    """Encode the two-valued categorical columns as 0/1.

    'gender', 'change_in_meds_during_hospitalization' and
    'prescribed_diabetes_meds' are recoded in place in both frames; values
    not listed in a column's mapping become NaN. When `target_binary` is
    given, a recoded copy ('No' -> 0, 'Yes' -> 1) is returned for it,
    otherwise None is returned in its place.

    Returns ``(train, X_test, target_binary)``.
    """
    column_codes = {
        'gender': {'Male': 0, 'Female': 1},
        'change_in_meds_during_hospitalization': {'No': 0, 'Ch': 1},
        'prescribed_diabetes_meds': {'No': 0, 'Yes': 1},
    }

    for column, codes in column_codes.items():
        for frame in (train, X_test):
            frame[column] = frame[column].map(codes)

    if target_binary is None:
        return train, X_test, target_binary

    return train, X_test, target_binary.map({'No': 0, 'Yes': 1})

In [57]:
def label_encode_admission_type(train, X_test):
    """Replace 'admission_type' labels with integer codes 1-5.

    Codes follow the listed order: Emergency=1, Urgent=2, Elective=3,
    Newborn=4, Trauma Center=5. Any other value becomes NaN. Both frames
    are modified in place and returned.
    """
    ordered_types = ['Emergency', 'Urgent', 'Elective', 'Newborn', 'Trauma Center']
    codes = {label: rank for rank, label in enumerate(ordered_types, start=1)}

    for frame in (train, X_test):
        frame['admission_type'] = frame['admission_type'].map(codes)

    return train, X_test

In [58]:
def map_admission_source(train, X_test):
    """Merge every 'Transfer ...' admission source into a single label.

    Any string value of 'admission_source' containing 'Transfer' is replaced
    by 'Transfer from Another Health Facility'; all other values, including
    missing ones, are left as they are. Both frames are modified in place
    and returned.
    """
    def _collapse_transfers(source):
        # Non-string cells (e.g. NaN) are passed through untouched; the
        # substring test would raise TypeError on them.
        if isinstance(source, str) and 'Transfer' in source:
            return 'Transfer from Another Health Facility'
        return source

    for frame in (train, X_test):
        frame['admission_source'] = frame['admission_source'].map(_collapse_transfers)

    return train, X_test

In [59]:
def one_hot_encode_and_combine(train, X_test):
    """One-hot encode every object column and rejoin with the other columns.

    The encoder is fitted on `train` only and applied to both frames;
    categories that appear only in `X_test` are encoded as all zeros
    (``handle_unknown='ignore'``).

    Returns
    -------
    tuple
        (train, X_test) as new frames: the non-object columns followed by
        the encoded columns, with the original indexes preserved.
    """
    # Step 1: columns to encode are decided by the training frame
    obj_cols = train.select_dtypes(include=['object']).columns.tolist()

    # Step 2: dense output. The keyword was renamed from `sparse` to
    # `sparse_output` and the old name was later removed, so try the new
    # spelling first and fall back for older scikit-learn releases.
    try:
        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

    # Step 3: fit on train, transform both
    train_encoded = encoder.fit_transform(train[obj_cols])
    test_encoded = encoder.transform(X_test[obj_cols])

    # Step 4: wrap the arrays so they line up with the original rows
    encoded_columns = encoder.get_feature_names_out(obj_cols)
    train_encoded_df = pd.DataFrame(train_encoded, columns=encoded_columns, index=train.index)
    test_encoded_df = pd.DataFrame(test_encoded, columns=encoded_columns, index=X_test.index)

    # Step 5: replace the object columns with their encoded counterparts
    train = pd.concat([train.drop(obj_cols, axis=1), train_encoded_df], axis=1)
    X_test = pd.concat([X_test.drop(obj_cols, axis=1), test_encoded_df], axis=1)

    return train, X_test

In [60]:
def drop_patient_id_column(train, X_test):
    """Remove the 'patient_id' column from both frames in place and return them."""
    for frame in (train, X_test):
        frame.drop(columns='patient_id', inplace=True)

    return train, X_test

In [61]:
def scale_numerical_columns(train, X_test):
    """Robust-scale the numeric columns that are not two-valued.

    A numeric column of `train` is scaled unless it has exactly two distinct
    values (those are treated as binary flags and left alone). The scaler is
    fitted on `train` and applied to both frames, which are modified in
    place and returned.
    """
    # Step 1: numeric columns with anything other than exactly two distinct values
    numerical = train.select_dtypes(include=np.number).columns
    non_binary = [col for col in numerical if train[col].nunique() != 2]

    # Nothing to scale: fitting a scaler on zero columns would raise
    if not non_binary:
        return train, X_test

    # Step 2: fit on train only, then apply to both frames.
    # Whole-column assignment (`frame[cols] = ...`) replaces the columns and
    # lets them become float; `.loc[:, cols] = ...` tries to write the scaled
    # floats into the existing (possibly integer) columns and triggers
    # pandas' incompatible-dtype warning.
    scaler = RobustScaler()
    train[non_binary] = scaler.fit_transform(train[non_binary])
    X_test[non_binary] = scaler.transform(X_test[non_binary])

    return train, X_test

In [62]:
def select_features_with_models(train, X_test, target_binary, top_n=30, min_votes=2,
                                rfe_features=('inpatient_visits_in_previous_year', 'hospital_visits')):
    """Keep the features chosen by at least `min_votes` selection methods.

    Three methods each cast one vote per feature:

    1. Random forest importance - the `top_n` most important features.
    2. RFE - a fixed list (`rfe_features`). RFE with a random forest took
       more than 10 minutes and always returned the default two features,
       so its result is hard-coded instead of recomputed.
    3. LassoCV - every feature with a non-zero coefficient.

    Parameters
    ----------
    train, X_test : pandas.DataFrame
        Fully numeric feature frames with the same columns.
    target_binary : array-like
        Binary target aligned with `train`.
    top_n : int, default 30
        Number of random-forest features that receive a vote.
    min_votes : int, default 2
        Minimum number of votes a feature needs to be kept.
    rfe_features : iterable of str
        Feature names that receive the RFE vote.

    Returns
    -------
    tuple
        (final_train, final_X_test) restricted to the kept features, ordered
        by the first method that voted for them.
    """
    votes = {}

    def _cast_votes(features):
        # dict keeps first-seen order, which fixes the output column order
        for feature in features:
            votes[feature] = votes.get(feature, 0) + 1

    # Step 1: Random Forest feature importance
    rf_model = RandomForestClassifier(class_weight='balanced', random_state=42, max_depth=6, n_jobs=-1)
    rf_model.fit(train, target_binary)
    ranked = np.argsort(rf_model.feature_importances_)[::-1]
    _cast_votes(train.columns[ranked[:top_n]])

    # Step 2: pre-computed RFE result (see docstring)
    _cast_votes(rfe_features)

    # Step 3: Lasso feature selection
    reg = LassoCV().fit(train, target_binary)
    coef = pd.Series(reg.coef_, index=train.columns)
    print("Lasso picked " + str(int((coef != 0).sum())) + " variables and eliminated the other " + str(int((coef == 0).sum())) + " variables")
    _cast_votes(coef[coef != 0].index.tolist())

    # Step 4: keep features with enough votes
    keep = [feature for feature, count in votes.items() if count >= min_votes]

    # Step 5: subset both frames to the same columns
    return train[keep], X_test[keep]

In [63]:
# Split the raw training frame into features and the two targets
train, target_multiclass, target_binary = prepare_data(df)

# Engineered visit-count and ratio features
train, X_test = process_data(train, X_test)

# Placeholder clean-up
train = clean_data(train)
X_test = clean_data(X_test)

# Age brackets -> numeric midpoints
train = process_age_column(train)
X_test = process_age_column(X_test)

train, target_binary = drop_newborn_outliers(train, target_binary)
# Keep the multiclass target aligned with the rows that survived the drop:
# only target_binary is filtered by drop_newborn_outliers, which would
# otherwise leave target_multiclass with rows that no longer exist in train.
target_multiclass = target_multiclass.loc[train.index]
# NOTE(review): this removes rows from the test set as well; if a prediction
# is required for every test encounter, these rows must not be dropped.
X_test = drop_newborn_outliers1(X_test)

# Patient-level fills first, then global imputation for whatever is left
train, X_test = fill_missing_age(train, X_test)

train, X_test = fill_missing_race(train, X_test)

train, X_test = impute_missing_values(train, X_test)

# Diagnosis codes -> categories -> 0/1 flags
train = map_icd9_to_category(train)
X_test = map_icd9_to_category(X_test)

train, X_test = create_diagnosis_columns(train, X_test)

# Categorical encodings
train, X_test, target_binary = map_categorical_to_numeric(train, X_test, target_binary)

train, X_test = label_encode_admission_type(train, X_test)

train, X_test = map_admission_source(train, X_test)

train, X_test = one_hot_encode_and_combine(train, X_test)

train, X_test = drop_patient_id_column(train, X_test)

# Scaling and feature selection
train, X_test = scale_numerical_columns(train, X_test)

final_train, final_X_test = select_features_with_models(train, X_test, target_binary)

  train.loc[:, non_binary] = scaler.fit_transform(train[non_binary])
  X_test.loc[:, non_binary] = scaler.transform(X_test[non_binary])


Lasso picked 61 variables and eliminated the other 397 variables


## Model training

In [71]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [65]:
seed = 1

# Hold out 30% of the selected-feature training data for validation.
# `seed` is passed through so the split is controlled from one place
# (it was defined but the literal 1 was used instead).
X_train, X_val, y_train, y_val = train_test_split(final_train, target_binary, test_size=0.3, random_state=seed)

# Rename the column whose name contains brackets and quotes.
# NOTE(review): presumably needed because XGBoost rejects feature names
# containing '[' or ']' - confirm.
# Reassigning instead of renaming in place avoids mutating the frames
# returned by train_test_split.
insulin_rename = {"medication_['insulin']": "medication_insulin"}
X_train = X_train.rename(columns=insulin_rename)
X_val = X_val.rename(columns=insulin_rename)

In [66]:
# Baseline: logistic regression with max_iter=200, fitted on the training split
# NOTE(review): in the recorded run, recall for class 1 is 0.04 while accuracy
# is 0.89 - accuracy mostly reflects the majority class here. Consider
# class_weight='balanced' and judging the model on recall / F1 for class 1.
model = LogisticRegression(max_iter = 200)
model.fit(X_train, y_train)

# Evaluate the model on the validation split
y_pred_lr = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred_lr)
conf_matrix = confusion_matrix(y_val, y_pred_lr)
class_report = classification_report(y_val, y_pred_lr)

# Print the results
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 0.8875058493214787
Confusion Matrix:
[[18874   142]
 [ 2262    92]]
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.99      0.94     19016
           1       0.39      0.04      0.07      2354

    accuracy                           0.89     21370
   macro avg       0.64      0.52      0.51     21370
weighted avg       0.84      0.89      0.84     21370



In [67]:
# Decision tree with a fixed random_state, fitted on the training split
# NOTE(review): in the recorded run, recall for class 1 is 0.22 with
# precision 0.19, at an accuracy of 0.81.
dt_model = DecisionTreeClassifier(random_state=1)
dt_model.fit(X_train, y_train)

# Evaluate the model on the validation split
y_pred_dt = dt_model.predict(X_val)
accuracy_dt = accuracy_score(y_val, y_pred_dt)
conf_matrix_dt = confusion_matrix(y_val, y_pred_dt)
class_report_dt = classification_report(y_val, y_pred_dt)

# Print the results
print(f"Decision Tree Accuracy: {accuracy_dt}")
print("Decision Tree Confusion Matrix:")
print(conf_matrix_dt)
print("Decision Tree Classification Report:")
print(class_report_dt)

Decision Tree Accuracy: 0.8099204492278895
Decision Tree Confusion Matrix:
[[16796  2220]
 [ 1842   512]]
Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.88      0.89     19016
           1       0.19      0.22      0.20      2354

    accuracy                           0.81     21370
   macro avg       0.54      0.55      0.55     21370
weighted avg       0.82      0.81      0.82     21370



In [72]:
# Define the parameter grid (3 * 4 * 3 * 3 * 2 = 216 candidates)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Instantiate the Random Forest classifier
rf_model = RandomForestClassifier(random_state=1)

# Instantiate the GridSearchCV object (5-fold CV, all cores)
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score (accuracy for scikit-learn classifiers). In the
# recorded run the selected model has recall 0.02 for class 1; consider
# scoring='f1' or 'recall' if class 1 is the class of interest.
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, 
                           cv=5, n_jobs=-1, verbose=2)

# Fit the grid search to the data (1080 fits in the recorded run)
grid_search.fit(X_train, y_train)

# Get the best parameters and the best estimator
best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

# Evaluate the best model on the validation split
y_pred_best_rf = best_rf_model.predict(X_val)
accuracy_best_rf = accuracy_score(y_val, y_pred_best_rf)
conf_matrix_best_rf = confusion_matrix(y_val, y_pred_best_rf)
class_report_best_rf = classification_report(y_val, y_pred_best_rf)

# Print the results
print(f"Best Parameters: {best_params}")
print(f"Best Random Forest Accuracy: {accuracy_best_rf}")
print("Best Random Forest Confusion Matrix:")
print(conf_matrix_best_rf)
print("Best Random Forest Classification Report:")
print(class_report_best_rf)


Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best Parameters: {'bootstrap': False, 'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
Best Random Forest Accuracy: 0.8891904539073467
Best Random Forest Confusion Matrix:
[[18952    64]
 [ 2304    50]]
Best Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     19016
           1       0.44      0.02      0.04      2354

    accuracy                           0.89     21370
   macro avg       0.67      0.51      0.49     21370
weighted avg       0.84      0.89      0.84     21370



In [69]:
# Linear support vector classifier (max_iter=5000), fitted on the training split
# NOTE(review): in the recorded run, recall for class 1 is 0.02 at an
# accuracy of 0.89.
svc_model = LinearSVC(max_iter = 5000, random_state=1)
svc_model.fit(X_train, y_train)

# Evaluate the model on the validation split
y_pred_svc= svc_model.predict(X_val)
accuracy_svc = accuracy_score(y_val, y_pred_svc)
conf_matrix_svc = confusion_matrix(y_val, y_pred_svc)
class_report_svc = classification_report(y_val, y_pred_svc)

# Print the results
print(f"SVC Accuracy: {accuracy_svc}")
print("SVC Confusion Matrix:")
print(conf_matrix_svc)
print("SVC Classification Report:")
print(class_report_svc)



SVC Accuracy: 0.8890500701918578
SVC Confusion Matrix:
[[18943    73]
 [ 2298    56]]
SVC Classification Report:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     19016
           1       0.43      0.02      0.05      2354

    accuracy                           0.89     21370
   macro avg       0.66      0.51      0.49     21370
weighted avg       0.84      0.89      0.84     21370





In [74]:
# Parameter grid for XGBoost (3^5 = 243 candidates); this rebinds the
# `param_grid` name used by the random forest search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Instantiate the XGBoost classifier
# NOTE(review): `use_label_encoder` is reported as deprecated by recent
# XGBoost releases - confirm against the installed version.
xgb_model = xgb.XGBClassifier(random_state=1, use_label_encoder=False, eval_metric='logloss')

# Instantiate the GridSearchCV object (5-fold CV, all cores); this rebinds
# `grid_search`, so the random forest search object is no longer reachable.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score; the recorded run selects a model with recall
# 0.02 for class 1.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, 
                           cv=5, n_jobs=-1, verbose=2)

# Fit the grid search to the data (1215 fits in the recorded run)
grid_search.fit(X_train, y_train)

# Get the best parameters and the best estimator
best_params = grid_search.best_params_
best_xgb_model = grid_search.best_estimator_

# Evaluate the best model on the validation split
y_pred_best_xgb = best_xgb_model.predict(X_val)
accuracy_best_xgb = accuracy_score(y_val, y_pred_best_xgb)
conf_matrix_best_xgb = confusion_matrix(y_val, y_pred_best_xgb)
class_report_best_xgb = classification_report(y_val, y_pred_best_xgb)

# Print the results
print(f"Best Parameters: {best_params}")
print(f"Best XGBoost Accuracy: {accuracy_best_xgb}")
print("Best XGBoost Confusion Matrix:")
print(conf_matrix_best_xgb)
print("Best XGBoost Classification Report:")
print(class_report_best_xgb)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}
Best XGBoost Accuracy: 0.8894244267664951
Best XGBoost Confusion Matrix:
[[18962    54]
 [ 2309    45]]
Best XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     19016
           1       0.45      0.02      0.04      2354

    accuracy                           0.89     21370
   macro avg       0.67      0.51      0.49     21370
weighted avg       0.84      0.89      0.84     21370

