### Import Data & Packages

In [None]:
import pandas as pd
import numpy as np
import statistics
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer 
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, ParameterGrid
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

In [None]:
# 'unknown' and 999 are placeholder codes for missing values, so parse them as NaN
df = pd.read_csv('bank-additional-full.csv', sep = ';', na_values = ['unknown', 999])

# remove future information:
#   - 'duration' is dropped outright
#   - 'campaign' is replaced by 'ncalls' = campaign - 1 (appended as the last column)
df = df.assign(ncalls = df['campaign'] - 1).drop(columns = ['duration', 'campaign'])
print(df.shape)

# this is a classification problem: encode the target as yes -> 1, no -> 0
y = df['y'].replace({'yes': 1, 'no': 0})
X = df.drop(columns = 'y')

### Define Preprocessor

In [None]:
# Nominal features: routed to the one-hot ('cat') branch of the preprocessor.
cat_ftrs = ['job', 'marital', 'default', 'housing', 'loan', 'contact', 'poutcome']

# Ordered features: routed to the ordinal ('ord') branch of the preprocessor.
ordinal_ftrs = ['education', 'month', 'day_of_week']
# One category list per entry of ordinal_ftrs, in the same order; a category's
# position in its list is the integer code produced by the OrdinalEncoder.
# 'NA' (first education level) is the fill value the ordinal imputer writes for
# missing entries.
# NOTE(review): the month list has no 'jan'/'feb' — presumably absent from the
# data; the encoder would reject them otherwise. Confirm against the CSV.
ordinal_cats = [['NA', 'illiterate', 'basic.4y', 'basic.6y', 'basic.9y', 'high.school', 
                 'professional.course', 'university.degree'],
                ['mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'],
                ['mon', 'tue', 'wed', 'thu', 'fri']]

# Numeric features: routed to the standard-scaling ('num') branch of the
# preprocessor. 'ncalls' is the column derived from 'campaign' at load time.
num_ftrs = ['age', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 
            'euribor3m', 'nr.employed', 'ncalls']

In [None]:
# numeric branch: standardise only (this pipeline has no imputation step)
numeric_transformer = Pipeline(steps = [
    ('scaler', StandardScaler()),
])

# ordinal branch: fill missing entries with the 'NA' level, map each feature to
# integer codes following ordinal_cats, then standardise the codes
ordinal_transformer = Pipeline(steps = [
    ('imputer2', SimpleImputer(fill_value = 'NA', strategy = 'constant')),
    ('ordinal', OrdinalEncoder(categories = ordinal_cats)),
    ('scaler', StandardScaler()),
])

# nominal branch: fill missing entries with the literal 'missing', one-hot encode
# (dense output, categories unseen during fit are ignored), then standardise
categorical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(fill_value = 'missing', strategy = 'constant')),
    ('onehot', OneHotEncoder(handle_unknown = 'ignore', sparse_output = False)),
    ('scaler', StandardScaler()),
])

# route each feature group to its branch; output columns are prefixed
# with the branch name ('num', 'cat', 'ord')
preprocessor = ColumnTransformer(transformers = [
    ('num', numeric_transformer, num_ftrs),
    ('cat', categorical_transformer, cat_ftrs),
    ('ord', ordinal_transformer, ordinal_ftrs),
])

# Maybe try 5 different random states?
# Maybe try 5 folds?

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

def MLpipe_kfold_reduced_features_rf(X, y, random_states, preprocessor, param_grid, n_splits = 3):
    """Evaluate a reduced-features random forest with stratified k-fold validation.

    For every random state, 20% of the data is held out as a test set and the
    rest is split into ``n_splits`` stratified folds. In every fold the
    preprocessor is fitted on the training part only. The test rows are then
    grouped by their missing-value pattern; for each pattern the columns that
    are missing in that pattern are dropped, training/validation rows that
    still contain NaN are removed, and a random forest is tuned over
    ``param_grid`` by validation average precision (AP). The tuned model
    predicts the test rows of its pattern, and the predictions of all
    patterns are combined into one test AP (and one confusion matrix plot)
    per fold.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix (may contain NaN).
    y : pandas.Series
        Binary target aligned with ``X``.
    random_states : iterable of int
        Seeds; each one drives the train/test split, the fold assignment and
        (multiplied by 42) the random forest.
    preprocessor : sklearn transformer
        Must provide ``fit_transform``, ``transform`` and
        ``get_feature_names_out``.
    param_grid : dict
        Grid accepted by ``sklearn.model_selection.ParameterGrid``.
    n_splits : int, default 3
        Number of stratified folds.

    Returns
    -------
    overall_mean_ap : float
        Mean over random states of the per-state mean test AP.
    overall_std_ap : float
        Standard deviation of the per-state mean test AP.
    ap_scores_random_state : dict
        Maps each random state to its mean test AP over the folds.
    """
    ap_scores_random_state = {}
    mean_ap_scores = []  # one mean test AP per random state
    for random_state in random_states:
        ap_scores_test = []  # test AP of each fold for this random state
        kf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = random_state)
        X_other, X_test, y_other, y_test = train_test_split(X, y, test_size = 0.2, random_state = random_state)
        for train_index, val_index in kf.split(X_other, y_other):
            X_train = X_other.iloc[train_index]
            y_train = y_other.iloc[train_index]
            X_val = X_other.iloc[val_index]
            y_val = y_other.iloc[val_index]

            # preprocessing: fit on the training part only. The resulting frames
            # carry a fresh 0..n-1 index, so their index labels double as row
            # positions into y_train / y_val / y_test.
            X_prep = preprocessor.fit_transform(X_train)
            feature_names = preprocessor.get_feature_names_out()
            df_train = pd.DataFrame(data = X_prep, columns = feature_names)
            df_val = pd.DataFrame(data = preprocessor.transform(X_val), columns = feature_names)
            df_test = pd.DataFrame(data = preprocessor.transform(X_test), columns = feature_names)

            mask = df_test.isnull().to_numpy()
            unique_rows = np.unique(mask, axis = 0)  # one row per missing-value pattern

            # Test predictions are stored by row position, so they stay aligned
            # with y_test without any re-sorting.
            test_pred_prob = np.full(len(df_test), np.nan)
            test_pred = np.empty(len(df_test), dtype = np.asarray(y_test).dtype)

            for pattern in unique_rows:
                # every test row matches exactly one pattern
                test_rows = np.flatnonzero((mask == pattern).all(axis = 1))
                kept_cols = df_test.columns[~pattern]
                sub_X_test = df_test.iloc[test_rows][kept_cols]

                # keep only the fully observed training / validation rows
                sub_X_train = df_train[kept_cols].dropna()
                sub_X_val = df_val[kept_cols].dropna()
                sub_y_train_array = y_train.iloc[sub_X_train.index].values.ravel()
                sub_y_val_array = y_val.iloc[sub_X_val.index].values.ravel()

                best_ap = -np.inf
                best_model = None
                for params in ParameterGrid(param_grid):
                    # A fresh estimator per parameter set: refitting one shared
                    # object would silently overwrite the current best model.
                    ML_algo = RandomForestClassifier(random_state = 42 * random_state, n_jobs = -1)
                    ML_algo.set_params(**params)
                    ML_algo.fit(sub_X_train, sub_y_train_array)
                    pred_prob_val = ML_algo.predict_proba(sub_X_val)[:, 1]
                    ap = average_precision_score(sub_y_val_array, pred_prob_val)
                    if best_model is None or ap > best_ap:
                        best_ap = ap
                        best_model = ML_algo

                test_pred_prob[test_rows] = best_model.predict_proba(sub_X_test)[:, 1]
                test_pred[test_rows] = best_model.predict(sub_X_test)

            # one test AP and one confusion matrix per fold
            ap_scores_test.append(average_precision_score(y_test, test_pred_prob))
            cm = confusion_matrix(y_test, test_pred)
            disp = ConfusionMatrixDisplay(cm, display_labels = ['Class 0', 'Class 1'])
            fig, ax = plt.subplots(figsize = (5, 3))
            disp.plot(ax = ax)
            plt.tight_layout()
            plt.show()

        # mean test AP for this random_state
        mean_ap = np.mean(ap_scores_test)
        mean_ap_scores.append(mean_ap)
        ap_scores_random_state[random_state] = mean_ap

    # mean & std of the per-state AP across all random states
    overall_mean_ap = np.mean(mean_ap_scores)
    overall_std_ap = np.std(mean_ap_scores)

    return overall_mean_ap, overall_std_ap, ap_scores_random_state

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, ParameterGrid
from sklearn.metrics import precision_recall_curve, average_precision_score, confusion_matrix, ConfusionMatrixDisplay
import pandas as pd
import numpy as np

def MLpipe_kfold_reduced_features_rf(X, y, random_states, preprocessor, param_grid, n_splits=3):
    """Evaluate a reduced-features random forest, scored by average precision (AP).

    For every random state, 20% of the data is held out as a test set and the
    rest is split into ``n_splits`` stratified folds. In every fold the
    preprocessor is fitted on the training part only and the test rows are
    grouped by missing-value pattern. For each pattern the columns missing in
    that pattern are dropped, training/validation rows that still contain NaN
    are removed, and a random forest is tuned over ``param_grid`` by
    validation AP. The tuned model is scored on the test rows of its pattern
    and a confusion matrix for those rows is plotted.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix (may contain NaN).
    y : pandas.Series
        Binary target aligned with ``X``.
    random_states : iterable of int
        Seeds; each one drives the train/test split, the fold assignment and
        (multiplied by 42) the random forest.
    preprocessor : sklearn transformer
        Must provide ``fit_transform``, ``transform`` and
        ``get_feature_names_out``.
    param_grid : dict
        Grid accepted by ``sklearn.model_selection.ParameterGrid``.
    n_splits : int, default 3
        Number of stratified folds.

    Returns
    -------
    overall_mean_ap : float
        Mean over random states of the per-state mean test AP.
    overall_std_ap : float
        Standard deviation of the per-state mean test AP.
    ap_scores_random_state : dict
        Maps each random state to the unweighted mean of its test AP values
        (one value per missing-value pattern per fold).
    """
    ap_scores_random_state = {}
    mean_ap_scores = []  # one mean test AP per random state

    for random_state in random_states:
        ap_scores_test = []  # one test AP per (fold, missing-value pattern)
        kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

        for train_index, val_index in kf.split(X_other, y_other):
            X_train, y_train = X_other.iloc[train_index], y_other.iloc[train_index]
            X_val, y_val = X_other.iloc[val_index], y_other.iloc[val_index]

            # Preprocessing: fit on the training part only. The resulting frames
            # carry a fresh 0..n-1 index, so their index labels double as row
            # positions into y_train / y_val / y_test.
            X_train_prep = preprocessor.fit_transform(X_train)
            feature_names = preprocessor.get_feature_names_out()
            df_train = pd.DataFrame(data=X_train_prep, columns=feature_names)
            df_val = pd.DataFrame(data=preprocessor.transform(X_val), columns=feature_names)
            df_test = pd.DataFrame(data=preprocessor.transform(X_test), columns=feature_names)

            mask = df_test.isnull().to_numpy()
            unique_rows = np.unique(mask, axis=0)  # one row per missing-value pattern

            for pattern in unique_rows:
                # Select the test rows with this pattern in one vectorised
                # comparison (row order is preserved).
                test_rows = np.flatnonzero((mask == pattern).all(axis=1))
                kept_cols = df_test.columns[~pattern]
                sub_X_test = df_test.iloc[test_rows][kept_cols]
                sub_y_test = y_test.iloc[test_rows]

                # Keep only the fully observed training / validation rows.
                sub_X_train = df_train[kept_cols].dropna()
                sub_X_val = df_val[kept_cols].dropna()
                sub_y_train, sub_y_val = y_train.iloc[sub_X_train.index], y_val.iloc[sub_X_val.index]

                # Tune the forest on validation AP.
                best_ap = -np.inf
                best_model = None
                for params in ParameterGrid(param_grid):
                    # A fresh estimator per parameter set: refitting one shared
                    # object would make best_model always point at the model
                    # fitted last, not the one with the best validation AP.
                    ML_algo = RandomForestClassifier(random_state=42 * random_state, n_jobs=-1)
                    ML_algo.set_params(**params)
                    ML_algo.fit(sub_X_train, sub_y_train)

                    pred_prob_val = ML_algo.predict_proba(sub_X_val)[:, 1]
                    ap = average_precision_score(sub_y_val, pred_prob_val)

                    # "is None" guard: a model is always selected, even when
                    # every validation AP is 0 or NaN.
                    if best_model is None or ap > best_ap:
                        best_ap = ap
                        best_model = ML_algo

                # Score the selected model on this pattern's test rows.
                pred_prob_test = best_model.predict_proba(sub_X_test)[:, 1]
                test_ap = average_precision_score(sub_y_test, pred_prob_test)
                ap_scores_test.append(test_ap)

                # Optional: display confusion matrix. labels= pins both classes
                # so the matrix stays 2x2 even if a subset holds a single class.
                sub_y_test_pred = best_model.predict(sub_X_test)
                cm = confusion_matrix(sub_y_test, sub_y_test_pred, labels=best_model.classes_)
                disp = ConfusionMatrixDisplay(cm, display_labels=['Class 0', 'Class 1'])
                disp.plot()

        # Average AP score for this random state
        mean_ap = np.mean(ap_scores_test)
        mean_ap_scores.append(mean_ap)
        ap_scores_random_state[random_state] = mean_ap

    # Overall mean and std AP across all random states
    overall_mean_ap = np.mean(mean_ap_scores)
    overall_std_ap = np.std(mean_ap_scores)

    return overall_mean_ap, overall_std_ap, ap_scores_random_state


In [None]:
random_states = [42, 123]
param_grid = {
    'max_depth': [1, 3, 9, 12, None],
    'max_features': [0.15, 0.5, 0.75, 1, None]
}

# The pipeline returns average-precision (AP) statistics, not recall. The
# variable names are kept unchanged for any cell that still reads them; only
# the printed labels are corrected.
mean_recall_rf, std_recall_rf, recall_random_state_rf = MLpipe_kfold_reduced_features_rf(
    X, y, random_states, preprocessor, param_grid, n_splits = 3)
print('Random forest mean average precision:', mean_recall_rf)
print('Random forest std average precision:', std_recall_rf)

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

def MLpipe_kfold_reduced_features_knn(X, y, random_states, preprocessor, param_grid, n_splits = 3):
    """Evaluate a reduced-features k-nearest-neighbours classifier by test accuracy.

    For every random state, 20% of the data is held out as a test set and the
    rest is split into ``n_splits`` folds. In every fold the preprocessor is
    fitted on the training part only and the test rows are grouped by
    missing-value pattern. For each pattern the columns missing in that
    pattern are dropped, training/validation rows that still contain NaN are
    removed, and a KNN model is tuned over ``param_grid`` by validation
    accuracy (training and validation accuracy per grid entry are plotted).
    The predictions of all patterns are combined into one test accuracy per
    fold.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix (may contain NaN).
    y : pandas.Series
        Target aligned with ``X``.
    random_states : iterable of int
        Seeds for the train/test split and the fold assignment.
    preprocessor : sklearn transformer
        Must provide ``fit_transform``, ``transform`` and
        ``get_feature_names_out``.
    param_grid : dict
        Grid of ``KNeighborsClassifier`` parameters accepted by
        ``sklearn.model_selection.ParameterGrid``.
    n_splits : int, default 3
        Number of folds.

    Returns
    -------
    overall_mean_accuracy : float
        Mean over random states of the per-state mean test accuracy.
    overall_std_accuracy : float
        Standard deviation of the per-state mean test accuracy.
    accuracy_scores_random_state : dict
        Maps each random state to its mean test accuracy over the folds.
    """
    accuracy_scores_random_state = {}
    mean_accuracy_scores = []  # one mean test accuracy per random state
    for random_state in random_states:
        # Reset per random state so each per-state mean only covers its own folds.
        accuracy_scores_test = []
        kf = KFold(n_splits = n_splits, shuffle = True, random_state = random_state)
        X_other, X_test, y_other, y_test = train_test_split(X, y, test_size = 0.2, random_state = random_state)
        for train_index, val_index in kf.split(X_other, y_other):
            X_train = X_other.iloc[train_index]
            y_train = y_other.iloc[train_index]
            X_val = X_other.iloc[val_index]
            y_val = y_other.iloc[val_index]

            # preprocessing: fit on the training part only. The resulting frames
            # carry a fresh 0..n-1 index, so their index labels double as row
            # positions into y_train / y_val / y_test.
            X_prep = preprocessor.fit_transform(X_train)
            feature_names = preprocessor.get_feature_names_out()
            df_train = pd.DataFrame(data = X_prep, columns = feature_names)
            df_val = pd.DataFrame(data = preprocessor.transform(X_val), columns = feature_names)
            df_test = pd.DataFrame(data = preprocessor.transform(X_test), columns = feature_names)

            mask = df_test.isnull().to_numpy()
            unique_rows = np.unique(mask, axis = 0)  # one row per missing-value pattern

            # Predictions are stored by row position, so they stay aligned with
            # y_test. y_test itself is never re-sorted: doing so inside the fold
            # loop would misalign it with X_test from the second fold on.
            all_y_test_pred = np.empty(len(df_test), dtype = np.asarray(y_test).dtype)

            for pattern in unique_rows:
                # every test row matches exactly one pattern
                test_rows = np.flatnonzero((mask == pattern).all(axis = 1))
                kept_cols = df_test.columns[~pattern]
                sub_X_test = df_test.iloc[test_rows][kept_cols]

                # keep only the fully observed training / validation rows
                sub_X_train = df_train[kept_cols].dropna()
                sub_X_val = df_val[kept_cols].dropna()
                sub_y_train_array = y_train.iloc[sub_X_train.index].values.ravel()
                sub_y_val_array = y_val.iloc[sub_X_val.index].values.ravel()

                pg = ParameterGrid(param_grid)
                train_scores = np.zeros(len(pg))
                val_scores = np.zeros(len(pg))
                models = []

                for p, params in enumerate(pg):
                    # fresh estimator per grid entry so that models[p] really is
                    # the model fitted with params p
                    ML_algo = KNeighborsClassifier(n_jobs = -1)
                    ML_algo.set_params(**params)
                    ML_algo.fit(sub_X_train, sub_y_train_array)
                    models.append(ML_algo)
                    train_scores[p] = accuracy_score(sub_y_train_array, ML_algo.predict(sub_X_train))
                    val_scores[p] = accuracy_score(sub_y_val_array, ML_algo.predict(sub_X_val))

                # plot train_scores and val_scores across the grid
                plt.figure(figsize = (5, 3))
                plt.plot(train_scores, label = 'Training Accuracy')
                plt.plot(val_scores, label = 'Validation Accuracy')
                plt.legend()
                plt.show()

                # first grid entry with the highest validation accuracy
                best_model = models[int(np.argmax(val_scores))]
                all_y_test_pred[test_rows] = best_model.predict(sub_X_test)

            # test accuracy of this fold (n_splits scores per random_state)
            accuracy_test = accuracy_score(y_test, all_y_test_pred)
            accuracy_scores_test.append(accuracy_test)

        # mean test accuracy for this random_state
        mean_accuracy = np.mean(accuracy_scores_test)
        mean_accuracy_scores.append(mean_accuracy)
        accuracy_scores_random_state[random_state] = mean_accuracy

    # mean & std accuracy for all random states
    overall_mean_accuracy = np.mean(mean_accuracy_scores)
    overall_std_accuracy = np.std(mean_accuracy_scores)

    return overall_mean_accuracy, overall_std_accuracy, accuracy_scores_random_state

In [None]:
random_states = [42, 123]
# KNeighborsClassifier has no 'max_depth' / 'max_features' (those are tree
# parameters and make set_params raise ValueError); tune the neighbourhood
# size and the vote weighting instead.
param_grid = {
    'n_neighbors': [1, 3, 10, 30, 100],
    'weights': ['uniform', 'distance']
}

mean_accuracy_knn, std_accuracy_knn, accuracy_random_state_knn = MLpipe_kfold_reduced_features_knn(
    X, y, random_states, preprocessor, param_grid, n_splits = 3)
print('KNN mean accuracy:', mean_accuracy_knn)
print('KNN std accuracy:', std_accuracy_knn)

In [None]:
# Scratch check: boolean missing-value mask of df_test and its number of rows.
# NOTE(review): in the visible cells df_test is only assigned inside the
# pipeline functions — confirm it exists at notebook scope before running.
mask = df_test.isnull()
len(mask)

In [None]:
# Scratch check: distinct rows of the boolean mask, i.e. the missing-value patterns.
np.array(np.unique(mask, axis = 0))

In [None]:
# Scratch check: number of missing values in each column of X.
X.isnull().sum(axis = 0)

In [None]:
# Scratch check: X without the 'pdays' column.
# NOTE(review): the per-column missing counts on the middle line are computed but
# not assigned or printed; in a notebook only the final len() would be displayed.
X_pdays = X.drop(columns = ['pdays'])
X_pdays.isnull().sum(axis = 0)
len(X_pdays)

In [None]:
# Missing-value patterns of the raw feature matrix and how many rows share each
# one; the reduced-features approach trains one model per pattern.
# (The previous column selection named columns that X does not contain.)
mask = X.isnull()

unique_rows, counts = np.unique(mask, axis=0,return_counts=True)
print(unique_rows.shape) # (number of patterns, number of columns)
for i in range(len(counts)):
    print(unique_rows[i],counts[i])

In [None]:
# Scratch check: print the training target values reshaped into a single column.
# NOTE(review): sub_y_train is only assigned inside functions or by a later
# scratch cell in the visible code — confirm it exists at notebook scope.
print(sub_y_train.values.reshape(-1, 1))

In [None]:
# Scratch check: type of sub_y_train_mod.
# NOTE(review): sub_y_train_mod is not assigned in any visible cell.
type(sub_y_train_mod)

In [None]:
# Scratch check: type of sub_y_train.
type(sub_y_train)

In [None]:
# Scratch: wrap the training target in a DataFrame (rebinds sub_y_train).
sub_y_train = pd.DataFrame(data = sub_y_train)

In [None]:
# Scratch: fit a default SVC on the current training subset and predict the
# validation subset.
# NOTE(review): SVC is not imported and sub_y_train_mod is not assigned in any
# visible cell — confirm both before running.
clf = SVC()
clf.fit(sub_X_train, sub_y_train_mod)
sub_y_val_pre = clf.predict(sub_X_val)

In [None]:
# Scratch check: y_train as a flat 1-D array (reshaped to a single row, then ravelled).
np.reshape(np.array(y_train), (1, -1)).ravel()

In [None]:
# Scratch check: missing-value patterns of df_test with the number of rows per
# pattern; the final expression shows the second pattern.
# NOTE(review): in the visible cells df_test is only assigned inside the
# pipeline functions — confirm it exists at notebook scope.
mask = df_test.isnull()
unique_rows, counts = np.unique(mask, axis=0,return_counts=True)
print(unique_rows.shape) # 2 patterns, we will train 2 models (one model for each pattern)
for i in range(len(counts)):
    print(unique_rows[i],counts[i])
unique_rows[1]

In [None]:
# Scratch: training columns that are not missing in pattern i
# (i is whatever value an earlier loop left behind).
sub_X_train = df_train[df_train.columns[~unique_rows[i]]]

In [None]:
# Scratch: reset sub_X_train to an empty frame, then display the training
# columns that are not missing in the second pattern (result is not assigned).
sub_X_train = pd.DataFrame()
df_train[df_train.columns[~unique_rows[1]]]

In [None]:
# Scratch: select from sub_X_test the columns not missing in pattern i.
# NOTE(review): the column labels come from X_test here, whereas the pipeline
# functions use the preprocessed df_test columns — the mask length may not match.
sub_X_test[X_test.columns[~unique_rows[i]]]

In [None]:
# Scratch check: df_test column labels that are not missing in the first pattern.
df_test.columns[~unique_rows[0]]

In [None]:
# Scratch: start from an empty frame, append the second row of df_test as a
# one-row frame, and display the result.
sub_X_test = pd.DataFrame()
sub_X_test = pd.concat([sub_X_test, df_test.iloc[[1]]])
sub_X_test

In [None]:
import xgboost
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
# Scratch: run the XGBoost regression helper on the current train / validation /
# test frames with all printing disabled (verbose = 0).
# NOTE(review): xgb_model is defined in a later cell of this file — that cell
# has to be executed first.
xgb_model(X_train = df_train, Y_train = y_train, X_CV = df_val, y_CV = y_val, X_test = df_test, y_test = y_test,
          verbose = 0)

In [None]:
def xgb_model(X_train, Y_train, X_CV, y_CV, X_test, y_test, verbose=1):
    """Tune an XGBoost regressor on a validation set and score it on a test set.

    Every parameter set of the internal grid is fitted with early stopping on
    ``(X_CV, y_CV)`` and scored by validation mean squared error (MSE). The
    parameter set with the LOWEST validation MSE is refitted and evaluated on
    the test set.

    Parameters
    ----------
    X_train, Y_train : array-like
        Training features and target.
    X_CV, y_CV : array-like
        Validation features and target (early stopping and model selection).
    X_test, y_test : array-like
        Test features and target.
    verbose : int, default 1
        >= 1 prints the test MSE, >= 2 the test predictions, >= 3 the feature
        importances, >= 4 the selected parameters, >= 5 grid progress.

    Returns
    -------
    tuple
        ``(test_mse, y_test_pred, feature_importances)``.
    """
    # Imported here so the helper does not depend on a scratch cell having
    # been executed first.
    import xgboost
    from sklearn.metrics import mean_squared_error

    # flatten the targets to 1-D to avoid an sklearn/xgb shape warning
    Y_train = np.asarray(Y_train).ravel()
    y_CV = np.asarray(y_CV).ravel()
    y_test = np.asarray(y_test).ravel()

    # Early stopping is configured on the estimator, as in MLpipe_XGBoost;
    # predict() then uses the best iteration found during fit.
    XGB = xgboost.XGBRegressor(n_jobs=1, early_stopping_rounds=50)

    # find the best parameter set
    # can change the grid if you want
    param_grid = {"learning_rate": [0.03],
                  "n_estimators": [10000],
                  "seed": [0],
                  #"reg_alpha": [0e0, 1e-2, 1e-1, 1e0, 1e1, 1e2],
                  #"reg_lambda": [0e0, 1e-2, 1e-1, 1e0, 1e1, 1e2],
                  "missing": [np.nan],
                  #"max_depth": [1,3,10,30,100,],
                  "colsample_bytree": [0.9],
                  "subsample": [0.66]}

    pg = ParameterGrid(param_grid)
    eval_set = [(X_CV, y_CV)]
    scores = np.zeros(len(pg))  # validation MSE of each parameter set

    for i, params in enumerate(pg):
        if verbose >= 5:
            print("Param set " + str(i + 1) + " / " + str(len(pg)))
        XGB.set_params(**params)
        XGB.fit(X_train, Y_train, eval_set=eval_set, verbose=False)
        y_CV_pred = XGB.predict(X_CV)
        scores[i] = mean_squared_error(y_CV, y_CV_pred)

    # MSE is an error measure, so the best parameter set is the one with the
    # minimum validation score.
    best_params = pg[int(np.argmin(scores))]
    if verbose >= 4:
        print('Validation set min MSE and best parameters are:')
        print(np.min(scores))
        print(best_params)

    # test the model on the test set with the best parameter set
    XGB.set_params(**best_params)
    XGB.fit(X_train, Y_train, eval_set=eval_set, verbose=False)
    y_test_pred = XGB.predict(X_test)
    test_mse = mean_squared_error(y_test, y_test_pred)

    if verbose >= 1:
        print ('The MSE is:', test_mse)
    if verbose >= 2:
        print ('The predictions are:')
        print (y_test_pred)
    if verbose >= 3:
        print("Feature importances:")
        print(XGB.feature_importances_)

    return (test_mse, y_test_pred, XGB.feature_importances_)

### XGBoost

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import ParameterGrid
import seaborn as sns
import matplotlib.pyplot as plt

def MLpipe_XGBoost(X, y, preprocessor, random_states):
    """Tune and evaluate an XGBoost classifier over several random states.

    For every random state the data is split 60% / 20% / 20% into train,
    validation and test sets, the preprocessor is fitted on the training set,
    and each parameter set of the internal grid is fitted with early stopping
    on the validation set. The model with the highest validation average
    precision (AP) is kept and scored on the test set. The mean and standard
    deviation of the test AP are printed at the end.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Binary target aligned with ``X``.
    preprocessor : sklearn transformer
        Must provide ``fit_transform``, ``transform`` and
        ``get_feature_names_out``.
    random_states : iterable of int
        Seeds; each one, multiplied by 42, drives both splits and the classifier.

    Returns
    -------
    best_models : list
        Selected fitted classifier of each random state.
    test_scores : list of float
        Test AP of each selected classifier.
    test_sets : list of pandas.DataFrame
        Preprocessed test features with the target appended as column 'y',
        one frame per random state.
    """
    # the grid is the same for every random state
    param_grid = {
        'max_depth': [1, 3, 10, 30, 100],
        'learning_rate': [0.33],
        'n_estimators': [10000],
        'seed': [0],
        'reg_alpha': [0e0, 1e-2, 1e-1, 1e0, 1e1, 1e2],
        'subsample': [0.66],
        'colsample_bytree': [0.9]
    }

    best_models, test_sets = [], []
    train_scores, test_scores = [], []

    for random_state in random_states:
        seed = 42 * random_state

        # 60% train, then the remainder halved into validation and test
        X_train, X_other, y_train, y_other = train_test_split(
            X, y, train_size = 0.6, random_state = seed)
        X_val, X_test, y_val, y_test = train_test_split(
            X_other, y_other, train_size = 0.5, random_state = seed)

        # preprocessing: fit on the training set, reuse the fitted transformer
        train_matrix = preprocessor.fit_transform(X_train)
        feature_names = preprocessor.get_feature_names_out()
        df_train = pd.DataFrame(data = train_matrix, columns = feature_names)
        df_val = pd.DataFrame(data = preprocessor.transform(X_val), columns = feature_names)
        df_test = pd.DataFrame(data = preprocessor.transform(X_test), columns = feature_names)

        # keep this random state's test set: features and target side by side
        target_frame = pd.DataFrame(data = y_test, columns = ['y']).reset_index(drop = True)
        test_sets.append(pd.concat([df_test.reset_index(drop = True), target_frame], axis = 1))

        # grid search on validation AP
        best_ap_score, best_model = 0, None
        for params in ParameterGrid(param_grid):
            clf = XGBClassifier(random_state = seed)
            clf.set_params(**params, early_stopping_rounds = 50)
            clf.fit(df_train, y_train, eval_set = [(df_val, y_val)], verbose = False)
            ap_score = average_precision_score(y_val, clf.predict_proba(df_val)[:, 1])
            if ap_score > best_ap_score:
                best_ap_score, best_model = ap_score, clf

        best_models.append(best_model)
        train_scores.append(
            average_precision_score(y_train, best_model.predict_proba(df_train)[:, 1]))
        test_scores.append(
            average_precision_score(y_test, best_model.predict_proba(df_test)[:, 1]))

    print(f'Mean of test scores: {np.mean(test_scores)}')
    print(f'Standard deviation of test scores: {np.std(test_scores)}')
    return best_models, test_scores, test_sets

In [None]:
# Run the XGBoost pipeline for five random states; keeps the selected model,
# its test score and the saved test set of each state.
random_states = [42, 123, 456, 789, 101]
best_models, test_scores, test_sets = MLpipe_XGBoost(X, y, preprocessor, random_states)

In [None]:
# Feature importance scores of the third selected model (index 2), using
# importance type 'gain', shown as a dict ordered from highest to lowest score.
f = 'gain'
a = best_models[2].get_booster().get_score(importance_type = f)
sorted_a = sorted(a.items(), key = lambda x: x[1], reverse = True)
sorted_dict = dict(sorted_a)
sorted_dict