In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
from sklearn.model_selection import StratifiedShuffleSplit
import math

In [None]:
feature_imp_ohe = pd.read_csv('/content/drive/MyDrive/Indian_bank_data/rf_importances.csv')
feature_imp_label = pd.read_csv('/content/drive/MyDrive/Indian_bank_data/rf_importances_label.csv')

train_ohe = pd.read_csv('/content/drive/MyDrive/Indian_bank_data/train_tree.csv')
train_ohe = train_ohe[~train_ohe["Approved_Flag"].isna()]
train_label = pd.read_csv('/content/drive/MyDrive/Indian_bank_data/train_label.csv')
train_label = train_label[~train_label["Approved_Flag"].isna()]
train_lr = pd.read_csv('/content/drive/MyDrive/Indian_bank_data/train_lr.csv')
train_lr = train_lr[~train_lr["Approved_Flag"].isna()]

test_ohe = pd.read_csv('/content/drive/MyDrive/Indian_bank_data/test_tree.csv')
test_label = pd.read_csv('/content/drive/MyDrive/Indian_bank_data/test_label.csv')
test_ohe = test_ohe[~test_ohe["Approved_Flag"].isna()]
test_label = test_label[~test_label["Approved_Flag"].isna()]
test_lr = pd.read_csv('/content/drive/MyDrive/Indian_bank_data/test_lr.csv')
test_lr = test_lr[~test_lr["Approved_Flag"].isna()]

test_set_index = pd.read_csv('/content/drive/MyDrive/Indian_bank_data/test_set_index.csv')
test_set_index = test_set_index["index"].to_list()

test_ohe_100 = test_ohe.loc[test_set_index]
test_label_100 = test_label.loc[test_set_index]
test_lr_100 = test_lr.loc[test_set_index]

In [None]:
def evaluate_models_with_tuning(
    train_ohe,
    train_label,
    train_lr,
    test_ohe_100,
    test_label_100,
    test_lr_100,
    sizes,
    random_state=42,
    hyperparameter_tuning=False
):
    results = []

    # Define hyperparameter grids
    param_grids = {
        "RandomForest": {
            "n_estimators": [10, 50, 100, 200],
            "max_depth": [None, 10, 20, 30],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 4],
        },
        "DecisionTree": {
            "max_depth": [None, 10, 20, 30],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 4],
        },
        "XGBoost": {
            "n_estimators": [50, 100, 200],
            "max_depth": [3, 5, 10],
            "learning_rate": [0.01, 0.1, 0.2],
            "subsample": [0.6, 0.8, 1.0],
        },
    }

    # Iterate over training sizes
    for size in sizes:
        # Sampling training data
        stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=len(train_ohe) - size, random_state=random_state)
        train_idx, _ = next(stratified_split.split(train_ohe, train_ohe["Approved_Flag"]))

        sampled_train_ohe = train_ohe.iloc[train_idx]

        stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=len(train_lr) - size, random_state=random_state)
        train_idx, _ = next(stratified_split.split(train_lr, train_lr["Approved_Flag"]))
        sampled_train_lr = train_lr.iloc[train_idx]
        # sampled_indices_ohe = train_ohe.sample(size).index
        # sampled_train_ohe = train_ohe.loc[sampled_indices_ohe]
        # sampled_indices_lr = train_lr.sample(size).index
        # sampled_train_lr = train_lr.loc[sampled_indices_lr]

        X_train_tree = sampled_train_ohe.drop(columns="Approved_Flag")
        y_train_tree = sampled_train_ohe["Approved_Flag"]
        X_train_lr = sampled_train_lr.drop(columns="Approved_Flag")
        y_train_lr = sampled_train_lr["Approved_Flag"]

        X_test_tree = test_ohe_100.drop(columns="Approved_Flag")
        y_test_tree = test_label_100["Approved_Flag"]
        X_test_lr = test_lr_100.drop(columns="Approved_Flag")
        y_test_lr = test_lr_100["Approved_Flag"]

        # Models
        models_tree = {
            "RandomForest": RandomForestClassifier(),
            "DecisionTree": DecisionTreeClassifier(max_depth=int(math.log2(len(train_idx)))),
            "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
        }
        model_lr = LogisticRegression(max_iter=1000)

        # Train and evaluate tree-based models
        for model_name, model in models_tree.items():
            if hyperparameter_tuning:
                # Perform RandomizedSearchCV
                search = RandomizedSearchCV(
                    estimator=model,
                    param_distributions=param_grids[model_name],
                    n_iter=10,
                    scoring="roc_auc",
                    cv=3,
                    random_state=42,
                    n_jobs=-1
                )
                search.fit(X_train_tree, y_train_tree)
                model = search.best_estimator_
            else:
                model.fit(X_train_tree, y_train_tree)

            preds = model.predict(X_test_tree)
            if hasattr(model, "predict_proba"):
                probs = model.predict_proba(X_test_tree)
                if probs.shape[1] > 1:
                    probs = probs[:, 1]  # Take probabilities of the positive class
                else:
                    probs = probs[:, 0]  # Single column output
            else:
                # Use predictions as probabilities for models without `predict_proba`
                probs = preds
            print(model_name)
            print("preds")
            print(preds)
            print("probs")
            print(probs)
            results.append({
                "Model": model_name,
                "Train Size": size,
                "F1": f1_score(y_test_tree, preds),
                "ROCAUC": roc_auc_score(y_test_tree, probs),
                "PRAUC": average_precision_score(y_test_tree, probs)
            })

        # Train and evaluate logistic regression
        model_lr.fit(X_train_lr, y_train_lr)
        preds = model_lr.predict(X_test_lr)
        probs = model_lr.predict_proba(X_test_lr)[:, 1]  # Logistic regression always outputs two classes
        print("LR")
        print("preds")
        print(preds)
        print("probs")
        print(probs)
        results.append({
            "Model": "LogisticRegression",
            "Train Size": size,
            "F1": f1_score(y_test_lr, preds),
            "ROCAUC": roc_auc_score(y_test_lr, probs),
            "PRAUC": average_precision_score(y_test_lr, probs)
        })

    return pd.DataFrame(results)


In [None]:
# Initialize an empty list to store results
all_results = []
train_sizes = [128]
# Define number of repetitions
n_repeats = 10

# Run evaluation 10 times for both without and with hyperparameter tuning
for i in range(n_repeats):
    # Evaluate without hyperparameter tuning
    results_no_tuning = evaluate_models_with_tuning(
        train_ohe,
        train_label,
        train_lr,
        test_ohe_100,
        test_label_100,
        test_lr_100,
        train_sizes,
        random_state=i,
        hyperparameter_tuning=False
    )
    # Add iteration number and tuning type
    results_no_tuning["Iteration"] = i + 1
    results_no_tuning["Tuning"] = "No"

    # Evaluate with hyperparameter tuning
    results_with_tuning = evaluate_models_with_tuning(
        train_ohe,
        train_label,
        train_lr,
        test_ohe_100,
        test_label_100,
        test_lr_100,
        train_sizes,
        random_state=i,
        hyperparameter_tuning=True
    )
    # Add iteration number and tuning type
    results_with_tuning["Iteration"] = i + 1
    results_with_tuning["Tuning"] = "Yes"

    # Append both results to the list
    all_results.append(results_no_tuning)
    all_results.append(results_with_tuning)

# Combine all results into a single DataFrame
final_results = pd.concat(all_results, ignore_index=True)

RandomForest
preds
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
probs
[0.07 0.15 0.07 0.21 0.09 0.14 0.18 0.1  0.15 0.07 0.12 0.09 0.07 0.07
 0.17 0.13 0.1  0.15 0.21 0.21 0.1  0.17 0.12 0.09 0.09 0.12 0.1  0.14
 0.12 0.04 0.14 0.18 0.13 0.25 0.1  0.12 0.25 0.2  0.1  0.06 0.15 0.18
 0.08 0.07 0.14 0.11 0.14 0.12 0.08 0.15 0.15 0.16 0.18 0.1  0.18 0.1
 0.16 0.1  0.13 0.15 0.24 0.16 0.06 0.07 0.06 0.33 0.12 0.09 0.09 0.14
 0.2  0.16 0.29 0.11 0.11 0.04 0.14 0.09 0.2  0.14 0.07 0.12 0.12 0.11
 0.13 0.13 0.15 0.08 0.07 0.17 0.1  0.12 0.11 0.19 0.08 0.12 0.09 0.22
 0.22 0.13]
DecisionTree
preds
[0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

Parameters: { "use_label_encoder" } are not used.



XGBoost
preds
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
probs
[0.06078685 0.02487194 0.13593777 0.07653569 0.00520674 0.04359728
 0.16077524 0.20728843 0.00683431 0.00146924 0.05650834 0.0067746
 0.01422152 0.00327899 0.01454899 0.13898137 0.2553521  0.05789257
 0.07634473 0.02634259 0.05429378 0.03148732 0.03050511 0.03584094
 0.00493483 0.74746805 0.02799622 0.01468639 0.00863943 0.00148907
 0.17073953 0.03251158 0.00514818 0.11814272 0.01085432 0.01360912
 0.7670711  0.03388792 0.0307399  0.03005156 0.03509373 0.01869144
 0.00881608 0.01572339 0.0164246  0.00238059 0.04603942 0.02500979
 0.01662603 0.00566351 0.05272297 0.14639817 0.04646521 0.00455176
 0.00238495 0.007387   0.14076935 0.00269493 0.4727828  0.01304678
 0.01188248 0.05392817 0.08409036 0.00382513 0.00991333 0.18322629
 0.00600702 0.0075989  0.0025959  0.0112

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LR
preds
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 0.]
probs
[5.54189778e-02 5.12019708e-02 6.38822718e-02 8.00336922e-01
 3.57801449e-02 2.10360302e-02 5.25566775e-02 2.82547956e-03
 4.95529315e-02 1.11055759e-01 1.00476376e-03 3.54933971e-02
 7.36014994e-03 3.82496895e-02 1.70084201e-03 1.03545331e-01
 7.03048374e-02 1.62260189e-01 1.04347337e-01 2.02043112e-02
 6.37552058e-02 0.00000000e+00 1.40075209e-01 3.60770195e-02
 7.61116161e-02 1.08686054e-01 1.83270828e-01 5.51258087e-01
 0.00000000e+00 7.31139715e-03 1.16455734e-01 3.42654316e-03
 1.38726600e-02 3.00570564e-03 1.95143868e-01 2.35447406e-03
 0.00000000e+00 9.82304243e-01 7.42792973e-02 3.47472863e-02
 1.68054651e-01 2.07319441e-01 5.27087984e-02 5.14701147e-02
 1.05356

  _data = np.array(data, dtype=dtype, copy=copy,


RandomForest
preds
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
probs
[0.025      0.14141026 0.16969697 0.09166667 0.10833333 0.09166667
 0.075      0.11641026 0.09469697 0.025      0.10833333 0.09353147
 0.18444056 0.08141026 0.09166667 0.05833333 0.06666667 0.1582684
 0.14141026 0.0952381  0.05833333 0.11333333 0.11833333 0.15110723
 0.13141026 0.085      0.03333333 0.22969697 0.17212121 0.08141026
 0.14141026 0.09166667 0.08141026 0.19545455 0.11833333 0.08141026
 0.30878788 0.25142857 0.1018648  0.09807692 0.24015152 0.09333333
 0.05641026 0.06136364 0.18878788 0.16833333 0.19829337 0.10378788
 0.12136364 0.10833333 0.09166667 0.09166667 0.15166667 0.10833333
 0.13141026 0.08141026 0.05641026 0.075      0.025      0.14141026


Parameters: { "use_label_encoder" } are not used.



XGBoost
preds
[0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
probs
[1.55033199e-02 4.30619046e-02 4.56848405e-02 3.91331650e-02
 2.27866173e-02 5.75582087e-02 1.82222411e-01 9.13804293e-01
 1.19823059e-02 7.38342002e-04 1.82067323e-02 1.89872403e-02
 2.12668460e-02 1.62147265e-03 1.07540220e-01 1.94319133e-02
 4.14486289e-01 3.39174159e-02 1.10472627e-01 5.58535978e-02
 1.33617464e-02 8.89917463e-03 9.36263707e-03 7.77330026e-02
 3.19632888e-03 5.08383393e-01 8.17156583e-03 4.97648632e-03
 4.23295610e-03 1.19648105e-03 1.85808495e-01 2.07605064e-02
 1.76227314e-03 2.02797383e-01 4.40348163e-02 1.71721131e-02
 8.69971693e-01 8.02616179e-02 1.21586464e-01 1.37378350e-02
 1.10164374e-01 1.90345868e-02 6.29207818e-03 5.64402482e-03
 9.38713457e-03 7.18265306e-03 6.92014620e-02 1.13960057e-01
 1.70654450e-02 6.72848755e-03 2.75956243e-0

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LR
preds
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 0.]
probs
[5.54189778e-02 5.12019708e-02 6.38822718e-02 8.00336922e-01
 3.57801449e-02 2.10360302e-02 5.25566775e-02 2.82547956e-03
 4.95529315e-02 1.11055759e-01 1.00476376e-03 3.54933971e-02
 7.36014994e-03 3.82496895e-02 1.70084201e-03 1.03545331e-01
 7.03048374e-02 1.62260189e-01 1.04347337e-01 2.02043112e-02
 6.37552058e-02 0.00000000e+00 1.40075209e-01 3.60770195e-02
 7.61116161e-02 1.08686054e-01 1.83270828e-01 5.51258087e-01
 0.00000000e+00 7.31139715e-03 1.16455734e-01 3.42654316e-03
 1.38726600e-02 3.00570564e-03 1.95143868e-01 2.35447406e-03
 0.00000000e+00 9.82304243e-01 7.42792973e-02 3.47472863e-02
 1.68054651e-01 2.07319441e-01 5.27087984e-02 5.14701147e-02
 1.05356

Parameters: { "use_label_encoder" } are not used.



XGBoost
preds
[0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
probs
[4.91494268e-01 1.11091612e-02 9.78997070e-03 2.68411815e-01
 6.62212726e-03 6.30483404e-02 1.59455210e-01 4.43311920e-03
 5.57948351e-02 6.83723271e-01 9.80056152e-02 1.93099361e-02
 7.06301108e-02 9.69069172e-03 1.40572502e-03 2.15162989e-03
 8.91415123e-03 9.23804753e-03 6.08229125e-03 9.69880819e-03
 8.35751742e-03 1.34903220e-02 4.93953377e-02 6.62148893e-01
 2.54352927e-01 1.30947500e-01 1.20498650e-01 1.82396322e-01
 2.13855371e-01 7.29857236e-02 7.41784871e-02 6.23708844e-01
 1.05761595e-01 7.20600665e-01 9.92151424e-02 8.93766209e-02
 6.38210922e-02 4.12816135e-03 7.57905841e-02 1.00953378e-01
 2.29436100e-01 3.12486663e-02 4.35725190e-02 3.10363710e-01
 8.62779468e-02 8.33143946e-03 9.29373056e-02 1.05534256e-01
 5.74134104e-02 2.55049746e-02 2.71607369e-0

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LR
preds
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 1. 0. 0.]
probs
[5.79395623e-02 2.93117625e-01 1.24004254e-01 7.71029490e-02
 6.10107249e-04 4.30707010e-04 3.94660095e-02 2.72724219e-13
 3.34041767e-04 1.61914096e-01 1.07299595e-10 1.12851802e-02
 3.41029055e-04 5.47878542e-02 2.42399488e-06 9.92398358e-04
 1.84609973e-01 3.07197744e-01 2.72283061e-01 4.99775330e-01
 5.41100428e-02 0.00000000e+00 2.39488590e-01 5.49353189e-02
 1.25819401e-01 2.46154830e-01 3.61978541e-01 1.77501605e-01
 0.00000000e+00 8.15261764e-08 2.15684389e-01 7.59511778e-04
 4.21176708e-03 1.07302697e-03 2.54163900e-01 3.98647561e-04
 0.00000000e+00 4.24111447e-02 6.64371870e-02 6.88983456e-02
 7.40639241e-02 5.74652979e-01 3.56943600e-02 5.51545907e-02
 3.26692

Parameters: { "use_label_encoder" } are not used.



XGBoost
preds
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
probs
[0.28840518 0.1074774  0.16228351 0.2260627  0.0939941  0.11708596
 0.21992919 0.09230533 0.18159983 0.26258716 0.13798684 0.11161553
 0.15355022 0.10615052 0.08733156 0.11583598 0.09184466 0.16887213
 0.14296548 0.11383519 0.17252308 0.09675806 0.22633773 0.28367454
 0.20205288 0.21214116 0.17846672 0.18489994 0.09988129 0.13789865
 0.20704249 0.33210003 0.1175688  0.3353524  0.21076573 0.16975676
 0.1792304  0.09927292 0.1675     0.18247385 0.27152744 0.16073161
 0.19935961 0.30647174 0.1388723  0.12492047 0.15313058 0.18441002
 0.1865762  0.12796119 0.12619773 0.11505489 0.24599989 0.10446126
 0.08428722 0.08958064 0.18664485 0.28373355 0.16769634 0.16446155
 0.09870309 0.10951214 0.20850915 0.10061881 0.25166866 0.2126715
 0.10024969 0.31855568 0.09166112 0.1309

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LR
preds
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 1. 0. 0.]
probs
[5.79395623e-02 2.93117625e-01 1.24004254e-01 7.71029490e-02
 6.10107249e-04 4.30707010e-04 3.94660095e-02 2.72724219e-13
 3.34041767e-04 1.61914096e-01 1.07299595e-10 1.12851802e-02
 3.41029055e-04 5.47878542e-02 2.42399488e-06 9.92398358e-04
 1.84609973e-01 3.07197744e-01 2.72283061e-01 4.99775330e-01
 5.41100428e-02 0.00000000e+00 2.39488590e-01 5.49353189e-02
 1.25819401e-01 2.46154830e-01 3.61978541e-01 1.77501605e-01
 0.00000000e+00 8.15261764e-08 2.15684389e-01 7.59511778e-04
 4.21176708e-03 1.07302697e-03 2.54163900e-01 3.98647561e-04
 0.00000000e+00 4.24111447e-02 6.64371870e-02 6.88983456e-02
 7.40639241e-02 5.74652979e-01 3.56943600e-02 5.51545907e-02
 3.26692

Parameters: { "use_label_encoder" } are not used.



XGBoost
preds
[0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0]
probs
[5.61467186e-03 4.32530791e-03 1.36484399e-01 3.14164311e-02
 4.06370824e-03 2.46592006e-03 8.04577861e-03 6.37863427e-02
 1.09621830e-01 7.68739323e-04 7.01683387e-03 6.20211195e-03
 5.99517882e-01 3.96240177e-03 1.84169766e-02 8.25607330e-02
 5.84099032e-02 1.25004157e-01 1.32629156e-01 7.92812463e-03
 3.91729642e-04 1.20906137e-01 1.87157292e-03 6.69355914e-02
 2.03282535e-01 1.96652301e-03 1.40578495e-02 2.91303515e-01
 1.23473909e-03 2.32218541e-02 5.81134409e-02 1.37469813e-01
 2.93491445e-02 3.35318357e-01 3.07758921e-04 2.40923762e-01
 3.40456814e-01 4.16120328e-02 3.08846869e-02 5.02983153e-01
 8.19778256e-03 2.25485768e-02 1.06729679e-01 4.22086120e-02
 1.69921741e-02 1.43115278e-02 1.06719621e-02 7.40962336e-04
 9.28134192e-03 1.01076812e-02 1.49894040e-0

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LR
preds
[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
probs
[9.41969609e-03 1.86080058e-02 8.88324079e-01 4.00331405e-02
 3.48127543e-01 4.50573283e-02 6.97827149e-03 1.32292054e-01
 0.00000000e+00 9.36153153e-04 4.41827377e-02 2.69905733e-01
 0.00000000e+00 6.06310581e-02 2.27602573e-02 1.05871815e-01
 2.56172968e-04 1.51734242e-01 8.59989082e-02 1.30402004e-03
 2.67127370e-01 1.22077261e-03 5.04539808e-01 0.00000000e+00
 7.74875056e-02 1.03670804e-01 2.18853041e-01 0.00000000e+00
 7.36501915e-01 1.92230868e-02 3.18329815e-01 0.00000000e+00
 4.28200057e-02 0.00000000e+00 1.86308925e-01 0.00000000e+00
 8.12041779e-01 0.00000000e+00 2.76518396e-01 5.82696730e-03
 0.00000000e+00 6.46659922e-02 1.72447101e-01 5.57245000e-03
 8.46280

Parameters: { "use_label_encoder" } are not used.



XGBoost
preds
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
probs
[0.09554537 0.10832956 0.20383477 0.13987292 0.08372681 0.10865102
 0.1329094  0.14172274 0.14429867 0.0862973  0.11352452 0.11318025
 0.21836978 0.09284919 0.10684822 0.17769057 0.15662694 0.17811853
 0.1714628  0.11805993 0.08946126 0.16248673 0.11421359 0.14817706
 0.17270711 0.10455021 0.17598835 0.16544552 0.10917381 0.11703147
 0.17552266 0.13000545 0.1736392  0.16731362 0.08607656 0.13152923
 0.13590544 0.16277136 0.16102792 0.20840922 0.12000892 0.12758936
 0.16106264 0.12842919 0.12473316 0.15550065 0.12793235 0.08999237
 0.11817767 0.1012353  0.08694198 0.09704576 0.21035877 0.17877673
 0.09708738 0.1363182  0.25334486 0.1272276  0.09678625 0.102419
 0.11398769 0.15085381 0.20138748 0.09659174 0.13517372 0.09430923
 0.1289837  0.20184535 0.10791687 0.13281

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LR
preds
[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
probs
[9.41969609e-03 1.86080058e-02 8.88324079e-01 4.00331405e-02
 3.48127543e-01 4.50573283e-02 6.97827149e-03 1.32292054e-01
 0.00000000e+00 9.36153153e-04 4.41827377e-02 2.69905733e-01
 0.00000000e+00 6.06310581e-02 2.27602573e-02 1.05871815e-01
 2.56172968e-04 1.51734242e-01 8.59989082e-02 1.30402004e-03
 2.67127370e-01 1.22077261e-03 5.04539808e-01 0.00000000e+00
 7.74875056e-02 1.03670804e-01 2.18853041e-01 0.00000000e+00
 7.36501915e-01 1.92230868e-02 3.18329815e-01 0.00000000e+00
 4.28200057e-02 0.00000000e+00 1.86308925e-01 0.00000000e+00
 8.12041779e-01 0.00000000e+00 2.76518396e-01 5.82696730e-03
 0.00000000e+00 6.46659922e-02 1.72447101e-01 5.57245000e-03
 8.46280

Parameters: { "use_label_encoder" } are not used.



XGBoost
preds
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
probs
[9.08774149e-04 6.09151379e-04 5.85050147e-04 4.05345894e-02
 2.84819189e-03 4.11677659e-01 2.11542146e-03 4.68810834e-03
 5.66503638e-03 2.11379789e-02 1.22092582e-01 3.25970054e-02
 2.60881096e-01 7.36494362e-03 5.84841613e-03 4.44931909e-02
 8.00116360e-02 5.69426501e-03 1.15674675e-01 2.02112764e-01
 1.70254742e-03 1.18285164e-01 1.48029570e-02 4.63420554e-04
 4.84406948e-03 2.68807113e-02 9.76354539e-01 3.05227578e-01
 6.34385794e-02 4.39130468e-03 1.91079881e-02 3.71708423e-02
 6.70230165e-02 5.49137825e-04 5.57832466e-03 1.18464103e-03
 1.11787794e-02 1.42835465e-03 2.80659436e-03 3.02615133e-03
 3.65757104e-03 3.69124822e-02 4.76393662e-02 1.17217412e-03
 4.10617003e-03 1.71993382e-03 8.68153498e-02 1.08019579e-02
 7.50693376e-04 1.24098165e-02 1.36763960e-0

KeyboardInterrupt: 

In [None]:
final_results.to_csv('/content/drive/MyDrive/Indian_bank_data/traiditional_ML_results.csv', index=False)

In [None]:
final_results

Unnamed: 0,Model,Train Size,F1,ROCAUC,PRAUC,Iteration,Tuning
0,RandomForest,8,0.000000,0.316534,0.102627,1,No
1,DecisionTree,8,0.000000,0.500000,0.130000,1,No
2,XGBoost,8,0.000000,0.500000,0.130000,1,No
3,LogisticRegression,8,0.200000,0.607869,0.189695,1,No
4,RandomForest,16,0.000000,0.359859,0.110378,1,No
...,...,...,...,...,...,...,...
395,LogisticRegression,64,0.230769,0.558355,0.254256,10,Yes
396,RandomForest,128,0.000000,0.641026,0.181287,10,Yes
397,DecisionTree,128,0.263158,0.588417,0.176154,10,Yes
398,XGBoost,128,0.111111,0.497790,0.190122,10,Yes


Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.
Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.
Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.
Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.
Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.
Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.
Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.
Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.
Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.
Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.
Error: Run

In [None]:
final_results.groupby(["Model", "Tuning", "Train Size"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,F1,ROCAUC,PRAUC,Iteration
Model,Tuning,Train Size,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DecisionTree,No,8,0.084405,0.505747,0.137094,5.5
DecisionTree,No,16,0.05404,0.480592,0.136,5.5
DecisionTree,No,32,0.09336,0.497701,0.13387,5.5
DecisionTree,No,64,0.073252,0.485455,0.134953,5.5
DecisionTree,No,128,0.132694,0.499602,0.136777,5.5
DecisionTree,Yes,8,0.0,0.5,0.13,5.5
DecisionTree,Yes,16,0.0,0.520336,0.147548,5.5
DecisionTree,Yes,32,0.045652,0.530592,0.142386,5.5
DecisionTree,Yes,64,0.081705,0.497657,0.14158,5.5
DecisionTree,Yes,128,0.094865,0.497038,0.142852,5.5


In [None]:
def evaluate_models_with_tuning_adjusted_proportion(
    train_ohe,
    train_label,
    train_lr,
    test_ohe_100,
    test_label_100,
    test_lr_100,
    sizes,
    minority_proportions,
    random_state=42,
    hyperparameter_tuning=False
):
    from sklearn.utils import resample

    results = []

    # Define hyperparameter grids
    param_grids = {
        "RandomForest": {
            "n_estimators": [10, 50, 100, 200],
            "max_depth": [None, 10, 20, 30],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 4],
        },
        "DecisionTree": {
            "max_depth": [None, 10, 20, 30],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 4],
        },
        "XGBoost": {
            "n_estimators": [50, 100, 200],
            "max_depth": [3, 5, 10],
            "learning_rate": [0.01, 0.1, 0.2],
            "subsample": [0.6, 0.8, 1.0],
        },
    }

    # Iterate over minority proportions
    for minority_proportion in minority_proportions:
        # Iterate over training sizes
        for size in sizes:
            # Separate majority and minority classes
            majority = train_ohe[train_ohe["Approved_Flag"] == 0]
            minority = train_ohe[train_ohe["Approved_Flag"] == 1]

            # Calculate the number of samples needed for each class
            n_minority = int(size * minority_proportion)
            n_majority = size - n_minority

            # Sample data
            sampled_minority = resample(minority, replace=True, n_samples=n_minority, random_state=random_state)
            sampled_majority = resample(majority, replace=True, n_samples=n_majority, random_state=random_state)
            sampled_train_ohe = pd.concat([sampled_minority, sampled_majority])

            majority = train_lr[train_lr["Approved_Flag"] == 0]
            minority = train_lr[train_lr["Approved_Flag"] == 1]

            # Calculate the number of samples needed for each class
            n_minority = int(size * minority_proportion)
            n_majority = size - n_minority

            # Sample data
            sampled_minority = resample(minority, replace=True, n_samples=n_minority, random_state=random_state)
            sampled_majority = resample(majority, replace=True, n_samples=n_majority, random_state=random_state)
            sampled_train_lr = pd.concat([sampled_minority, sampled_majority])

            # Split features and labels
            X_train_tree = sampled_train_ohe.drop(columns="Approved_Flag")
            y_train_tree = sampled_train_ohe["Approved_Flag"]
            X_train_lr = sampled_train_lr.drop(columns="Approved_Flag")
            y_train_lr = sampled_train_lr["Approved_Flag"]

            X_test_tree = test_ohe_100.drop(columns="Approved_Flag")
            y_test_tree = test_ohe_100["Approved_Flag"]
            X_test_lr = test_lr_100.drop(columns="Approved_Flag")
            y_test_lr = test_lr_100["Approved_Flag"]

            # Models
            models_tree = {
                "RandomForest": RandomForestClassifier(),
                "DecisionTree": DecisionTreeClassifier(max_depth = int(math.log2(len(X_train_tree)))),
                "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
            }
            model_lr = LogisticRegression(max_iter=1000)

            # Train and evaluate tree-based models
            for model_name, model in models_tree.items():
                if hyperparameter_tuning:
                    # Perform RandomizedSearchCV
                    search = RandomizedSearchCV(
                        estimator=model,
                        param_distributions=param_grids[model_name],
                        n_iter=10,
                        scoring="roc_auc",
                        cv=3,
                        random_state=42,
                        n_jobs=-1
                    )
                    search.fit(X_train_tree, y_train_tree)
                    model = search.best_estimator_
                else:
                    model.fit(X_train_tree, y_train_tree)

                preds = model.predict(X_test_tree)
                if hasattr(model, "predict_proba"):
                    probs = model.predict_proba(X_test_tree)
                    if probs.shape[1] > 1:
                        probs = probs[:, 1]  # Take probabilities of the positive class
                    else:
                        probs = probs[:, 0]  # Single column output
                else:
                    # Use predictions as probabilities for models without `predict_proba`
                    probs = preds
                print(model_name)
                print("preds")
                print(preds)
                print("probs")
                print(probs)
                results.append({
                    "Model": model_name,
                    "Train Size": size,
                    "Class 1 Proportion": minority_proportion,
                    "F1_Score": f1_score(y_test_lr, preds),
                    "ROC_AUC": roc_auc_score(y_test_lr, probs),
                    "PR_AUC": average_precision_score(y_test_lr, probs)
                })

            # Train and evaluate logistic regression
            model_lr.fit(X_train_lr, y_train_lr)
            preds = model_lr.predict(X_test_lr)
            probs = model_lr.predict_proba(X_test_lr)[:, 1]  # Logistic regression always outputs two classes
            results.append({
                "Model": "LogisticRegression",
                "Train Size": size,
                "Class 1 Proportion": minority_proportion,
                "F1_Score": f1_score(y_test_lr, preds),
                "ROC_AUC": roc_auc_score(y_test_lr, probs),
                "PR_AUC": average_precision_score(y_test_lr, probs)
            })

    return pd.DataFrame(results)


In [None]:
# Initialize an empty list to store results
all_results = []
train_sizes = [32, 64, 128]
# Define number of repetitions
n_repeats = 10

# Run evaluation 10 times for both without and with hyperparameter tuning
for i in range(n_repeats):
    # Evaluate without hyperparameter tuning
    results_no_tuning = evaluate_models_with_tuning_adjusted_proportion(
        train_ohe,
        train_label,
        train_lr,
        test_ohe_100,
        test_label_100,
        test_lr_100,
        train_sizes,
        [0.3, 0.5],
        random_state=i,
        hyperparameter_tuning=False
    )
    # Add iteration number and tuning type
    results_no_tuning["Iteration"] = i + 1
    results_no_tuning["Tuning"] = "No"

    # Evaluate with hyperparameter tuning
    results_with_tuning = evaluate_models_with_tuning_adjusted_proportion(
        train_ohe,
        train_label,
        train_lr,
        test_ohe_100,
        test_label_100,
        test_lr_100,
        train_sizes,
        [0.3, 0.5],
        random_state=i,
        hyperparameter_tuning=True
    )
    # Add iteration number and tuning type
    results_with_tuning["Iteration"] = i + 1
    results_with_tuning["Tuning"] = "Yes"

    # Append both results to the list
    all_results.append(results_no_tuning)
    all_results.append(results_with_tuning)

# Combine all results into a single DataFrame
final_results = pd.concat(all_results, ignore_index=True)

RandomForest
preds
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
probs
[0.31 0.2  0.35 0.3  0.28 0.28 0.24 0.23 0.34 0.35 0.31 0.43 0.25 0.25
 0.27 0.18 0.26 0.32 0.15 0.2  0.33 0.48 0.26 0.51 0.31 0.27 0.24 0.4
 0.4  0.26 0.19 0.44 0.18 0.44 0.25 0.29 0.42 0.39 0.5  0.38 0.41 0.21
 0.16 0.33 0.41 0.4  0.4  0.47 0.47 0.23 0.24 0.18 0.32 0.25 0.27 0.33
 0.2  0.32 0.24 0.19 0.43 0.15 0.41 0.2  0.44 0.39 0.21 0.25 0.28 0.18
 0.25 0.2  0.35 0.35 0.29 0.25 0.4  0.25 0.43 0.18 0.29 0.43 0.34 0.23
 0.3  0.31 0.2  0.35 0.21 0.23 0.32 0.24 0.29 0.18 0.3  0.47 0.19 0.3
 0.39 0.42]
DecisionTree
preds
[1. 1. 0. 0. 1. 0. 0. 1. 1. 1. 0. 1. 0. 1. 0. 0. 1. 0. 1. 0. 1. 0. 1. 1.
 1. 0. 1. 0. 0. 1. 0. 1. 0. 1. 0. 1. 0. 0. 1. 0. 1. 0. 1. 1. 1. 1. 0. 

Parameters: { "use_label_encoder" } are not used.



XGBoost
preds
[0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 1 0 0 0 0 1 0 0 0
 1 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 1 1 0 0 0 0 0 0 1 1
 1 0 0 0 1 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1]
probs
[0.01985428 0.2631648  0.88553274 0.09814141 0.0868675  0.03553579
 0.45171055 0.34271798 0.97550356 0.5408521  0.05412148 0.25580734
 0.01761309 0.17904133 0.00511659 0.46697593 0.5595571  0.26518604
 0.00705142 0.02566805 0.40820596 0.86894166 0.03063422 0.7174472
 0.01911821 0.05711358 0.02563405 0.9779154  0.64799273 0.00449616
 0.00680419 0.37040442 0.01491363 0.7608486  0.02188049 0.12889086
 0.19968511 0.50015074 0.28922176 0.61531365 0.7365059  0.41347548
 0.0079863  0.01970714 0.56972164 0.8727063  0.35185346 0.33678094
 0.4550718  0.18147837 0.04938785 0.00405913 0.2304483  0.05074798
 0.02913592 0.70297027 0.01687947 0.0161711  0.08956963 0.04140698
 0.5408521  0.13923284 0.54468566 0.00650204 0.8789666  0.558757
 0.05619578 0.01430679 0.46065181 0.039195

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LR
preds
[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1.
 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0.
 1. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 1. 0.]
probs
[3.68444882e-02 1.57299486e-02 3.18069547e-04 1.66748831e-04
 7.68990420e-03 5.97888968e-06 1.18136899e-01 1.43893035e-03
 9.97503044e-01 5.31899930e-02 4.20823121e-04 5.15518166e-05
 1.21505414e-02 8.53337997e-03 2.68872037e-02 9.65779139e-04
 9.19525625e-01 2.11189946e-01 2.02298200e-01 2.84550987e-10
 3.11900283e-03 0.00000000e+00 1.44599538e-01 1.18944721e-01
 5.93489679e-02 3.48522671e-01 1.86734680e-01 3.26247294e-01
 0.00000000e+00 1.12286498e-03 2.05512072e-01 9.97022882e-01
 4.96674543e-04 9.99976744e-01 5.86331036e-01 9.99548684e-01
 0.00000000e+00 5.33811947e-05 4.95660777e-02 3.05236160e-03
 1.27652546e-01 1.84611131e-01 1.47080431e-04 1.20464834e-02
 9.97875

Parameters: { "use_label_encoder" } are not used.



XGBoost
preds
[0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0
 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0 0 0
 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0]
probs
[0.03091012 0.0057664  0.74870706 0.05459811 0.28224462 0.30766013
 0.02194947 0.02291574 0.08657011 0.52875453 0.4060276  0.11568608
 0.08640945 0.41608715 0.18238746 0.49995202 0.01315264 0.3706305
 0.02048416 0.21340348 0.09837752 0.07388219 0.03024046 0.52964973
 0.25109965 0.02415079 0.05644441 0.22748156 0.37234858 0.20369962
 0.03875495 0.59989375 0.01673273 0.09925216 0.02879313 0.17897747
 0.0498778  0.8255976  0.04056026 0.64984745 0.11967008 0.44073978
 0.05544854 0.05598412 0.23748404 0.7808842  0.22024965 0.34774187
 0.22575116 0.04492358 0.01805218 0.03453641 0.63726276 0.13431679
 0.26956987 0.8482185  0.00571681 0.12468947 0.20204605 0.09082624
 0.52893126 0.10740685 0.72197545 0.08165183 0.06519756 0.6728877
 0.04203279 0.85138655 0.28046307 0.11872

KeyboardInterrupt: 

In [None]:
final_results.to_csv('/content/drive/MyDrive/Indian_bank_data/traiditional_ML_adjusted_prop_results.csv', index=False)

In [None]:
final_results_stratified.groupby(["Model", "Tuning", "Train Size", "Minority Proportion"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,F1,ROCAUC,PRAUC,Iteration
Model,Tuning,Train Size,Minority Proportion,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
DecisionTree,No,8,0.1,0.148056,0.529089,0.144992,5.5
DecisionTree,No,16,0.1,0.211426,0.543324,0.14845,5.5
DecisionTree,No,32,0.1,0.230769,0.557913,0.153254,5.5
DecisionTree,No,64,0.1,0.230769,0.557913,0.153254,5.5
DecisionTree,No,128,0.1,0.230769,0.557913,0.153254,5.5
DecisionTree,Yes,8,0.1,0.0,0.5,0.13,5.5
DecisionTree,Yes,16,0.1,0.0,0.489434,0.137423,5.5
DecisionTree,Yes,32,0.1,0.230769,0.557913,0.153254,5.5
DecisionTree,Yes,64,0.1,0.230769,0.557913,0.153254,5.5
DecisionTree,Yes,128,0.1,0.230769,0.557913,0.153254,5.5


In [None]:
train_sizes = [8, 16, 32, 64, 128]

# Evaluate without hyperparameter tuning
results_no_tuning = evaluate_models_with_tuning(
    train_ohe, train_label, train_lr, test_ohe_100, test_label_100, test_lr_100, train_sizes, hyperparameter_tuning=False
)

# Evaluate with hyperparameter tuning
results_with_tuning = evaluate_models_with_tuning(
    train_ohe, train_label, train_lr, test_ohe_100, test_label_100, test_lr_100, train_sizes, hyperparameter_tuning=True
)


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Parameters: { "use_label_encoder" } are not used.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Parameters: { "use_label_encoder" } are not used.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.



In [None]:
results_no_tuning

Unnamed: 0,Model,Train Size,F1,ROCAUC,PRAUC
0,RandomForest,8,0.0,0.501326,0.172362
1,DecisionTree,8,0.0,0.5,0.13
2,XGBoost,8,0.0,0.5,0.13
3,LogisticRegression,8,0.065574,0.320955,0.094832
4,RandomForest,16,0.0,0.541556,0.182542
5,DecisionTree,16,0.230769,0.557913,0.153254
6,XGBoost,16,0.0,0.496021,0.157298
7,LogisticRegression,16,0.08,0.428824,0.133497
8,RandomForest,32,0.0,0.626437,0.226795
9,DecisionTree,32,0.230769,0.557913,0.153254


In [None]:
results_with_tuning

Unnamed: 0,Model,Train Size,F1,ROCAUC,PRAUC
0,RandomForest,8,0.0,0.5,0.13
1,DecisionTree,8,0.0,0.5,0.13
2,XGBoost,8,0.0,0.5,0.13
3,LogisticRegression,8,0.065574,0.320955,0.094832
4,RandomForest,16,0.0,0.662246,0.283585
5,DecisionTree,16,0.0,0.5,0.13
6,XGBoost,16,0.0,0.5,0.13
7,LogisticRegression,16,0.08,0.428824,0.133497
8,RandomForest,32,0.0,0.549956,0.247924
9,DecisionTree,32,0.230769,0.557913,0.153254
