In [1]:
import gc
import joblib

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

from metric import amex_metric
from train import CFG, seed_everything

In [2]:
# Load the processed train and test tables (train first, then test) from the
# project's GCS bucket.
df_train, df_test = (
    pd.read_parquet(parquet_uri)
    for parquet_uri in (
        "gs://leoraggio-kaggle/amex-default-prediction/data/processed/train_data.parquet",
        "gs://leoraggio-kaggle/amex-default-prediction/data/processed/test_data.parquet",
    )
)

In [3]:
# Apply the project-wide seed before any stochastic step runs.
# NOTE(review): seed_everything lives in train.py (not visible here); presumably
# it seeds the Python / NumPy RNGs -- confirm which libraries it covers.
seed_everything(CFG.seed)

In [4]:
def process_data(train, test):
    """Engineer features on the train and test frames, mutating both in place.

    Three steps run in order:

    1. Label-encode the ``*_last`` categorical columns. Each encoder is fitted
       on ``train`` only and then applied to ``test``.
    2. For every float32/float64 column of ``train`` whose name contains
       ``"last"``, add a ``<col>_round2`` column rounded to 2 decimals to both
       frames.
    3. For every ``<base>_last`` column of ``train`` (excluding names that
       contain ``"round"``), add ``<base>_last_mean_diff`` =
       ``<base>_last - <base>_mean`` to both frames. The feature is only
       created when both input columns exist in *both* frames, so train and
       test always end up with the same set of columns.

    Args:
        train: Training DataFrame; column names and dtypes are read from it.
        test: Test DataFrame; expected to carry the same feature columns.

    Returns:
        The same ``(train, test)`` objects that were passed in, with the new
        columns added.

    Raises:
        KeyError: If one of the categorical columns, or a float ``last``
            column selected from ``train``, is missing from a frame.
    """
    import warnings

    # Label encode categorical features
    cat_features = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68",
    ]
    for cat_col in [f"{cf}_last" for cf in cat_features]:
        # Fit on train only; the fitted mapping is then reused for test.
        # NOTE(review): LabelEncoder.transform is expected to reject categories
        # that never appeared in train -- confirm test has no unseen values.
        encoder = LabelEncoder()
        train[cat_col] = encoder.fit_transform(train[cat_col])
        test[cat_col] = encoder.transform(test[cat_col])

    # Round last float features to 2 decimal places
    is_float = (train.dtypes == "float32") | (train.dtypes == "float64")
    last_float_cols = [col for col in train.dtypes[is_float].index if "last" in col]
    for col in last_float_cols:
        train[col + "_round2"] = train[col].round(2)
        test[col + "_round2"] = test[col].round(2)

    # Get the difference between last and mean
    suffix = "_last"
    base_names = [
        col[: -len(suffix)]
        for col in train.columns
        if col.endswith(suffix) and "round" not in col
    ]
    for base in base_names:
        last_col = f"{base}_last"
        mean_col = f"{base}_mean"
        diff_col = f"{base}_last_mean_diff"

        # Skip features without a matching mean column. Checking both frames
        # up front prevents train from gaining a column that test lacks.
        inputs_present = all(
            name in frame.columns
            for frame in (train, test)
            for name in (last_col, mean_col)
        )
        if not inputs_present:
            continue

        # Compute both differences before assigning anything so a failure
        # cannot leave the two frames with different columns.
        try:
            train_diff = train[last_col] - train[mean_col]
            test_diff = test[last_col] - test[mean_col]
        except TypeError as exc:
            # Best effort: non-subtractable dtypes are skipped, but visibly.
            warnings.warn(f"Skipping {diff_col}: {exc}", RuntimeWarning)
            continue
        train[diff_col] = train_diff
        test[diff_col] = test_diff

    return train, test

In [5]:
# Run feature engineering, then treat every column except the customer
# identifier and the target as a model input.
df_train, df_test = process_data(df_train, df_test)
non_feature_cols = ["customer_ID", CFG.target]
features = []
for column_name in df_train.columns:
    if column_name not in non_feature_cols:
        features.append(column_name)

In [9]:
# Replace every missing value with the sentinel -128, modifying each frame
# in place (train first, then test).
for frame in (df_train, df_test):
    frame.fillna(-128, inplace=True)

In [13]:
from bayes_opt import BayesianOptimization


def rfc_cv(
    n_estimators,
    min_samples_split,
    min_samples_leaf,
    max_features,
    data,
    targets):
    """Cross-validate a RandomForestClassifier and return its out-of-fold score.

    A single estimator is configured from the given hyper-parameters and
    refitted on each of ``CFG.n_folds`` stratified folds. Train and validation
    scores are printed per fold, and the validation predictions are collected
    into one out-of-fold vector that is scored with ``amex_metric``.

    Side effect: after each fit the model is dumped to
    ``../Models/rf_fold{fold}_seed{CFG.seed}.joblib``. The file name does not
    include the hyper-parameters, so every call overwrites the files written
    by the previous call.

    Args:
        n_estimators: Number of trees.
        min_samples_split: Passed through to the estimator.
        min_samples_leaf: Passed through to the estimator.
        max_features: Passed through to the estimator.
        data: Feature DataFrame (indexed positionally with ``.iloc``).
        targets: Target Series aligned row-for-row with ``data``.

    Returns:
        The ``amex_metric`` score of the out-of-fold predictions.
    """
    estimator = RandomForestClassifier(
        n_estimators=n_estimators,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=CFG.seed,
        class_weight="balanced",
        n_jobs=-1
    )
    print(" ")
    print("=" * 50)
    print(f"=== n_estimators: {n_estimators}")
    print(f"=== min_samples_split: {min_samples_split}")
    print(f"=== min_samples_leaf: {min_samples_leaf}")
    print(f"=== max_features: {max_features}")
    print("=" * 50)

    # Count the columns of the data actually passed in, rather than relying on
    # a notebook-level `features` list that may not exist or may not match.
    n_features = data.shape[1]

    oof_predictions = np.zeros(len(data))
    kfold = StratifiedKFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)
    for fold, (train_idx, val_idx) in enumerate(kfold.split(data, targets)):
        print(" ")
        print("-" * 50)
        print(f"Training fold {fold} with {n_features} features...")
        x_train, x_val = data.iloc[train_idx], data.iloc[val_idx]
        y_train, y_val = targets.iloc[train_idx], targets.iloc[val_idx]

        # Train model (fit returns the estimator itself, refitted for this fold)
        model = estimator.fit(x_train, y_train)
        joblib.dump(
            model,
            f"../Models/rf_fold{fold}_seed{CFG.seed}.joblib",
        )

        # Train score; column 1 of predict_proba is the second class in
        # model.classes_ (the positive class, assuming a 0/1 target).
        train_pred = model.predict_proba(x_train)[:, 1]
        train_score = amex_metric(y_train, train_pred)
        print(f"Our fold {fold} train score is {train_score}")

        # Predict validation and store it at the fold's row positions
        val_pred = model.predict_proba(x_val)[:, 1]
        oof_predictions[val_idx] = val_pred
        val_score = amex_metric(y_val, val_pred)
        print(f"Our fold {fold} CV score is {val_score}")

        # Release the fold slices before the next iteration allocates new ones
        del x_train, x_val, y_train, y_val
        gc.collect()

    score = amex_metric(targets, oof_predictions)
    print(f"Final Score: {score}")
    return score


def optimize_rfc(data, targets):
    """Search RandomForest hyper-parameters with Bayesian optimization.

    The objective is the cross-validated score returned by ``rfc_cv`` on
    ``data`` / ``targets``. The best result found is printed; nothing is
    returned.
    """
    search_space = {
        "n_estimators": (100, 250),
        "min_samples_split": (2, 25),
        "min_samples_leaf": (1, 25),
        "max_features": (0.1, 0.999),
    }

    def rfc_crossval(n_estimators, min_samples_split, min_samples_leaf, max_features):
        """Adapt the optimizer's continuous proposals to ``rfc_cv``.

        The three count-like parameters are truncated to integers, and
        ``max_features`` is clamped to the interval [1e-3, 0.999] so it always
        stays a valid fraction.
        """
        clamped_max_features = max(min(max_features, 0.999), 1e-3)
        integer_params = {
            "n_estimators": int(n_estimators),
            "min_samples_split": int(min_samples_split),
            "min_samples_leaf": int(min_samples_leaf),
        }
        return rfc_cv(
            max_features=clamped_max_features,
            data=data,
            targets=targets,
            **integer_params,
        )

    optimizer = BayesianOptimization(
        f=rfc_crossval,
        pbounds=search_space,
        random_state=CFG.seed,
        verbose=2,
    )
    optimizer.maximize(n_iter=10)

    print("Final result:", optimizer.max)

In [15]:
# Tune on a 10% subsample drawn separately from each target class so the class
# balance is preserved. The draw is seeded explicitly so the subset (and hence
# the search) is reproducible regardless of global RNG state or cell order.
train_sample = df_train.groupby(CFG.target).sample(frac=0.1, random_state=CFG.seed)
x_train = train_sample[features]
y_train = train_sample[CFG.target]

optimize_rfc(x_train, y_train)

|   iter    |  target   | max_fe... | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------
 
=== n_estimators: 107
=== min_samples_split: 4
=== min_samples_leaf: 12
=== max_features: 0.10841339381318048
 
--------------------------------------------------
Training fold 0 with 1365 features...
Our fold 0 train score is 0.9289963383501529
Our fold 0 CV score is 0.7447628522111074
 
--------------------------------------------------
Training fold 1 with 1365 features...
Our fold 1 train score is 0.9276518848027778
Our fold 1 CV score is 0.7616865556956924
 
--------------------------------------------------
Training fold 2 with 1365 features...
Our fold 2 train score is 0.9291917667666066
Our fold 2 CV score is 0.7705922401532861
 
--------------------------------------------------
Training fold 3 with 1365 features...
Our fold 3 train score is 0.9302032295839657
Our fold 3 CV score is 0.7482828360910547
 
--------------------------

In [12]:
# Final model with hand-picked hyper-parameters.
# NOTE(review): the search iteration logged above printed min_samples_leaf=12 and
# min_samples_split=4 (after int() truncation); 13 / 5 here look like the
# rounded raw proposals -- confirm these are the intended values.
model = RandomForestClassifier(
    n_estimators=107,
    max_features=0.1084,  # type: ignore
    min_samples_leaf=13,
    min_samples_split=5,
    class_weight="balanced",
    # Seed the forest so the artefacts named with CFG.seed are reproducible,
    # consistent with the estimator built in rfc_cv.
    random_state=CFG.seed,
    n_jobs=-1)


# Create a numpy array to store test predictions (averaged over folds)
test_predictions = np.zeros(len(df_test))
# Create a numpy array to store out of folds predictions
oof_predictions = np.zeros(len(df_train))
kfold = StratifiedKFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)
for fold, (trn_ind, val_ind) in enumerate(kfold.split(df_train, df_train[CFG.target])):
    print(" ")
    print("-" * 50)
    print(f"Training fold {fold} with {len(features)} features...")
    x_train, x_val = df_train[features].iloc[trn_ind], df_train[features].iloc[val_ind]
    y_train, y_val = (
        df_train[CFG.target].iloc[trn_ind],
        df_train[CFG.target].iloc[val_ind],
    )
    # The same estimator object is refitted from scratch on every fold
    model.fit(x_train, y_train)
    # Save this fold's fitted model
    joblib.dump(
        model,
        f"../Models/rf_fold{fold}_seed{CFG.seed}.pkl",
    )
    # Predict validation; column 1 of predict_proba is the second class in
    # model.classes_ (the positive class, assuming a 0/1 target)
    val_pred = model.predict_proba(x_val)[:, 1]
    # Add to out of folds array
    oof_predictions[val_ind] = val_pred
    # Predict the test set and accumulate the fold average
    test_pred = model.predict_proba(df_test[features])[:, 1]
    test_predictions += test_pred / CFG.n_folds
    # Compute fold metric
    score = amex_metric(y_val, val_pred)
    print(f"Our fold {fold} CV score is {score}")
    # Release the fold slices before the next iteration allocates new ones
    del x_train, x_val, y_train, y_val
    gc.collect()
# Compute out of folds metric
score = amex_metric(df_train[CFG.target], oof_predictions)
print(f"Our out of folds CV score is {score}")
# Create a dataframe to store out of folds predictions
oof_df = pd.DataFrame(
    {
        "customer_ID": df_train["customer_ID"],
        "target": df_train[CFG.target],
        "prediction": oof_predictions,
    }
)
oof_df.to_csv(
    f"../data/OOF/oof_rf_baseline_{CFG.n_folds}fold_seed{CFG.seed}.csv",
    index=False,
)
# Create a dataframe to store test prediction
test_df = pd.DataFrame(
    {"customer_ID": df_test["customer_ID"], "prediction": test_predictions}
)
test_df.to_csv(
    f"../data/Predictions/test_rf_baseline_{CFG.n_folds}fold_seed{CFG.seed}.csv",
    index=False,
)

 
--------------------------------------------------
Training fold 0 with 1365 features...
Our fold 0 CV score is 0.7641236519574754
 
--------------------------------------------------
Training fold 1 with 1365 features...
Our fold 1 CV score is 0.7640760308196713
 
--------------------------------------------------
Training fold 2 with 1365 features...
Our fold 2 CV score is 0.7679898203727911
 
--------------------------------------------------
Training fold 3 with 1365 features...
Our fold 3 CV score is 0.7675994610899493
 
--------------------------------------------------
Training fold 4 with 1365 features...
Our fold 4 CV score is 0.7659287993987016
Our out of folds CV score is 0.7657208944864091
