## 5819 Project -- Using BRF on Imbalanced Data

Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.utils import resample
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, precision_score, recall_score

### Load Dataset from Google Drive


[link to dataset](https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud?resource=download)

In [2]:
from google.colab import drive

# Path of the credit-card fraud CSV inside the mounted Google Drive.
file_path = '/content/drive/MyDrive/5819 Project/creditcard.csv'

# Mount Google Drive into the Colab filesystem so that `file_path` resolves.
drive.mount('/content/drive')

Mounted at /content/drive


This dataset is stored in a CSV file

We will convert it to a pandas DataFrame in order to analyze & manipulate the data

In [3]:
# Read the whole CSV into memory, then report its dimensions.
df = pd.read_csv(file_path)
n_rows, n_cols = df.shape
print(f"Number of rows: {n_rows}")
print(f"Number of columns: {n_cols}")

Number of rows: 284807
Number of columns: 31


Initialize:
- Design Matrix X (the features)
- Label Vector, y (ground truth)

In [4]:
# Label vector: the ground-truth 'Class' column.
y = df['Class']
# Design matrix: every remaining column.
X = df.drop(columns='Class')

10-Fold CV -- following methodology from paper
- StratifiedKFold --> keeps the class ratio the same in every fold

In [5]:
# 10 shuffled, stratified folds; the fixed seed keeps the split identical across runs.
skf = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

BRF Algorithm (according to the paper):
1. For each iteration in RF, draw a bootstrap sample from the minority class. Then randomly draw the same number of cases, with replacement, from the majority class
2. Induce a classification tree from the data to maximum size, without pruning. The tree is induced with the CART algorithm, with the following modification: At each node, instead of searching through all variables for the optimal split, only search through a set of mtry randomly selected variables
3. Repeat the two steps above for the number of times desired. Aggregate the predictions of the ensemble and make the final prediction


In [7]:
# Number of trees grown per forest [STEP 3].
  # scikit-learn's default is 100 (the paper does not specify how many they used)
num_trees = 100

# Single seeded generator that supplies the seed of every bootstrap draw and every tree.
  # Without it, each run of this cell produced different forests and different metrics.
rng = np.random.default_rng(42)


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# One dict of metrics per fold (acc_pos, acc_neg, precision, f1, g_mean, weighted_accuracy).
perf_metrics = []

# 10-fold cross-validation: a complete forest is trained and evaluated once per fold.
for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    # train on 9 of the 10 folds, test on the remaining one
      # (over the 10 iterations every data point is used for testing exactly once)
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Labeled training frame, split by class ONCE per fold.
      # The split does not depend on the tree, so it is hoisted out of the tree loop.
    train_data = pd.concat([X_train, y_train], axis=1)
    df_minority = train_data[train_data['Class'] == 1]
    df_majority = train_data[train_data['Class'] == 0]
    n_minority = len(df_minority)

    # STEP 3 --> repeat steps 1 & 2 once per tree
    brf_trees = []
    for _ in range(num_trees):
        # STEP 1: bootstrap the minority class, then draw the same number of
          # majority cases, both with replacement (replace=True)
        boot_min = resample(
            df_minority, replace=True, n_samples=n_minority,
            random_state=int(rng.integers(2**31 - 1)),
        )
        boot_maj = resample(
            df_majority, replace=True, n_samples=n_minority,
            random_state=int(rng.integers(2**31 - 1)),
        )

        # Artificially balanced sample (the majority class is down-sampled).
        bootstrap_sample = pd.concat([boot_min, boot_maj])
        X_boot = bootstrap_sample.drop('Class', axis=1)
        y_boot = bootstrap_sample['Class']

        # STEP 2: grow an unpruned tree; at each node only mtry random features are searched
          # [we use mtry = sqrt(p) --> default convention from Breiman's original RF paper (2001)]
        tree = DecisionTreeClassifier(
            max_features='sqrt', random_state=int(rng.integers(2**31 - 1))
        )
        tree.fit(X_boot, y_boot)
        brf_trees.append(tree)

    # The forest for this fold is trained; now predict the held-out TEST set.
    # preds has one row of predictions per tree.
    preds = np.array([tree.predict(X_test) for tree in brf_trees])

    # Majority vote: label 1 only when MORE than half of the trees vote 1.
      # An exact tie goes to class 0, which is also what np.round(0.5) == 0 gave.
    y_pred = (preds.mean(axis=0) > 0.5).astype(int)

    # labels=[0, 1] guarantees a 2x2 matrix even if a class is absent,
      # so the four-way unpacking below cannot fail.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()

    # Metrics in accordance with the paper; undefined ratios are reported as 0.0.
    acc_pos = _safe_ratio(tp, tp + fn)  # [ Acc⁺ = Recall ]
    acc_neg = _safe_ratio(tn, tn + fp)  # [ Acc⁻ = Specificity ]
    precision = _safe_ratio(tp, tp + fp)
    recall = acc_pos
    f_measure = _safe_ratio(2 * precision * recall, precision + recall)
    g_mean = (acc_pos * acc_neg) ** 0.5
    weighted_accuracy = 0.5 * (acc_pos + acc_neg)

    # Record this fold's metrics.
    perf_metrics.append({
        'acc_pos': acc_pos,
        'acc_neg': acc_neg,
        'precision': precision,
        'f1': f_measure,
        'g_mean': g_mean,
        'weighted_accuracy': weighted_accuracy
    })

Now that we have measures for the performance metrics across all 10 folds, we average them and print the results as a percentage (in accordance with the paper)

In [12]:
def avg(metric):
    """Return the mean of `metric` over all per-fold entries in `perf_metrics`."""
    per_fold_values = [fold_result[metric] for fold_result in perf_metrics]
    return np.mean(per_fold_values)

# (label, metric key) pairs in display order; labels are pre-padded so the values line up.
report_rows = [
    ("Acc⁺ (Recall):        ", 'acc_pos'),
    ("Acc⁻ (Specificity):   ", 'acc_neg'),
    ("Precision:            ", 'precision'),
    ("F1-Score:             ", 'f1'),
    ("G-Mean:               ", 'g_mean'),
    ("Weighted Accuracy:    ", 'weighted_accuracy'),
]

print("BRF Metrics \n----------------------------")
# Each metric is the mean over folds, shown as a percentage with two decimals.
for label, metric_key in report_rows:
    print(f"{label}{avg(metric_key)*100:.2f}%")

BRF Metrics 
----------------------------
Acc⁺ (Recall):        89.02%
Acc⁻ (Specificity):   99.07%
Precision:            14.25%
F1-Score:             24.54%
G-Mean:               93.89%
Weighted Accuracy:    94.04%
