# isolation forest implementation

## imports

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler


## data loader

In [None]:
def dataLoading(path):
    """Read a CSV dataset and separate the label column from the features.

    Args:
        path: Anything ``pandas.read_csv`` accepts (file path or buffer). The
            file must contain a column named ``class``.

    Returns:
        A ``(x, labels)`` tuple of NumPy arrays: ``x`` holds every column
        except ``class``; ``labels`` holds the ``class`` column.

    Raises:
        KeyError: If the file has no ``class`` column.
    """
    frame = pd.read_csv(path)
    # pop() removes the label column from the frame and hands it back, so
    # whatever is left in the frame afterwards is the feature matrix.
    targets = frame.pop('class').values
    features = frame.values
    return features, targets

## main code using sklearn iforest

In [None]:
def run_iforest(config):
    """Evaluate an Isolation Forest ensemble on every CSV file in a directory.

    For each ``*.csv`` file in ``config.input_path`` the data is split
    70/15/15 into train/validation/test parts (stratified on the label) and
    scaled with a ``RobustScaler`` fitted on the training part. One
    ``IsolationForest`` per seed is then fitted on the training rows whose
    label is 0. The per-seed anomaly scores on the test part are averaged and
    evaluated with ROC AUC and average precision. A summary CSV with one row
    per dataset is written to the current working directory.

    Args:
        config: Object exposing
            ``input_path`` (directory containing the CSV files),
            ``n_estimators`` (number of trees per forest) and
            ``random_seeds`` (non-empty sequence of ints; the first one also
            seeds both data splits).
            An optional ``output_csv`` attribute overrides the default
            summary file name.

    Returns:
        pandas.DataFrame with the columns ``dataset``, ``roc`` and ``aupr``
        (the same table that is written to disk).
    """
    all_results = []

    # os.listdir() returns entries in arbitrary order; sort so that the rows
    # of the summary file come out in the same order on every run/platform.
    for fname in sorted(os.listdir(config.input_path)):
        if not fname.endswith('.csv'):
            continue

        name = fname.rsplit('.', 1)[0]
        print(f"\nProcessing: {name}")
        x, y = dataLoading(os.path.join(config.input_path, fname))

        # use first seed for splitting
        seed_for_split = config.random_seeds[0]

        # 70% train, then the remaining 30% is halved into validation/test.
        x_train, x_temp, y_train, y_temp = train_test_split(
            x, y, test_size=0.3, stratify=y, random_state=seed_for_split
        )
        # The validation half is not used below, but the split itself is kept
        # so that the test rows stay exactly the same as before.
        _, x_test, _, y_test = train_test_split(
            x_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=seed_for_split
        )

        # Scaling statistics come from the whole training split (labels 0 and
        # 1 alike) and are then applied unchanged to the test split.
        scaler = RobustScaler()
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)

        # keep only normal samples (label 0) for training
        x_train_clean = x_train[y_train == 0]

        scores_list = []
        for seed in config.random_seeds:
            clf = IsolationForest(
                n_estimators=config.n_estimators,
                random_state=seed
            )
            clf.fit(x_train_clean)
            # decision_function is lower for more abnormal samples; negate it
            # so that a larger score means "more anomalous", matching label 1.
            scores_list.append(-clf.decision_function(x_test))

        # Average the per-seed scores sample by sample.
        avg_scores = np.mean(scores_list, axis=0)

        roc = roc_auc_score(y_test, avg_scores)
        aupr = average_precision_score(y_test, avg_scores)
        print(f"{name}: ROC AUC = {roc:.4f}, AUPR = {aupr:.4f}")

        all_results.append({'dataset': name, 'roc': roc, 'aupr': aupr})

    # Fix the column list explicitly so the file still has a header when the
    # input directory contained no CSV files.
    results_df = pd.DataFrame(all_results, columns=['dataset', 'roc', 'aupr'])
    output_csv_filename = getattr(
        config, 'output_csv', "all_dataset_results_iforest_improved.csv"
    )
    results_df.to_csv(output_csv_filename, index=False)
    print("\nSummary results saved to", output_csv_filename)
    return results_df

# Entry point: build the run configuration and evaluate every dataset.
if __name__ == "__main__":
    class Config:
        """Plain attribute container for the settings read by run_iforest."""

    cfg = Config()
    cfg.input_path = './dataset/'     # directory scanned for *.csv datasets
    cfg.n_estimators = 200            # number of trees in the forest
    cfg.random_seeds = [42, 43, 44]   # one forest per seed; the first seed also seeds the data splits

    # Without this call the script only builds the config and does nothing.
    run_iforest(cfg)



Processing: annthyroid_21feat_normalised
annthyroid_21feat_normalised: ROC AUC = 0.7728, AUPR = 0.2336

Processing: bank-additional-full_normalised
bank-additional-full_normalised: ROC AUC = 0.7325, AUPR = 0.3024

Processing: celeba_baldvsnonbald_normalised
celeba_baldvsnonbald_normalised: ROC AUC = 0.7151, AUPR = 0.0631

Processing: census-income-full-mixed-binarized
census-income-full-mixed-binarized: ROC AUC = 0.6239, AUPR = 0.0753

Processing: creditcardfraud_normalised
creditcardfraud_normalised: ROC AUC = 0.9454, AUPR = 0.1330

Processing: KDD2014_donors_10feat_nomissing_normalised
KDD2014_donors_10feat_nomissing_normalised: ROC AUC = 0.8945, AUPR = 0.2537

Processing: UNSW_NB15_traintest_backdoor
UNSW_NB15_traintest_backdoor: ROC AUC = 0.7673, AUPR = 0.0513

Summary results saved to all_dataset_results_iforest_improved.csv
