In [None]:
import time

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, average_precision_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from xgboost import XGBClassifier

In [None]:
def downsample(train_normal, train_ano, test, ratio_normal, ratio_ano=None, seed=None):
    """
    Fit an XGBoost classifier on a down-sampled training set and score it on ``test``.

    A fraction ``ratio_normal`` of ``train_normal`` (and, when ``ratio_ano`` is
    given, a fraction ``ratio_ano`` of ``train_ano``) is drawn with
    ``DataFrame.sample``. The two parts are concatenated, the features are
    standardised with a ``StandardScaler`` fitted on that sub-sample only, and
    an ``XGBClassifier`` with 100 estimators is trained on the result.
    The elapsed wall-clock time of fitting plus scoring is printed.

    :param train_normal: DataFrame of normal training rows; must have a "Label" column
    :param train_ano: DataFrame of anomalous training rows; must have a "Label" column
    :param test: DataFrame of test rows; must have a "Label" column
    :param ratio_normal: fraction of ``train_normal`` rows to keep
    :param ratio_ano: fraction of ``train_ano`` rows to keep; ``None`` keeps all of them
    :param seed: ``random_state`` used for sampling; ``None`` is treated as 0
    :return: AUPRC (average precision score) of the fitted model on ``test``
    """
    if seed is None:
        seed = 0

    # downsampling: int() truncates, so the kept row count is rounded down
    n_normal = int(len(train_normal) * ratio_normal)
    sub_normal = train_normal.sample(n_normal, random_state=seed)

    if ratio_ano is not None:
        n_ano = int(len(train_ano) * ratio_ano)
        sub_ano = train_ano.sample(n_ano, random_state=seed)
    else:
        sub_ano = train_ano

    sub_train = pd.concat([sub_normal, sub_ano], ignore_index=True)

    # training
    Y_train = sub_train["Label"].values
    X_train = sub_train.drop(["Label"], axis=1).values
    Y_test = test["Label"].values
    X_test = test.drop(["Label"], axis=1).values

    # the scaler is fitted on the training sub-sample only and then applied to both sets
    scaler = StandardScaler()
    scaler.fit(X_train)

    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    start_time = time.time()

    # fitting
    xgb = XGBClassifier(n_estimators=100)
    xgb.fit(X_train, Y_train)

    # column 1 of predict_proba is used as the score for the positive class (Label == 1)
    pred_proba_xgb = xgb.predict_proba(X_test)[:, 1]
    auprc = average_precision_score(y_true=Y_test, y_score=pred_proba_xgb)

    end_time = time.time()
    time_fit = end_time - start_time
    print(time_fit)

    return auprc

In [None]:
# Directory holding the train/test CSV files, relative to the working directory.
DATA_DIR = '../../Desktop/Swift-PETs/data'
# Fractions of the normal training rows to keep, and the sampling seeds tried per fraction.
RATIOS_NORMAL = [0.9, 0.7, 0.5, 0.3, 0.1]
SEEDS = [0, 103, 204, 345, 4600]

train = pd.read_csv(f'{DATA_DIR}/HA_train.csv')
test = pd.read_csv(f'{DATA_DIR}/HA_test.csv')

# Split the training set by label: 0 -> normal, 1 -> anomaly.
train_normal = train[train['Label'] == 0]
train_ano = train[train['Label'] == 1]

# res maps each normal-ratio to the list of AUPRC values, one per seed.
# NOTE(review): the original call passed `[]` for ratio_ano; assumed to mean
# "do not down-sample anomalies", hence ratio_ano=None — confirm.
res = {}
for r in RATIOS_NORMAL:
    res[r] = [
        downsample(train_normal, train_ano, test, ratio_normal=r, ratio_ano=None, seed=sd)
        for sd in SEEDS
    ]

print(res)