# Import

In [1]:
import random
import os
from collections import defaultdict
from tqdm import tqdm
import csv
from datetime import datetime


import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from sklearn.metrics import recall_score, classification_report, confusion_matrix, make_scorer
from sklearn.inspection import permutation_importance
import xgboost as xgb
import matplotlib.pyplot as plt


def set_all_seeds(seed):
    """Seed the random sources this notebook relies on.

    Seeds the stdlib ``random`` module and NumPy's legacy global generator,
    and writes the seed into the ``PYTHONHASHSEED`` environment variable.

    Note: ``PYTHONHASHSEED`` is read by the interpreter at start-up, so the
    value set here is only picked up by processes launched afterwards.

    Args:
        seed: Seed value; its ``str()`` form is stored in ``PYTHONHASHSEED``.
    """
    hash_seed_text = str(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = hash_seed_text
    np.random.seed(seed)
    

def log(record, name: str = 'log__testing.csv'):
    """Append one timestamped row to a semicolon-delimited CSV log file.

    Each call adds a single row with two fields: the current
    ``datetime.now()`` value followed by ``record``.

    Args:
        record: Value written as the second field of the row.
        name: Path of the log file; it is opened in append mode.
    """
    with open(name, mode='a', newline='', encoding='utf-8') as log_file:
        row_writer = csv.writer(log_file, dialect='excel', delimiter=';')
        timestamped_row = [datetime.now(), record]
        row_writer.writerow(timestamped_row)


# Single seed reused by the stochastic steps in this notebook
# (row sampling, shuffling and the model's random_state).
random_seed = 1234
# Seed the global RNGs once, before any data handling.
set_all_seeds(random_seed)

# Parameters

In [2]:
# Parquet file that the Preprocessing section loads.
dataset_path = 'data/dataset_after_eda.parquet'
# Label column; the code below treats 1 as fraud and 0 as non-fraud.
target_column = 'target'
# Column parsed with pd.to_datetime and used for the time-based split
# (columns in this dataset are referred to by numeric string names).
datetime_column = '17'
# Cutoff date: fraud rows on or after it go to the test set, earlier ones to training.
time_split = '2025-04-15'

# Testing process

## Functions

In [3]:
def model_report(y_true, y_pred, model_name = '_'):
    """Print a plain-text evaluation summary for binary predictions.

    Always prints the model name, the number of samples, the sum of
    ``y_true`` (reported as the number of fraud sessions) and scikit-learn's
    classification report. When ``y_true`` holds more than one distinct
    label, the confusion-matrix counts and the false negative / false
    positive rates are printed as well; otherwise a short notice is printed.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        model_name: Name shown in the report header.

    Returns:
        None. Everything is written to stdout.
    """
    print('\n---------------------------------------------------------------')
    print(f"Model: {model_name}")
    print(f"Test data length: {len(y_true)}")
    print(f"Fraud sessions in test data: {sum(y_true)}")

    print(classification_report(y_true, y_pred))

    # With a single label in y_true the error rates below are undefined.
    if len(set(y_true)) <= 1:
        print('One label, fpr is out function scope.')
        return

    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    # One print call, same four lines (and leading blank line) as before.
    print(f'\nTN: {tn}\nFP: {fp}\nFN: {fn}\nTP: {tp}')

    false_negative_rate = fn/(fn+tp)
    false_positive_rate = fp/(fp+tn)
    print('False Negative Rate = ',false_negative_rate)
    print('False Positive Rate = ',false_positive_rate)
        

def find_threshold_fpr(y_true, y_pred_proba, target_fpr):
    """Find the score threshold at which the false positive rate reaches a target.

    Samples are walked from the highest score to the lowest while counting
    negatives (``y_true == 0``). The returned threshold is the score of the
    first sample at which ``false positives / number of negatives`` is at
    least ``target_fpr``. Because callers compare with ``>= threshold``, that
    sample itself is included among the predicted positives.

    The rate is measured against the number of negatives (FP / (FP + TN)),
    not against the total number of samples.

    Args:
        y_true: Array-like of binary labels (0 = negative). Lists, NumPy
            arrays and pandas Series are accepted; values are read by position.
        y_pred_proba: Array-like of scores, aligned with ``y_true``.
        target_fpr: Desired false positive rate.

    Returns:
        The threshold score, or ``0.0`` when the target rate is never reached
        (including empty input).
    """
    # Work on plain arrays so indexing is positional even for pandas input.
    labels = np.asarray(y_true)
    scores = np.asarray(y_pred_proba)

    descending = np.argsort(scores)[::-1]
    is_negative = labels[descending] == 0

    # Guard the denominator: with no negatives the rate stays at zero.
    negatives_total = max(int(is_negative.sum()), 1)
    running_fpr = np.cumsum(is_negative) / negatives_total

    reached = np.flatnonzero(running_fpr >= target_fpr)
    if reached.size == 0:
        return 0.0
    return scores[descending[reached[0]]]


def get_recall_on_fpr(y_true, y_proba, target_fpr):
    """Recall obtained when scores are cut at the threshold for a target FPR.

    The threshold comes from ``find_threshold_fpr``; samples scoring at or
    above it are labelled 1, the rest 0. The false positive rate actually
    achieved is then measured from the confusion matrix, and if it is more
    than twice ``target_fpr`` the result is forced to 0.

    Args:
        y_true: Ground-truth binary labels.
        y_proba: Scores aligned with ``y_true``.
        target_fpr: Desired false positive rate.

    Returns:
        The recall at the chosen threshold, or ``0`` when the achieved false
        positive rate exceeds ``2 * target_fpr``.
    """
    cutoff = find_threshold_fpr(y_true, y_proba, target_fpr)
    flagged = np.where(y_proba >= cutoff, 1, 0)
    recall = recall_score(y_true, flagged)

    tn, fp, fn, tp = confusion_matrix(y_true, flagged).ravel()
    achieved_fpr = fp / (fp + tn)

    # Reject thresholds that overshoot the target by more than a factor of two.
    return 0 if achieved_fpr > target_fpr * 2 else recall


def train_test_split_df(X, 
                        y, 
                        class_probabilities={0:0.2,1:0.2}, 
                        shuffle=True,
                        random_state=random_seed):
    """Split ``X`` / ``y`` into train and test parts with a per-class test share.

    For every ``label: share`` pair in ``class_probabilities`` the rows of
    that class are (optionally) shuffled and cut at
    ``int((1 - share) * n_rows_of_class)``: rows before the cut go to the
    training part, the remaining rows to the test part. Classes present in
    ``y`` but missing from ``class_probabilities`` end up in neither part.

    Example: ``{0: 0.02, 1: 0.02}`` sends about 2% of each class to the test set.

    Args:
        X: Feature DataFrame, positionally aligned with ``y``.
        y: Label Series.
        class_probabilities: Mapping of class label to the share of that
            class placed in the test part.
        shuffle: Whether to shuffle each class's rows before cutting.
        random_state: When not ``None``, the global ``random`` module is
            re-seeded with this value before shuffling.

    Returns:
        ``X_train, X_test, y_train, y_test`` with freshly reset indices.
    """
    if random_state is not None:
        random.seed(random_state)

    # Row positions grouped by class label.
    positions_by_class = defaultdict(list)
    for position, label in enumerate(y.to_numpy()):
        positions_by_class[label].append(position)

    train_X_parts, test_X_parts = [], []
    train_y_parts, test_y_parts = [], []

    for label, test_share in class_probabilities.items():
        positions = positions_by_class[label]
        cut = int((1 - test_share) * len(positions))
        if shuffle:
            random.shuffle(positions)

        train_positions = positions[:cut]
        test_positions = positions[cut:]

        train_X_parts.append(X.iloc[train_positions])
        test_X_parts.append(X.iloc[test_positions])
        train_y_parts.append(y.iloc[train_positions])
        test_y_parts.append(y.iloc[test_positions])

    X_train = pd.concat(train_X_parts, ignore_index=True)
    X_test = pd.concat(test_X_parts, ignore_index=True)
    y_train = pd.concat(train_y_parts, ignore_index=True)
    y_test = pd.concat(test_y_parts, ignore_index=True)

    return X_train, X_test, y_train, y_test
    

def make_test(df, target_column, datetime_column, split_date, class_0_share=0.1, random_state=None):
    """Build train and test frames: fraud split by date, non-fraud sampled at random.

    Fraud rows (``target == 1``) dated before ``split_date`` go to the
    training frame; those dated on or after it go to the test frame.
    Non-fraud rows (``target == 0``) are split at random regardless of date:
    a ``class_0_share`` fraction goes to the test frame and the rest to the
    training frame. Rows whose target is neither 0 nor 1 are left out of both.

    The input frame is not modified: the datetime column is parsed into a
    local Series rather than written back into ``df``.

    Args:
        df: Source DataFrame containing the target and datetime columns.
        target_column: Name of the binary label column.
        datetime_column: Name of the column parsed with ``pd.to_datetime``.
        split_date: Cutoff compared against the parsed datetimes.
        class_0_share: Fraction of non-fraud rows sampled into the test frame.
        random_state: Seed for sampling the non-fraud rows. ``None`` (default)
            falls back to the notebook-level ``random_seed``.

    Returns:
        ``(train, test)`` DataFrames with reset indices and without the
        datetime column.
    """
    if random_state is None:
        random_state = random_seed

    # Parse into a local Series so the caller's frame keeps its original dtype.
    event_time = pd.to_datetime(df[datetime_column])

    is_fraud = df[target_column] == 1
    train_fraud = df[(event_time < split_date) & is_fraud]
    test_fraud = df[(event_time >= split_date) & is_fraud]

    clear = df[df[target_column] == 0]
    test_clear = clear.sample(frac=class_0_share, random_state=random_state)
    train_clear = clear.drop(test_clear.index)

    train = pd.concat([train_fraud, train_clear], ignore_index=True).drop(columns=[datetime_column])
    test = pd.concat([test_fraud, test_clear], ignore_index=True).drop(columns=[datetime_column])

    return train, test

## Preprocessing

In [4]:
# Load the dataset from the configured parquet file.
df = pd.read_parquet(dataset_path)
# Columns excluded from modelling; the selection criteria are not shown in this notebook.
cols_to_drop = ['252', '414', '127', '228', '398', '226', '209', '109', '396', '243', '380', '98', '271', '268', '194', '400', '394', '429', '120', '239', '215', '181', '180', '416', '67', '256', '151', '419', '240', '158', '34', '164', '258', '424', '415', '168', '438', '283', '403', '152', '102', '60', '129', '225', '169', '409', '83', '26', '406', '413', '200', '244', '247', '430', '266', '96', '410', '161', '251']
# Extra columns dropped on top of the list above.
# NOTE(review): the name hints at a permutation-based selection — TODO confirm the rationale.
strict_perm = ['142','389','390']
cols_to_drop += strict_perm
# `df` is rebound to the reduced frame; re-running this cell reloads it from disk first.
df = df.drop(columns=cols_to_drop)
# Hold-out test set: fraud rows split at `time_split`, plus 10% of the non-fraud rows.
train_val_df, test_df = make_test(df, 
                                  target_column, 
                                  datetime_column, 
                                  pd.Timestamp(time_split), 
                                  class_0_share=0.1)
# Validation split: 10% of each class of train_val_df is set aside as X_val / y_val.
X_train, X_val, y_train, y_val = train_test_split_df(train_val_df.drop(columns=[target_column]),
                                                     train_val_df[target_column], 
                                                     class_probabilities={0:0.1, 1:0.1}, 
                                                     shuffle=True,
                                                     random_state=random_seed)

## Testing

In [11]:
# Fixed hyperparameters for the final model.
# NOTE(review): presumably the result of an earlier tuning run that is not part of this view — confirm the source.
params = {'n_estimators': 190, 'max_depth': 7, 'learning_rate': 0.557, 'subsample': 1.0, 'colsample_bytree': 0.8, 'scale_pos_weight': 3497.0, 'reg_alpha': 4.16, 'reg_lambda': 8.61, 'gamma': 0.37}

model = xgb.XGBClassifier(**params, random_state=random_seed)
# The final fit uses the whole train+validation frame, not only X_train.
model.fit(train_val_df.drop(columns=[target_column]),
          train_val_df[target_column])
# Positive-class probabilities are cut at 0.5 to obtain hard predictions on the hold-out test set.
y_pred = [x>=0.5 for x in model.predict_proba(test_df.drop(columns=[target_column]))[:, 1]]
model_report(test_df[target_column], y_pred, 'Final model testing')


---------------------------------------------------------------
Model: Final model testing
Test data length: 39730
Fraud sessions in test data: 286.0
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     39444
         1.0       0.66      0.72      0.69       286

    accuracy                           1.00     39730
   macro avg       0.83      0.86      0.84     39730
weighted avg       1.00      1.00      1.00     39730


TN: 39339
FP: 105
FN: 81
TP: 205
False Negative Rate =  0.28321678321678323
False Positive Rate =  0.00266200182537268
