# Credit Card Fraud Detection with AutoXGB
## Part 1 - Data Acquisition and Processing
___

### (i) Setup dependencies and dataset

In [1]:
# Library imports
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import time
import pickle
import datetime
import shutil

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)

#### Download raw data
- Data source: https://github.com/Fraud-Detection-Handbook/simulated-data-transformed

In [2]:
# Download raw data
dst_folder = 'data/raw'

if not os.path.exists(dst_folder):
    os.makedirs(dst_folder)
    !git clone https://github.com/Fraud-Detection-Handbook/simulated-data-transformed/ data/raw

In [3]:
# Sanity check: show the daily pickle files found under <dst_folder>/data
os.listdir(f'{dst_folder}/data')

['2018-04-01.pkl',
 '2018-04-02.pkl',
 '2018-04-03.pkl',
 '2018-04-04.pkl',
 '2018-04-05.pkl',
 '2018-04-06.pkl',
 '2018-04-07.pkl',
 '2018-04-08.pkl',
 '2018-04-09.pkl',
 '2018-04-10.pkl',
 '2018-04-11.pkl',
 '2018-04-12.pkl',
 '2018-04-13.pkl',
 '2018-04-14.pkl',
 '2018-04-15.pkl',
 '2018-04-16.pkl',
 '2018-04-17.pkl',
 '2018-04-18.pkl',
 '2018-04-19.pkl',
 '2018-04-20.pkl',
 '2018-04-21.pkl',
 '2018-04-22.pkl',
 '2018-04-23.pkl',
 '2018-04-24.pkl',
 '2018-04-25.pkl',
 '2018-04-26.pkl',
 '2018-04-27.pkl',
 '2018-04-28.pkl',
 '2018-04-29.pkl',
 '2018-04-30.pkl',
 '2018-05-01.pkl',
 '2018-05-02.pkl',
 '2018-05-03.pkl',
 '2018-05-04.pkl',
 '2018-05-05.pkl',
 '2018-05-06.pkl',
 '2018-05-07.pkl',
 '2018-05-08.pkl',
 '2018-05-09.pkl',
 '2018-05-10.pkl',
 '2018-05-11.pkl',
 '2018-05-12.pkl',
 '2018-05-13.pkl',
 '2018-05-14.pkl',
 '2018-05-15.pkl',
 '2018-05-16.pkl',
 '2018-05-17.pkl',
 '2018-05-18.pkl',
 '2018-05-19.pkl',
 '2018-05-20.pkl',
 '2018-05-21.pkl',
 '2018-05-22.pkl',
 '2018-05-23

#### Create train and test datasets
- ~2 months period for **train** set: 2018-07-01 to 2018-09-10
- 1 week Feedback Delay between train and test set: 2018-09-11 to 2018-09-17
- 1 week period for **test** set: 2018-09-18 to 2018-09-24

*Details on Feedback Delay Period*
- We choose our test set to take place one week after the last transaction of the training set. In a fraud detection context, this period separating the training and test set is referred to as the delay period or feedback delay. 
- It accounts for the fact that, in a real-world fraud detection system, the label of a transaction (fraudulent or genuine) is only known after a customer complaint, or thanks to the result of a fraud investigation. 
- Therefore, in a realistic scenario, the annotated data available to train a model and start making predictions for a given day predate that day minus the delay period.
- A one-week delay is, to a first approximation, a reasonable basis. From experience, statistics generally show that most of the feedback becomes available after a one week delay and empirically, it is also the point where the negative effect of considering that zero feedback is available before and the positive effect of considering that all the feedback is available after cancel each other out.

In [4]:
# Define date range
# Each entry is a 'YYYY-MM-DD' string that doubles as the stem of a daily pickle file.
# The week between the two windows (2018-09-11 to 2018-09-17) is the feedback delay
# and is intentionally absent from both lists.
date_format = '%Y-%m-%d'

# 'D' is the canonical calendar-day frequency alias; the lowercase 'd' spelling
# is deprecated in recent pandas releases.
train_date_range = pd.date_range('2018-07-01', '2018-09-10', freq='D').strftime(date_format).tolist()
test_date_range = pd.date_range('2018-09-18', '2018-09-24', freq='D').strftime(date_format).tolist()

In [5]:
# Generate train dataframe
# Read each daily pickle once and stack them with a single concat.
# `DataFrame.append` was removed in pandas 2.0, and growing a frame inside a
# loop re-copies all accumulated rows on every iteration.
# The original row index is kept (no ignore_index), as before.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
train_df = pd.concat(
    [pd.read_pickle(f'{dst_folder}/data/{date}.pkl') for date in train_date_range]
)

len(train_df)

689598

In [6]:
# Generate test dataframe
# Read each daily pickle once and stack them with a single concat.
# `DataFrame.append` was removed in pandas 2.0, and growing a frame inside a
# loop re-copies all accumulated rows on every iteration.
# The original row index is kept (no ignore_index), as before.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
test_df = pd.concat(
    [pd.read_pickle(f'{dst_folder}/data/{date}.pkl') for date in test_date_range]
)

len(test_df)

67416

___
### (iii) Data-Preprocessing
- Baseline feature transformation already done on the simulated raw dataset based on: https://fraud-detection-handbook.github.io/fraud-detection-handbook/Chapter_3_GettingStarted/BaselineFeatureTransformation.html

In [7]:
# Preview the first rows of the training frame before any column selection
train_df.head()

Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS,TX_FRAUD,TX_FRAUD_SCENARIO,TX_DURING_WEEKEND,TX_DURING_NIGHT,CUSTOMER_ID_NB_TX_1DAY_WINDOW,CUSTOMER_ID_AVG_AMOUNT_1DAY_WINDOW,CUSTOMER_ID_NB_TX_7DAY_WINDOW,CUSTOMER_ID_AVG_AMOUNT_7DAY_WINDOW,CUSTOMER_ID_NB_TX_30DAY_WINDOW,CUSTOMER_ID_AVG_AMOUNT_30DAY_WINDOW,TERMINAL_ID_NB_TX_1DAY_WINDOW,TERMINAL_ID_RISK_1DAY_WINDOW,TERMINAL_ID_NB_TX_7DAY_WINDOW,TERMINAL_ID_RISK_7DAY_WINDOW,TERMINAL_ID_NB_TX_30DAY_WINDOW,TERMINAL_ID_RISK_30DAY_WINDOW
872795,872795,2018-07-01 00:02:06,4984,425,74.37,7862526,91,0,0,1,1,8.0,55.13125,21.0,52.041905,68.0,53.391618,1.0,0.0,4.0,0.0,27.0,0.037037
872796,872797,2018-07-01 00:02:34,4357,4656,109.96,7862554,91,0,0,1,1,4.0,107.98,26.0,93.211154,109.0,97.145046,1.0,0.0,5.0,0.0,43.0,0.0
872797,872796,2018-07-01 00:02:34,2311,2211,18.26,7862554,91,0,0,1,1,2.0,14.56,10.0,11.06,74.0,11.367297,1.0,0.0,4.0,0.0,18.0,0.0
872798,872798,2018-07-01 00:02:59,1373,934,44.44,7862579,91,0,0,1,1,3.0,95.513333,23.0,85.214783,86.0,91.759302,2.0,0.0,8.0,0.0,29.0,0.0
872799,872799,2018-07-01 00:03:28,3908,857,24.75,7862608,91,0,0,1,1,1.0,24.75,2.0,37.285,13.0,36.265385,0.0,0.0,5.0,0.0,32.0,0.03125


In [8]:
# Label column: the fraud flag whose class balance is inspected below
target = 'TX_FRAUD'

# Model input columns, listed one per line and grouped by origin
predictors = [
    # transaction-level attributes
    'TX_AMOUNT',
    'TX_DURING_WEEKEND',
    'TX_DURING_NIGHT',
    # customer aggregates over 1 / 7 / 30 day windows
    'CUSTOMER_ID_NB_TX_1DAY_WINDOW',
    'CUSTOMER_ID_AVG_AMOUNT_1DAY_WINDOW',
    'CUSTOMER_ID_NB_TX_7DAY_WINDOW',
    'CUSTOMER_ID_AVG_AMOUNT_7DAY_WINDOW',
    'CUSTOMER_ID_NB_TX_30DAY_WINDOW',
    'CUSTOMER_ID_AVG_AMOUNT_30DAY_WINDOW',
    # terminal aggregates over 1 / 7 / 30 day windows
    'TERMINAL_ID_NB_TX_1DAY_WINDOW',
    'TERMINAL_ID_RISK_1DAY_WINDOW',
    'TERMINAL_ID_NB_TX_7DAY_WINDOW',
    'TERMINAL_ID_RISK_7DAY_WINDOW',
    'TERMINAL_ID_NB_TX_30DAY_WINDOW',
    'TERMINAL_ID_RISK_30DAY_WINDOW',
]

In [9]:
# Narrow both frames to the model inputs followed by the label column
model_columns = [*predictors, target]
train_df = train_df[model_columns]
test_df = test_df[model_columns]

In [10]:
# Assess degree of imbalance in target label
train_df[target].value_counts(dropna=False)

0    683458
1      6140
Name: TX_FRAUD, dtype: int64

Significant degree of target label imbalance (Fraud cases only 0.9% of entire dataset)

#### Perform sampling strategy
1. SMOTE oversampling to raise the minority class to 10% of the majority class (`sampling_strategy=0.1` is a minority:majority ratio), then
2. Random undersampling to make minority 50% of majority class

Final sampled ratio will be majority:minority = 2:1

In [11]:
# Define sampling techniques
# Both samplers are stochastic. Fixing `random_state` makes a Restart & Run All
# produce the same resampled training set (and therefore the same saved
# train.csv) every time.
RANDOM_STATE = 42

# A float `sampling_strategy` is the desired minority/majority ratio after resampling:
#   over  -> add synthetic minority rows until minority = 0.1 x majority
#   under -> drop majority rows until minority = 0.5 x majority
over = SMOTE(sampling_strategy=0.1, random_state=RANDOM_STATE)
under = RandomUnderSampler(sampling_strategy=0.5, random_state=RANDOM_STATE)

# Order matters: oversample first, then undersample
steps = [('over', over), ('under', under)]
pipeline = Pipeline(steps=steps)

In [12]:
# Split the training frame into the feature matrix (X) and the label vector (y)
X_train, y_train = train_df[predictors], train_df[target]

In [13]:
# Execute sampling strategy
# Resample directly from `train_df` rather than rebinding X_train / y_train
# from themselves. With the self-referential form, running this cell a second
# time feeds the already-resampled data back into the pipeline. Reading from
# `train_df` keeps the cell idempotent while leaving the output names unchanged.
X_train, y_train = pipeline.fit_resample(train_df[predictors], train_df[target])

In [14]:
# Number of training rows after over- and under-sampling
len(X_train)

205035

In [15]:
# Class counts after resampling, to confirm the intended majority:minority ratio
y_train.value_counts()

0    136690
1     68345
Name: TX_FRAUD, dtype: int64

### Save Processed Datasets

In [16]:
processed_folder = 'data/processed'
# exist_ok=True replaces the separate exists-check: one call, no window between
# the check and the creation, and safe to re-run.
os.makedirs(processed_folder, exist_ok=True)

# The test set is written with its original class balance - only the training
# data goes through the resampling pipeline.
test_df.to_csv(f'{processed_folder}/test.csv', index=False)

In [17]:
# Re-attach the resampled labels to the resampled features and persist the
# result as the processed training set.
label_column = ['TX_FRAUD']
y_train_df = pd.DataFrame(data=y_train, columns=label_column)

# Place the label frame to the right of the feature frame
frames = [X_train, y_train_df]
train_processed = pd.concat(frames, axis='columns')

train_path = f'{processed_folder}/train.csv'
train_processed.to_csv(train_path, index=False)

#### References
- https://www.kaggle.com/mlg-ulb/creditcardfraud
- https://fraud-detection-handbook.github.io/fraud-detection-handbook/Chapter_References/shared_functions.html
- https://www.kaggle.com/janiobachmann/credit-fraud-dealing-with-imbalanced-datasets
- https://github.com/Fraud-Detection-Handbook/simulated-data-raw/tree/main/data