In [1]:
import pandas as pd
from utils import Timer
import featuretools as ft
import pandas as pd
import numpy as np
from woodwork.logical_types import Categorical, Datetime, Double
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import Boolean, BooleanNullable
import lightgbm as lgbm



In [2]:
# Location of the raw card-transaction CSV.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
file = "/mnt/DP_disk1/ht/datasets/autofe/fraud_detect/card_transaction.v1.csv"
with Timer("read train data"):
    df =  pd.read_csv(file)
# Strip the currency symbol from the string-typed 'Amount' column.
# regex=False forces a literal match: as a regular expression '$' is the
# end-of-string anchor, and the default value of `regex` differs between pandas
# versions, so relying on the default is ambiguous and can emit a warning.
df['Amount'] = df['Amount'].str.replace('$', '', regex=False)
df

read train data took 24.26013085618615 sec


  df['Amount'] = df['Amount'].str.replace('$', '')


Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?
0,0,0,2002,9,1,06:21,134.09,Swipe Transaction,3527213246127876953,La Verne,CA,91750.0,5300,,No
1,0,0,2002,9,1,06:42,38.48,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No
2,0,0,2002,9,2,06:22,120.34,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No
3,0,0,2002,9,2,17:45,128.95,Swipe Transaction,3414527459579106770,Monterey Park,CA,91754.0,5651,,No
4,0,0,2002,9,3,06:23,104.71,Swipe Transaction,5817218446178736267,La Verne,CA,91750.0,5912,,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24386895,1999,1,2020,2,27,22:23,-54.00,Chip Transaction,-5162038175624867091,Merrimack,NH,3054.0,5541,,No
24386896,1999,1,2020,2,27,22:24,54.00,Chip Transaction,-5162038175624867091,Merrimack,NH,3054.0,5541,,No
24386897,1999,1,2020,2,28,07:43,59.15,Chip Transaction,2500998799892805156,Merrimack,NH,3054.0,4121,,No
24386898,1999,1,2020,2,28,20:10,43.12,Chip Transaction,2500998799892805156,Merrimack,NH,3054.0,4121,,No


In [3]:
# Woodwork logical types applied to the raw transaction columns when the frame
# is registered with the EntitySet.
# NOTE(review): the frame preview shows 'Time' as HH:MM strings with no date part,
# so typing it as Datetime presumably yields timestamps with an arbitrary date and
# the derived DAY(Time)/MONTH(Time)/... features are likely uninformative.
# Consider building a full timestamp from Year/Month/Day/Time. TODO confirm.
feature_logical_types = {
    'User': Categorical, 
    'Card': Categorical,
    'Time': Datetime,
    'Amount': Double,
    'Use Chip': Categorical,
    'Merchant Name': Categorical,
    'Merchant City': Categorical,
    'Merchant State': Categorical,
    'Zip': Categorical,
    'MCC': Categorical,
    'Errors?': Categorical,
    'Is Fraud?': Boolean,
}

with Timer("Load data to entityset"):
    es = ft.EntitySet("fraud_detect")
    # NOTE(review): no 'id' column appears in the frame preview; featuretools
    # presumably creates the integer index itself. Confirm, or pass make_index=True.
    es.add_dataframe(dataframe_name="fraud_detect",
                     dataframe=df,
                     index="id", 
                     logical_types=feature_logical_types,
                     )

with Timer("DFS feature generation"):
    # Deep Feature Synthesis with the library's default primitives.
    ret_df, features = ft.dfs(
        entityset=es, 
        target_dataframe_name="fraud_detect", 
        verbose=True,
        )

# Convert pandas nullable-boolean columns to plain numpy bool.
for col, dtype in ret_df.dtypes.items():
    if isinstance(dtype, pd.BooleanDtype):
        ret_df[col] = ret_df[col].astype(bool)

# Sanitize column names ('?' removed, spaces -> '_') in a single rename.
# Renaming inside the per-column loop copied the whole frame once per column.
ret_df = ret_df.rename(columns=lambda name: name.replace('?', '').replace(' ', '_'))

ret_df.info()



Load data to entityset took 53.65264138393104 sec
Built 18 features
Elapsed: 04:02 | Progress: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████
DFS feature generation took 242.31067793816328 sec
<class 'pandas.core.frame.DataFrame'>
Int64Index: 24386900 entries, 0 to 24386899
Data columns (total 18 columns):
 #   Column          Dtype   
---  ------          -----   
 0   User            category
 1   Card            category
 2   Year            int64   
 3   Month           int64   
 4   Day             int64   
 5   Amount          float64 
 6   Use_Chip        category
 7   Merchant_Name   category
 8   Merchant_City   category
 9   Merchant_State  category
 10  Zip             category
 11  MCC             category
 12  Errors          category
 13  Is_Fraud        bool    
 14  DAY(Time)       category
 15  MONTH(Time)     category
 16  WEEKDAY(T

In [4]:
from sklearn.metrics import average_precision_score, precision_recall_curve, auc
import xgboost as xgb

# Name of the label column in the DFS feature matrix.
target_label = 'Is_Fraud'

# Booster configuration passed to xgb.train.
model_params = dict(
    eval_metric='aucpr',
    objective='binary:logistic',
    tree_method='hist',
    random_state=42,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.6,
    colsample_bytree=0.8,
    min_child_weight=9,
)
# Number of boosting rounds and how often the evaluation metric is printed.
training_params = dict(num_boost_round=1000, verbose_eval=100)

# Chronological split on 'Year': before 2018 -> train, 2018 -> validation,
# after 2018 -> test.
transformed_train_df = ret_df[ret_df['Year'].lt(2018)]
transformed_valid_df = ret_df[ret_df['Year'].eq(2018)]
transformed_test_df = ret_df[ret_df['Year'].gt(2018)]

# Every column except the label (and anything listed in exclude_features) is a model input.
exclude_features = []
features_train = [
    column
    for column in transformed_valid_df.columns
    if column not in [target_label, *exclude_features]
]


def _as_dmatrix(frame):
    """Wrap one split as an xgb.DMatrix with categorical support enabled."""
    return xgb.DMatrix(
        data=frame[features_train],
        label=frame[target_label],
        enable_categorical=True,
    )


dtrain = _as_dmatrix(transformed_train_df)
dvalid = _as_dmatrix(transformed_valid_df)
dtest = _as_dmatrix(transformed_test_df)

# All three splits are scored at each reporting step.
watch_list = [
    (dtrain, 'train'),
    (dvalid, 'eval'),
    (dtest, 'test'),
]
with Timer("XGBoost training"):
    model = xgb.train(
        params=model_params,
        dtrain=dtrain,
        evals=watch_list,
        **training_params,
    )

with Timer("XGBoost test"):
    # Score the held-out years and integrate the precision-recall curve.
    probs = model.predict(dtest)
    precision, recall, _ = precision_recall_curve(transformed_test_df[target_label], probs)
    test_result = auc(recall, precision)
print(f"testing results aucpr on test set is {test_result}")

[0]	train-aucpr:0.64481	eval-aucpr:0.33236	test-aucpr:0.27807
[100]	train-aucpr:0.76455	eval-aucpr:0.89464	test-aucpr:0.83520
[200]	train-aucpr:0.78819	eval-aucpr:0.93552	test-aucpr:0.89875
[300]	train-aucpr:0.80470	eval-aucpr:0.94245	test-aucpr:0.91007
[400]	train-aucpr:0.81513	eval-aucpr:0.94320	test-aucpr:0.90924
[500]	train-aucpr:0.82220	eval-aucpr:0.94431	test-aucpr:0.91122
[600]	train-aucpr:0.82808	eval-aucpr:0.94290	test-aucpr:0.90744
[700]	train-aucpr:0.83386	eval-aucpr:0.94190	test-aucpr:0.90518
[800]	train-aucpr:0.83910	eval-aucpr:0.94147	test-aucpr:0.90352
[900]	train-aucpr:0.84336	eval-aucpr:0.94087	test-aucpr:0.90131
[999]	train-aucpr:0.84736	eval-aucpr:0.94095	test-aucpr:0.90102
XGBoost training took 1417.9510826803744 sec
XGBoost test took 2.2313696779310703 sec
testing results aucpr on test set is 0.9010234711827889
