In [1]:
#Regular Importing
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import f1_score, make_scorer
from sklearn.preprocessing import StandardScaler

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [2]:
%%capture output
# Keras Importing
!pip install tensorflow
!pip install keras
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [3]:
%%capture output
# Importing for seeing the missing Data
!pip install missingno
import missingno as msno

## Does TabNet Do Better Than Keras?

In [4]:
%%capture output
# Tabnet Importing
!pip install pytorch_tabnet
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetClassifier
import pytorch_tabnet
import torch

In [5]:
# Data Loading
# Read the test and training CSVs from the working directory (test first,
# then train), then preview the first rows of the training frame.
testDf, trainDf = (pd.read_csv(csv_name) for csv_name in ("WFTest.csv", "WFTrain.csv"))
trainDf.head()

Unnamed: 0,TRAN_AMT,ACCT_PRE_TRAN_AVAIL_BAL,CUST_AGE,OPEN_ACCT_CT,WF_dvc_age,PWD_UPDT_TS,CARR_NAME,RGN_NAME,STATE_PRVNC_TXT,ALERT_TRGR_CD,...,CUST_STATE,PH_NUM_UPDT_TS,CUST_SINCE_DT,TRAN_TS,TRAN_DT,ACTN_CD,ACTN_INTNL_TXT,TRAN_TYPE_CD,ACTVY_DT,FRAUD_NONFRAUD
0,5.38,23619.91,47,4,2777,1/16/2018 11:3:58,cox communications inc.,southwest,nevada,MOBL,...,NV,2/24/2021 15:55:10,1993-01-06 00:00:00,5/3/2021 18:3:58,5/3/2021,SCHPMT,P2P_COMMIT,P2P,5/3/2021,Non-Fraud
1,65.19,0.0,45,5,2721,,charter communications,southwest,california,MOBL,...,CA,,1971-01-07 00:00:00,1/13/2021 19:19:37,1/13/2021,SCHPMT,P2P_COMMIT,P2P,1/13/2021,Non-Fraud
2,54.84,34570.63,36,8,1531,12/22/2021 10:42:51,utah broadband llc,mountain,utah,ONLN,...,MD,5/5/2019 1:8:39,1994-02-01 00:00:00,4/8/2021 9:42:51,4/8/2021,SCHPMT,P2P_COMMIT,P2P,4/8/2021,Fraud
3,0.01,0.0,62,3,835,2/8/2020 7:28:31,t-mobile usa inc.,southwest,california,MOBL,...,NV,2/16/2019 6:45:37,2001-11-01 00:00:00,8/10/2021 15:28:31,8/10/2021,SCHPMT,P2P_COMMIT,P2P,8/10/2021,Non-Fraud
4,497.08,12725.18,81,2,1095,12/28/2020 12:12:44,cogent communications,south central,texas,MOBL,...,UT,5/8/2020 10:27:6,1987-02-07 00:00:00,6/27/2021 11:12:44,6/27/2021,SCHPMT,P2P_COMMIT,P2P,6/27/2021,Fraud


In [6]:
# Setting up the Features
# Builds the model input matrix: raw numeric columns + one-hot encoded
# categorical columns, with missing values replaced by the column mean.
# NOTE(review): CUST_ZIP is fed in as a continuous numeric feature - confirm
# that this is intended rather than treating it as a category.
numerical = ['TRAN_AMT', 'ACCT_PRE_TRAN_AVAIL_BAL','CUST_AGE',
             'OPEN_ACCT_CT', 'WF_dvc_age', 'CUST_ZIP']
categorical = ['CARR_NAME', 'RGN_NAME', 'STATE_PRVNC_TXT', 'ALERT_TRGR_CD',
                  'DVC_TYPE_TXT', 'AUTHC_PRIM_TYPE_CD', 'AUTHC_SCNDRY_STAT_TXT',
                  'CUST_STATE','ACTN_CD','ACTN_INTNL_TXT','TRAN_TYPE_CD']
# dtype=float keeps the indicator columns numeric on every pandas version.
# pandas >= 2.0 defaults to bool indicators, and a frame mixing float and bool
# columns converts to an object-dtype array in .to_numpy(), which the
# downstream models cannot consume.
X_cat = pd.get_dummies(trainDf[categorical], dtype=float)
X_num = trainDf[numerical]
X = pd.concat([X_num, X_cat], axis = 1)
# Mean imputation: each NaN is replaced by the mean of its own column.
X_mean_imputed = X.fillna(X.mean())
X_mean_imputed_numpy = X_mean_imputed.to_numpy()
X_mean_imputed_numpy

array([[5.380000e+00, 2.361991e+04, 4.700000e+01, ..., 1.000000e+00,
        1.000000e+00, 1.000000e+00],
       [6.519000e+01, 0.000000e+00, 4.500000e+01, ..., 1.000000e+00,
        1.000000e+00, 1.000000e+00],
       [5.484000e+01, 3.457063e+04, 3.600000e+01, ..., 1.000000e+00,
        1.000000e+00, 1.000000e+00],
       ...,
       [4.930000e+02, 2.848630e+03, 5.400000e+01, ..., 1.000000e+00,
        1.000000e+00, 1.000000e+00],
       [4.916400e+02, 3.163250e+03, 2.100000e+01, ..., 1.000000e+00,
        1.000000e+00, 1.000000e+00],
       [6.020000e+00, 0.000000e+00, 6.000000e+01, ..., 1.000000e+00,
        1.000000e+00, 1.000000e+00]])

In [7]:
# Getting the Y values
# Turn the FRAUD_NONFRAUD strings into integer class codes. Judging by the
# displayed output next to the first training rows, "Non-Fraud" becomes 1 and
# "Fraud" becomes 0.
Y = trainDf["FRAUD_NONFRAUD"]
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(Y)  # fit and encode in a single call
encoded_Y

array([1, 1, 0, ..., 0, 0, 1])

In [8]:
# TabNet needs an explicit validation set. Keras could wrap TabNet and run
# cross-validation instead, but it is not clear that is needed.
# 30% of the rows are held out; random_state pins the shuffle.
(x_train, x_val,
 y_train, y_val) = train_test_split(
    X_mean_imputed_numpy,
    encoded_Y,
    random_state=8,
    test_size=0.30,
)
x_train

array([[1.00000e-02, 0.00000e+00, 2.10000e+01, ..., 1.00000e+00,
        1.00000e+00, 1.00000e+00],
       [1.00000e-02, 1.01590e+03, 2.90000e+01, ..., 1.00000e+00,
        1.00000e+00, 1.00000e+00],
       [1.00000e-02, 0.00000e+00, 4.20000e+01, ..., 1.00000e+00,
        1.00000e+00, 1.00000e+00],
       ...,
       [4.62700e+01, 0.00000e+00, 4.90000e+01, ..., 1.00000e+00,
        1.00000e+00, 1.00000e+00],
       [8.26600e+01, 1.84563e+03, 3.70000e+01, ..., 1.00000e+00,
        1.00000e+00, 1.00000e+00],
       [8.70000e-01, 8.64068e+03, 7.40000e+01, ..., 1.00000e+00,
        1.00000e+00, 1.00000e+00]])

In [9]:
# TabNetPretrainer
# Unsupervised pre-training on the training features only (no labels are
# passed); the validation features are used as the evaluation set.
unsupervised_model = TabNetPretrainer(
    mask_type='entmax',  # alternative: "sparsemax"
    optimizer_params={"lr": 2e-2},
    optimizer_fn=torch.optim.Adam,
)

unsupervised_model.fit(
    x_train,
    eval_set=[x_val],
    pretraining_ratio=0.8,
    batch_size=1024,
    virtual_batch_size=128,
    max_epochs=500,
    patience=50,
    drop_last=False,
    num_workers=0,
)

# Optional sanity check, left disabled.
# NOTE(review): the original comment referred to `unsupervised_model_no_preproc`,
# which is not defined in this notebook; `unsupervised_model` is presumably meant.
# reconstructed_X, embedded_X = unsupervised_model.predict(x_val)
# assert(reconstructed_X.shape==embedded_X.shape)

Device used : cpu
epoch 0  | loss: 149399290.26347| val_0_unsup_loss: 3893763.75|  0:00:06s
epoch 1  | loss: 1427506.32939| val_0_unsup_loss: 97364.42969|  0:00:11s
epoch 2  | loss: 178502.06379| val_0_unsup_loss: 493305.59375|  0:00:16s
epoch 3  | loss: 87639.77717| val_0_unsup_loss: 970.84985|  0:00:21s
epoch 4  | loss: 83132.60763| val_0_unsup_loss: 5895.03027|  0:00:26s
epoch 5  | loss: 68334.74405| val_0_unsup_loss: 1597.69641|  0:00:32s
epoch 6  | loss: 49499.34829| val_0_unsup_loss: 430.88806|  0:00:37s
epoch 7  | loss: 42426.19051| val_0_unsup_loss: 7846.76514|  0:00:42s
epoch 8  | loss: 17970.08185| val_0_unsup_loss: 65.60343|  0:00:47s
epoch 9  | loss: 29841.2788| val_0_unsup_loss: 2391.82202|  0:00:53s
epoch 10 | loss: 41208.95511| val_0_unsup_loss: 8147.67432|  0:00:58s
epoch 11 | loss: 6907.6766| val_0_unsup_loss: 278946.75|  0:01:04s
epoch 12 | loss: 4198.52499| val_0_unsup_loss: 10026666.0|  0:01:11s
epoch 13 | loss: 1830.67996| val_0_unsup_loss: 432816.6875|  0:01:17s
e

In [10]:
# Supervised TabNet classifier, initialised from the pre-trained model above.
clf = TabNetClassifier(
    mask_type='sparsemax',  # This will be overwritten if using pretrain model
    optimizer_fn=torch.optim.Adam,
    optimizer_params={"lr": 2e-2},
    # how to use learning rate scheduler
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    scheduler_params=dict(step_size=10, gamma=0.9),
)

# AUC is tracked on both the training and the validation split.
# NOTE(review): weights=1 presumably requests pytorch-tabnet's automatic
# class-balanced sampling - confirm against the library documentation.
clf.fit(
    X_train=x_train,
    y_train=y_train,
    eval_set=[(x_train, y_train), (x_val, y_val)],
    eval_name=['train', 'valid'],
    eval_metric=['auc'],
    from_unsupervised=unsupervised_model,
    weights=1,
    batch_size=1024,
    virtual_batch_size=128,
    max_epochs=500,
    patience=20,
    drop_last=False,
    num_workers=0,
)

Device used : cpu
Loading weights from unsupervised pretraining




epoch 0  | loss: 0.70283 | train_auc: 0.57719 | valid_auc: 0.56602 |  0:00:08s
epoch 1  | loss: 0.68873 | train_auc: 0.61878 | valid_auc: 0.62465 |  0:00:16s
epoch 2  | loss: 0.6715  | train_auc: 0.68693 | valid_auc: 0.68856 |  0:00:28s
epoch 3  | loss: 0.62049 | train_auc: 0.82575 | valid_auc: 0.82119 |  0:00:39s
epoch 4  | loss: 0.55308 | train_auc: 0.85003 | valid_auc: 0.83134 |  0:00:48s
epoch 5  | loss: 0.50318 | train_auc: 0.87071 | valid_auc: 0.8463  |  0:00:57s
epoch 6  | loss: 0.46945 | train_auc: 0.89226 | valid_auc: 0.86954 |  0:01:07s
epoch 7  | loss: 0.44369 | train_auc: 0.90446 | valid_auc: 0.87891 |  0:01:17s
epoch 8  | loss: 0.41996 | train_auc: 0.9029  | valid_auc: 0.87476 |  0:01:26s
epoch 9  | loss: 0.40872 | train_auc: 0.92269 | valid_auc: 0.89487 |  0:01:37s
epoch 10 | loss: 0.34585 | train_auc: 0.92864 | valid_auc: 0.90396 |  0:01:47s
epoch 11 | loss: 0.31113 | train_auc: 0.93556 | valid_auc: 0.90695 |  0:01:56s
epoch 12 | loss: 0.31529 | train_auc: 0.94278 | vali

In [11]:
def f1_m(y_true, y_pred):
    """Batch-wise F1 score written with Keras backend ops.

    The original body called `precision_m`, `recall_m` and `K`, none of which
    is defined or imported anywhere in this notebook, so the function raised
    NameError on a fresh kernel. The precision/recall computation is inlined
    here so the cell stands alone.

    NOTE(review): the inlined formulas are the usual rounded-and-clipped
    true-positive counts; confirm they match what the missing helpers did.

    Args:
        y_true: tensor of ground-truth labels (expected to be 0/1).
        y_pred: tensor of predicted scores; values are clipped to [0, 1] and
            rounded before counting.

    Returns:
        Scalar tensor: 2 * P * R / (P + R + epsilon).
    """
    # Local import: no earlier cell binds `K`.
    from keras import backend as K

    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))

    # epsilon keeps the divisions finite when a batch has no positives.
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [18]:
# Predicting the Value
# Score the validation features and keep column 1 of the probability matrix.
# NOTE(review): the label-encoding output shows "Non-Fraud" encoded as 1, so this
# is presumably P(Non-Fraud), not P(Fraud) - confirm against clf.classes_.
x_val_pred = clf.predict_proba(x_val)[:, 1]

In [33]:
# Getting it into a form that'll be gradeable
# Scores strictly above the 0.25 cut-off become label 1, everything else 0;
# predictions and ground truth are then wrapped in single-column DataFrames.
x_val_pred_label = (x_val_pred > 0.25).astype(int)
y_val_df = pd.DataFrame(y_val)
x_val_pred_label_df = pd.DataFrame(x_val_pred_label)

In [34]:
# f1_score scores the class labelled 1 by default. The label-encoding output
# shows "Non-Fraud" encoded as 1, so this first number is the Non-Fraud F1.
f1s = f1_score(y_val_df, x_val_pred_label_df)
print("Baseline: %.2f%%" % (f1s*100))

# Also report F1 with the Fraud class as the positive label. The code is looked
# up through the fitted encoder rather than hard-coded.
fraud_label = encoder.transform(["Fraud"])[0]
f1s_fraud = f1_score(y_val_df, x_val_pred_label_df, pos_label=fraud_label)
print("Baseline (Fraud class): %.2f%%" % (f1s_fraud*100))

Baseline: 91.51%
