In [2]:
#Regular Importing
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import f1_score, make_scorer
from sklearn.preprocessing import StandardScaler

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [3]:
%%capture output
# Keras Importing
!pip install tensorflow
!pip install keras
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [4]:
%%capture output
# Importing for seeing the missing Data
!pip install missingno
import missingno as msno

## Does TabNet Do Better Than Keras

In [5]:
%%capture output
# Tabnet Importing
!pip install pytorch_tabnet
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetClassifier
import pytorch_tabnet
import torch

In [5]:
# Data Loading
testDf = pd.read_csv("WFTest.csv")
trainDf = pd.read_csv("WFTrain.csv")
trainDf.head()

Unnamed: 0,TRAN_AMT,ACCT_PRE_TRAN_AVAIL_BAL,CUST_AGE,OPEN_ACCT_CT,WF_dvc_age,PWD_UPDT_TS,CARR_NAME,RGN_NAME,STATE_PRVNC_TXT,ALERT_TRGR_CD,...,CUST_STATE,PH_NUM_UPDT_TS,CUST_SINCE_DT,TRAN_TS,TRAN_DT,ACTN_CD,ACTN_INTNL_TXT,TRAN_TYPE_CD,ACTVY_DT,FRAUD_NONFRAUD
0,5.38,23619.91,47,4,2777,1/16/2018 11:3:58,cox communications inc.,southwest,nevada,MOBL,...,NV,2/24/2021 15:55:10,1993-01-06 00:00:00,5/3/2021 18:3:58,5/3/2021,SCHPMT,P2P_COMMIT,P2P,5/3/2021,Non-Fraud
1,65.19,0.0,45,5,2721,,charter communications,southwest,california,MOBL,...,CA,,1971-01-07 00:00:00,1/13/2021 19:19:37,1/13/2021,SCHPMT,P2P_COMMIT,P2P,1/13/2021,Non-Fraud
2,54.84,34570.63,36,8,1531,12/22/2021 10:42:51,utah broadband llc,mountain,utah,ONLN,...,MD,5/5/2019 1:8:39,1994-02-01 00:00:00,4/8/2021 9:42:51,4/8/2021,SCHPMT,P2P_COMMIT,P2P,4/8/2021,Fraud
3,0.01,0.0,62,3,835,2/8/2020 7:28:31,t-mobile usa inc.,southwest,california,MOBL,...,NV,2/16/2019 6:45:37,2001-11-01 00:00:00,8/10/2021 15:28:31,8/10/2021,SCHPMT,P2P_COMMIT,P2P,8/10/2021,Non-Fraud
4,497.08,12725.18,81,2,1095,12/28/2020 12:12:44,cogent communications,south central,texas,MOBL,...,UT,5/8/2020 10:27:6,1987-02-07 00:00:00,6/27/2021 11:12:44,6/27/2021,SCHPMT,P2P_COMMIT,P2P,6/27/2021,Fraud


In [6]:
# Setting up the Features
numerical = ['TRAN_AMT', 'ACCT_PRE_TRAN_AVAIL_BAL','CUST_AGE',
             'OPEN_ACCT_CT', 'WF_dvc_age', 'CUST_ZIP']
categorical = ['CARR_NAME', 'RGN_NAME', 'STATE_PRVNC_TXT', 'ALERT_TRGR_CD',
                  'DVC_TYPE_TXT', 'AUTHC_PRIM_TYPE_CD', 'AUTHC_SCNDRY_STAT_TXT',
                  'CUST_STATE','ACTN_CD','ACTN_INTNL_TXT','TRAN_TYPE_CD']
X_cat = pd.get_dummies(trainDf[categorical])
X_num = trainDf[numerical]
X = pd.concat([X_num, X_cat], axis = 1)
X_mean_imputed = X.fillna(X.mean())
X_mean_imputed_numpy = X_mean_imputed.to_numpy()
X_mean_imputed_numpy

array([[5.380000e+00, 2.361991e+04, 4.700000e+01, ..., 1.000000e+00,
        1.000000e+00, 1.000000e+00],
       [6.519000e+01, 0.000000e+00, 4.500000e+01, ..., 1.000000e+00,
        1.000000e+00, 1.000000e+00],
       [5.484000e+01, 3.457063e+04, 3.600000e+01, ..., 1.000000e+00,
        1.000000e+00, 1.000000e+00],
       ...,
       [4.930000e+02, 2.848630e+03, 5.400000e+01, ..., 1.000000e+00,
        1.000000e+00, 1.000000e+00],
       [4.916400e+02, 3.163250e+03, 2.100000e+01, ..., 1.000000e+00,
        1.000000e+00, 1.000000e+00],
       [6.020000e+00, 0.000000e+00, 6.000000e+01, ..., 1.000000e+00,
        1.000000e+00, 1.000000e+00]])

In [7]:
# Getting the Y values
Y = trainDf["FRAUD_NONFRAUD"]
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
encoded_Y

array([1, 1, 0, ..., 0, 0, 1])

In [8]:
# Need a validation set for this, Keras can wrap tabnet and do cv but not sure needed
x_train, x_val, y_train, y_val = train_test_split(X_mean_imputed_numpy, encoded_Y, test_size=0.30, random_state=8)
x_train

array([[1.00000e-02, 0.00000e+00, 2.10000e+01, ..., 1.00000e+00,
        1.00000e+00, 1.00000e+00],
       [1.00000e-02, 1.01590e+03, 2.90000e+01, ..., 1.00000e+00,
        1.00000e+00, 1.00000e+00],
       [1.00000e-02, 0.00000e+00, 4.20000e+01, ..., 1.00000e+00,
        1.00000e+00, 1.00000e+00],
       ...,
       [4.62700e+01, 0.00000e+00, 4.90000e+01, ..., 1.00000e+00,
        1.00000e+00, 1.00000e+00],
       [8.26600e+01, 1.84563e+03, 3.70000e+01, ..., 1.00000e+00,
        1.00000e+00, 1.00000e+00],
       [8.70000e-01, 8.64068e+03, 7.40000e+01, ..., 1.00000e+00,
        1.00000e+00, 1.00000e+00]])

In [9]:
# TabNetPretrainer
unsupervised_model = TabNetPretrainer(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type='entmax', # "sparsemax",
    )

unsupervised_model.fit(
    x_train,
    eval_set=[x_val],
    max_epochs=500 , patience=50,
    batch_size=1024, virtual_batch_size=128,
    num_workers=0,
    drop_last=False,
    pretraining_ratio=0.8,
)

# reconstructed_X, embedded_X = unsupervised_model_no_preproc.predict(x_val)
# assert(reconstructed_X.shape==embedded_X.shape)

Device used : cpu
epoch 0  | loss: 149399290.26347| val_0_unsup_loss: 3893763.75|  0:00:06s
epoch 1  | loss: 1427506.32939| val_0_unsup_loss: 97364.42969|  0:00:11s
epoch 2  | loss: 178502.06379| val_0_unsup_loss: 493305.59375|  0:00:16s
epoch 3  | loss: 87639.77717| val_0_unsup_loss: 970.84985|  0:00:21s
epoch 4  | loss: 83132.60763| val_0_unsup_loss: 5895.03027|  0:00:26s
epoch 5  | loss: 68334.74405| val_0_unsup_loss: 1597.69641|  0:00:32s
epoch 6  | loss: 49499.34829| val_0_unsup_loss: 430.88806|  0:00:37s
epoch 7  | loss: 42426.19051| val_0_unsup_loss: 7846.76514|  0:00:42s
epoch 8  | loss: 17970.08185| val_0_unsup_loss: 65.60343|  0:00:47s
epoch 9  | loss: 29841.2788| val_0_unsup_loss: 2391.82202|  0:00:53s
epoch 10 | loss: 41208.95511| val_0_unsup_loss: 8147.67432|  0:00:58s
epoch 11 | loss: 6907.6766| val_0_unsup_loss: 278946.75|  0:01:04s
epoch 12 | loss: 4198.52499| val_0_unsup_loss: 10026666.0|  0:01:11s
epoch 13 | loss: 1830.67996| val_0_unsup_loss: 432816.6875|  0:01:17s
e

In [10]:
clf = TabNetClassifier(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params={"step_size":10, # how to use learning rate scheduler
                      "gamma":0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type='sparsemax' # This will be overwritten if using pretrain model
)

clf.fit(
    X_train= x_train, y_train=y_train,
    eval_set=[(x_train, y_train), (x_val, y_val)],
    eval_name=['train', 'valid'],
    eval_metric=['auc'],
    from_unsupervised=unsupervised_model,
    max_epochs=500 , patience=20,
    batch_size=1024, virtual_batch_size=128,
    num_workers=0,
    weights=1,
    drop_last=False
)

Device used : cpu
Loading weights from unsupervised pretraining




epoch 0  | loss: 0.70283 | train_auc: 0.57719 | valid_auc: 0.56602 |  0:00:08s
epoch 1  | loss: 0.68873 | train_auc: 0.61878 | valid_auc: 0.62465 |  0:00:16s
epoch 2  | loss: 0.6715  | train_auc: 0.68693 | valid_auc: 0.68856 |  0:00:28s
epoch 3  | loss: 0.62049 | train_auc: 0.82575 | valid_auc: 0.82119 |  0:00:39s
epoch 4  | loss: 0.55308 | train_auc: 0.85003 | valid_auc: 0.83134 |  0:00:48s
epoch 5  | loss: 0.50318 | train_auc: 0.87071 | valid_auc: 0.8463  |  0:00:57s
epoch 6  | loss: 0.46945 | train_auc: 0.89226 | valid_auc: 0.86954 |  0:01:07s
epoch 7  | loss: 0.44369 | train_auc: 0.90446 | valid_auc: 0.87891 |  0:01:17s
epoch 8  | loss: 0.41996 | train_auc: 0.9029  | valid_auc: 0.87476 |  0:01:26s
epoch 9  | loss: 0.40872 | train_auc: 0.92269 | valid_auc: 0.89487 |  0:01:37s
epoch 10 | loss: 0.34585 | train_auc: 0.92864 | valid_auc: 0.90396 |  0:01:47s
epoch 11 | loss: 0.31113 | train_auc: 0.93556 | valid_auc: 0.90695 |  0:01:56s
epoch 12 | loss: 0.31529 | train_auc: 0.94278 | vali

In [11]:
def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [18]:
# Predicting the Value
x_val_pred = clf.predict_proba(x_val)[:, 1]

In [33]:
# Getting it into a form that'll be gradeable
x_val_pred_label = x_val_pred > 0.25
x_val_pred_label = x_val_pred_label.astype(int)
x_val_pred_label_df = pd.DataFrame(x_val_pred_label)
y_val_df = pd.DataFrame(y_val)

In [34]:
f1s = f1_score(y_val_df, x_val_pred_label_df)
print("Baseline: %.2f%%" % (f1s*100))

Baseline: 91.51%


## Let's Try Feature Engineering with Tabnet


In [6]:
# Trying this out with Josh's Feature Engineering
testDf = pd.read_csv("WFTest.csv")
trainDf = pd.read_csv("WFTrain.csv")

to_datetime = ['PWD_UPDT_TS', 'PH_NUM_UPDT_TS', 'CUST_SINCE_DT','TRAN_TS',
               'TRAN_DT', 'ACTVY_DT']
for datetime in to_datetime:
  trainDf[datetime] = pd.to_datetime(trainDf[datetime], errors='coerce')

to_categorical = ['CARR_NAME', 'RGN_NAME', 'STATE_PRVNC_TXT', 'ALERT_TRGR_CD',
                  'DVC_TYPE_TXT', 'AUTHC_PRIM_TYPE_CD', 'AUTHC_SCNDRY_STAT_TXT',
                  'CUST_STATE','ACTN_CD','ACTN_INTNL_TXT','TRAN_TYPE_CD',
                  'FRAUD_NONFRAUD']

for category in to_categorical:
  trainDf[category] = trainDf[category].astype("category")

redundant = ['ACTN_CD', 'TRAN_TYPE_CD','ACTN_INTNL_TXT','ACTVY_DT']
trainDf.drop(columns = redundant, inplace=True)

trainDf.FRAUD_NONFRAUD = trainDf.FRAUD_NONFRAUD == 'Fraud'
trainDf['FRAUD_NONFRAUD'] = trainDf['FRAUD_NONFRAUD'].astype(int)

trainDf

Unnamed: 0,TRAN_AMT,ACCT_PRE_TRAN_AVAIL_BAL,CUST_AGE,OPEN_ACCT_CT,WF_dvc_age,PWD_UPDT_TS,CARR_NAME,RGN_NAME,STATE_PRVNC_TXT,ALERT_TRGR_CD,DVC_TYPE_TXT,AUTHC_PRIM_TYPE_CD,AUTHC_SCNDRY_STAT_TXT,CUST_ZIP,CUST_STATE,PH_NUM_UPDT_TS,CUST_SINCE_DT,TRAN_TS,TRAN_DT,FRAUD_NONFRAUD
0,5.38,23619.91,47,4,2777,2018-01-16 11:03:58,cox communications inc.,southwest,nevada,MOBL,,UN_PWD,ALLOW,89002,NV,2021-02-24 15:55:10,1993-01-06,2021-05-03 18:03:58,2021-05-03,0
1,65.19,0.00,45,5,2721,NaT,charter communications,southwest,california,MOBL,,FACE_ID,ALLOW,94541,CA,NaT,1971-01-07,2021-01-13 19:19:37,2021-01-13,0
2,54.84,34570.63,36,8,1531,2021-12-22 10:42:51,utah broadband llc,mountain,utah,ONLN,DESKTOP,UN_PWD,ALLOW,21811,MD,2019-05-05 01:08:39,1994-02-01,2021-04-08 09:42:51,2021-04-08,1
3,0.01,0.00,62,3,835,2020-02-08 07:28:31,t-mobile usa inc.,southwest,california,MOBL,MOBILE,UN_PWD,ALLOW,89822,NV,2019-02-16 06:45:37,2001-11-01,2021-08-10 15:28:31,2021-08-10,0
4,497.08,12725.18,81,2,1095,2020-12-28 12:12:44,cogent communications,south central,texas,MOBL,MOBILE,UN_PWD,CHALLENGE_SUCCESS,84108,UT,2020-05-08 10:27:06,1987-02-07,2021-06-27 11:12:44,2021-06-27,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13995,1937.21,230.75,55,4,142,NaT,cellco partnership dba verizon wireless,southwest,california,MOBL,MOBILE,UN_PWD,ALLOW,92503,CA,2017-07-15 06:58:59,2001-06-05,2021-03-12 12:11:59,2021-03-12,0
13996,114.38,0.00,44,10,272,2017-11-02 04:28:20,t-mobile usa inc.,southwest,california,MOBL,MOBILE,FACE_ID,ALLOW,80478,CO,NaT,2010-06-03,2021-06-11 09:28:20,2021-06-11,0
13997,493.00,2848.63,54,3,517,2021-06-03 19:31:15,att services inc,southwest,california,MOBL,DESKTOP,UN_PWD,ALLOW,33579,FL,2021-05-25 08:50:05,1984-10-27,2021-05-16 12:31:15,2021-05-16,1
13998,491.64,3163.25,21,3,0,2020-03-02 11:34:54,,,,ONLN,DESKTOP,UN_PWD,ALLOW,91702,CA,NaT,2021-03-01,2021-05-11 12:34:54,2021-05-11,1


In [7]:
#Similar data cleaning process as above
DfProc = trainDf

carrMap = {
    'cox communications inc.' : 'cox',
    't-mobile usa  inc.' : 'tmobile',
    'charter communications inc' : 'charter',
    'comcast' : 'comcast',
    'comcast cable communications  llc' : 'comcast',
    'centurylink communications  llc' : 'century',
    'frontier communications of america  inc.' : 'frontier',
    'att services inc' : 'att',
    'charter communications' : 'charter',
    'at&t mobility llc ' : 'att',
    'cellco partnership dba verizon wireless' : 'verizon',
}

regionSet = { 'southwest', 'south central', 'southeast', 'mountain',
             'northeast', 'great lakes', 'mid atlantic', 'pacific northwest',
             'midwest'}
regionMap = {x:x for x in regionSet}
    

DfProc['CARR_NAME'] = DfProc['CARR_NAME'].map(carrMap).fillna("other")
DfProc['RGN_NAME'] = DfProc['RGN_NAME'].map(regionMap).fillna("other")

#ADDITIONAL FEATURE ENGINEERING - - - - - - - - - - - - 

#Normalize date features against transaction date
# How old was the account when it made the transaction
DfProc['DAY_ACC_AGE'] = (
    DfProc['TRAN_TS'] - DfProc['CUST_SINCE_DT']).dt.days
# How long was it been since the phone number was updated since the transaction
DfProc['DAY_FRM_NUM_UPDT'] = (
    DfProc['TRAN_TS'] - DfProc['PH_NUM_UPDT_TS']).dt.days
# How long was it been since the password was updated since the transaction
DfProc['DAY_FRM_PWD_UPDT'] = (
    DfProc['TRAN_TS'] - DfProc['PWD_UPDT_TS']).dt.days

# Cleaning "region" column to match entries in state column.
# States were mapped to their abbreviations, if state outside US its mapped to
# "INT" for international
stateDict = {'nevada' : 'NV', 'california': 'CA', 'utah': 'UT', 'texas': 'TX','arizona': 'AZ', 'wisconsin': 'WI', 'minnesota': 'MN', 'phnum penh' : 'INT','alabama': 'AL', 'florida': 'FL', 'nebraska': 'NE', 'south dakota': 'SD',
 'punjab': 'INT', 'north carolina': 'NC', 'new york': 'NY', 'michigan': 'MI','colorado': 'CO', 'massachusetts': 'MA', 'antioquia': 'INT', 'washington': 'WA','arkansas': 'AR', 'new jersey': 'NJ', 'kentucky': 'KY', 'ostergotlands lan': 'INT',
 'tennessee': 'TN', 'district of columbia': 'DC', 'georgia': 'GA', 'maryland': 'MD','oregon': 'OR', 'wyoming': 'WY', 'oklahoma': 'OK', 'illinois': 'IL','north dakota': 'ND', 'indiana': 'IN', 'pennsylvania': 'PA', 'distrito nacional': 'INT',
 'distrito capital': 'INT', 'iowa': 'IA', 'zuerich': 'INT', 'hamerkaz': 'INT','sonora': 'INT', 'madrid': 'INT', 'new mexico': 'NM', 'new south wales' : 'INT','loire-atlantique' : 'INT', 'carabobo' : 'INT', 'montana' : 'MT', 'idaho' : 'ID',
 'hong kong' : 'INT', 'ohio' : 'OH', 'south carolina': 'SC', 'missouri': 'MS', 'colima': 'INT', 'baja california': 'INT', 'noord-brabant': 'INT', 'nairobi area': 'INT', 'baden-wuerttemberg': 'INT', 'virginia' : 'VA','alaska': 'AK', 'hawaii': 'HI', 'kansas': 'KS', 'greater accra': 'INT', 'kingston': 'INT', 'connecticut' : 'CT', 'louisiana': 'LA', 'bolivar': 'INT',
 'lagos': 'INT', 'gujarat': 'INT', 'zulia': 'INT', 'morelos': 'INT', 'jalisco': 'INT', 'san salvador': 'INT', 'west bengal': 'INT', 'guerrero': 'INT', 'distrito federal': 'INT',
 'mississippi': 'MS', "saint george's": 'INT', 'hampshire': 'NH', 'paris': 'INT','mazowieckie': 'INT', 'region metropolitana': 'INT', 'ha noi': 'INT', 'lara': 'INT','maine': 'ME', 'seoul teukbyeolsi': 'INT', 'telangana': 'INT', 'victoria': 'INT',
 'kinshasa': 'INT', 'aguascalientes': 'INT', 'western australia': 'INT','andhra pradesh': 'INT', 'sao paulo': 'INT', 'nueva esparta': 'INT','dubayy': 'INT', 'chihuahua': 'INT', 'rhode island': 'ri', 'istanbul': 'INT','guatemala': 'INT', 'gauteng': 'INT', 'michoacan de ocampo': 'INT', "ra's al khaymah": 'INT',
 'sodermanlands lan': 'INT', 'da nang': 'INT', 'taipei': 'INT','sindh': 'INT','tamaulipas': 'INT','sinaloa': 'INT','liverpool': 'INT','western cape': 'INT', 'aragua': 'INT', 'british columbia': 'INT', 'guanacaste': 'INT','`amman': 'INT',
 'hessen': 'INT','ontario': 'INT','delaware': 'DE', 'dublin': 'INT', 'south west': 'INT', 'west virginia': 'WV', 'south australia': 'INT', 'delhi': 'INT', 'pichincha': 'INT', 'new providence': 'INT', 'tokyo': 'INT', 'nordrhein-westfalen' : 'INT'}

# Use statedict to create column to describe where transaction originated from
DfProc['TXT_STATE'] = DfProc['STATE_PRVNC_TXT'].map(stateDict).fillna("None")

#Function to apply to column of transaction location and customer location 
#To compare if the two match
def locationCompare(txtLoc, custLoc):
  if txtLoc != custLoc:
    if txtLoc == 'INT':
      return 'INT'
    else:
      return 'MISMATCH'
  return 'MATCH'

#Apply functino above to TXT state and CUST state column
DfProc['TXT_CASE'] = DfProc.apply(
    lambda x: locationCompare(x['TXT_STATE'], x['CUST_STATE']), axis=1)


## This didn't really help Josh so let's skip it
# #Read in external dataframe with data for each zip code
# zipInfoDf = pd.read_csv('zip_code_rural.csv')
# #Get population number (zpop) and population density (lzden )for each zip code
# zipInfoDf = zipInfoDf[['zip', 'zpop', 'lzden']]
# #Add this information to df
# DfProc = DfProc.merge(zipInfoDf, how='left', left_on='CUST_ZIP', right_on='zip')


#Similar process to above, we end up keeping the generated features
#And removing a lot of the really detailed categorical variables
categorical = ['CARR_NAME', 'RGN_NAME', 'STATE_PRVNC_TXT', 'ALERT_TRGR_CD',
                'DVC_TYPE_TXT', 'AUTHC_PRIM_TYPE_CD', 'AUTHC_SCNDRY_STAT_TXT',
                'CUST_STATE', 'TXT_CASE']

remove = ['PWD_UPDT_TS', 'PH_NUM_UPDT_TS', 'CUST_SINCE_DT','TRAN_TS','TRAN_DT',
          'CUST_STATE', 'STATE_PRVNC_TXT', 'TXT_STATE', 'CUST_ZIP', 'zip']


categoricalDummies = [x for x in categorical if x not in remove]

for var in categoricalDummies:
    cat_list = pd.get_dummies(DfProc[var], prefix=var)
    DfProc=DfProc.join(cat_list)
data_vars=DfProc.columns.values.tolist()
to_keep=[i for i in data_vars if i not in categorical and i not in remove]
DfProc=DfProc[to_keep]

DfProc

Unnamed: 0,TRAN_AMT,ACCT_PRE_TRAN_AVAIL_BAL,CUST_AGE,OPEN_ACCT_CT,WF_dvc_age,FRAUD_NONFRAUD,DAY_ACC_AGE,DAY_FRM_NUM_UPDT,DAY_FRM_PWD_UPDT,CARR_NAME_att,...,AUTHC_PRIM_TYPE_CD_AFA_PL,AUTHC_PRIM_TYPE_CD_FACE_ID,AUTHC_PRIM_TYPE_CD_TOUCH_ID,AUTHC_PRIM_TYPE_CD_UN_PWD,AUTHC_SCNDRY_STAT_TXT_ALLOW,AUTHC_SCNDRY_STAT_TXT_CHALLENGE_ISSUED,AUTHC_SCNDRY_STAT_TXT_CHALLENGE_SUCCESS,TXT_CASE_INT,TXT_CASE_MATCH,TXT_CASE_MISMATCH
0,5.38,23619.91,47,4,2777,0,10344,68.0,1203.0,0,...,0,0,0,1,1,0,0,0,1,0
1,65.19,0.00,45,5,2721,0,18269,,,0,...,0,1,0,0,1,0,0,0,1,0
2,54.84,34570.63,36,8,1531,1,9928,704.0,-259.0,0,...,0,0,0,1,1,0,0,0,0,1
3,0.01,0.00,62,3,835,0,7222,906.0,549.0,0,...,0,0,0,1,1,0,0,0,0,1
4,497.08,12725.18,81,2,1095,1,12559,415.0,180.0,0,...,0,0,0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13995,1937.21,230.75,55,4,142,0,7220,1336.0,,0,...,0,0,0,1,1,0,0,0,1,0
13996,114.38,0.00,44,10,272,0,4026,,1317.0,0,...,0,1,0,0,1,0,0,0,0,1
13997,493.00,2848.63,54,3,517,1,13350,-9.0,-19.0,1,...,0,0,0,1,1,0,0,0,0,1
13998,491.64,3163.25,21,3,0,1,71,,435.0,0,...,0,0,0,1,1,0,0,0,0,1


In [14]:
X = DfProc.loc[:, DfProc.columns != 'FRAUD_NONFRAUD'].astype(float)
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X))
X_mean_imputed = X.fillna(X.mean())
X_mean_imputed_numpy = X_mean_imputed.to_numpy()


Y = DfProc['FRAUD_NONFRAUD']
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)

X_mean_imputed_numpy

array([[-0.84936742,  0.44672723, -0.36842923, ..., -0.13290498,
         2.23798653, -2.10739634],
       [-0.66121524, -0.34148634, -0.47464743, ..., -0.13290498,
         2.23798653, -2.10739634],
       [-0.69377459,  0.81216071, -0.95262935, ..., -0.13290498,
        -0.44683021,  0.47451919],
       ...,
       [ 0.68460294, -0.24642549,  0.00333449, ..., -0.13290498,
        -0.44683021,  0.47451919],
       [ 0.68032461, -0.23592639, -1.74926589, ..., -0.13290498,
        -0.44683021,  0.47451919],
       [-0.84735409, -0.34148634,  0.32198911, ..., -0.13290498,
        -0.44683021,  0.47451919]])

In [15]:
# Need a validation set for this, Keras can wrap tabnet and do cv but not sure needed
x_train, x_val, y_train, y_val = train_test_split(X_mean_imputed_numpy, encoded_Y, test_size=0.30, random_state=8)
x_train

array([[-0.86626054, -0.34148634, -1.74926589, ..., -0.13290498,
         2.23798653, -2.10739634],
       [-0.86626054, -0.30758502, -1.32439307, ..., -0.13290498,
         2.23798653, -2.10739634],
       [-0.86626054, -0.34148634, -0.63397474, ..., -0.13290498,
        -0.44683021,  0.47451919],
       ...,
       [-0.72073437, -0.34148634, -0.26221102, ..., -0.13290498,
        -0.44683021,  0.47451919],
       [-0.60625756, -0.27989632, -0.89952025, ..., -0.13290498,
        -0.44683021,  0.47451919],
       [-0.86355512, -0.05314057,  1.06551654, ..., -0.13290498,
         2.23798653, -2.10739634]])

In [17]:
# TabNetPretrainer
unsupervised_model = TabNetPretrainer(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type='entmax', # "sparsemax",
    )

unsupervised_model.fit(
    x_train,
    eval_set=[x_val],
    max_epochs=500 , patience=50,
    batch_size=1024, virtual_batch_size=128,
    num_workers=0,
    drop_last=False,
    pretraining_ratio=0.8,
)

# reconstructed_X, embedded_X = unsupervised_model_no_preproc.predict(x_val)
# assert(reconstructed_X.shape==embedded_X.shape)

Device used : cpu
epoch 0  | loss: 1.63437 | val_0_unsup_loss: 1.06263 |  0:00:01s
epoch 1  | loss: 1.06488 | val_0_unsup_loss: 1.01474 |  0:00:02s
epoch 2  | loss: 1.01182 | val_0_unsup_loss: 1.00483 |  0:00:03s
epoch 3  | loss: 1.00029 | val_0_unsup_loss: 1.00674 |  0:00:04s
epoch 4  | loss: 0.99748 | val_0_unsup_loss: 1.00457 |  0:00:05s
epoch 5  | loss: 1.00274 | val_0_unsup_loss: 1.00482 |  0:00:06s
epoch 6  | loss: 0.99903 | val_0_unsup_loss: 1.00418 |  0:00:07s
epoch 7  | loss: 0.99726 | val_0_unsup_loss: 1.00018 |  0:00:08s
epoch 8  | loss: 1.00386 | val_0_unsup_loss: 0.99921 |  0:00:09s
epoch 9  | loss: 0.99461 | val_0_unsup_loss: 0.99196 |  0:00:10s
epoch 10 | loss: 1.00099 | val_0_unsup_loss: 0.98488 |  0:00:11s
epoch 11 | loss: 0.98934 | val_0_unsup_loss: 0.97643 |  0:00:12s
epoch 12 | loss: 0.99411 | val_0_unsup_loss: 0.9732  |  0:00:13s
epoch 13 | loss: 0.99425 | val_0_unsup_loss: 0.96968 |  0:00:14s
epoch 14 | loss: 0.98987 | val_0_unsup_loss: 0.96625 |  0:00:15s
epoch 1

epoch 126| loss: 0.95096 | val_0_unsup_loss: 0.87397 |  0:02:06s
epoch 127| loss: 0.94914 | val_0_unsup_loss: 0.87337 |  0:02:07s
epoch 128| loss: 0.95206 | val_0_unsup_loss: 0.87053 |  0:02:08s
epoch 129| loss: 0.95128 | val_0_unsup_loss: 0.87246 |  0:02:09s
epoch 130| loss: 0.95886 | val_0_unsup_loss: 0.87574 |  0:02:10s
epoch 131| loss: 0.9508  | val_0_unsup_loss: 0.87795 |  0:02:11s
epoch 132| loss: 0.95847 | val_0_unsup_loss: 0.87887 |  0:02:12s
epoch 133| loss: 0.95023 | val_0_unsup_loss: 0.87973 |  0:02:13s
epoch 134| loss: 888.01164| val_0_unsup_loss: 0.89124 |  0:02:14s
epoch 135| loss: 8071.57349| val_0_unsup_loss: 0.89848 |  0:02:15s
epoch 136| loss: 0.96304 | val_0_unsup_loss: 0.88942 |  0:02:16s
epoch 137| loss: 0.95876 | val_0_unsup_loss: 0.88175 |  0:02:17s
epoch 138| loss: 0.95981 | val_0_unsup_loss: 0.87952 |  0:02:18s
epoch 139| loss: 0.96669 | val_0_unsup_loss: 0.87957 |  0:02:19s
epoch 140| loss: 0.9612  | val_0_unsup_loss: 0.87647 |  0:02:20s
epoch 141| loss: 0.957

In [18]:
clf = TabNetClassifier(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params={"step_size":10, # how to use learning rate scheduler
                      "gamma":0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type='sparsemax' # This will be overwritten if using pretrain model
)

clf.fit(
    X_train= x_train, y_train=y_train,
    eval_set=[(x_train, y_train), (x_val, y_val)],
    eval_name=['train', 'valid'],
    eval_metric=['auc'],
    from_unsupervised=unsupervised_model,
    max_epochs=500 , patience=20,
    batch_size=1024, virtual_batch_size=128,
    num_workers=0,
    weights=1,
    drop_last=False
)

Device used : cpu
Loading weights from unsupervised pretraining




epoch 0  | loss: 0.65099 | train_auc: 0.82551 | valid_auc: 0.81725 |  0:00:01s
epoch 1  | loss: 0.50575 | train_auc: 0.86396 | valid_auc: 0.85007 |  0:00:02s
epoch 2  | loss: 0.40237 | train_auc: 0.91692 | valid_auc: 0.90864 |  0:00:04s
epoch 3  | loss: 0.3359  | train_auc: 0.93406 | valid_auc: 0.92654 |  0:00:05s
epoch 4  | loss: 0.32699 | train_auc: 0.94013 | valid_auc: 0.93167 |  0:00:07s
epoch 5  | loss: 0.3181  | train_auc: 0.94554 | valid_auc: 0.93809 |  0:00:08s
epoch 6  | loss: 0.28882 | train_auc: 0.9481  | valid_auc: 0.94088 |  0:00:09s
epoch 7  | loss: 0.28351 | train_auc: 0.95196 | valid_auc: 0.94485 |  0:00:11s
epoch 8  | loss: 0.27955 | train_auc: 0.95333 | valid_auc: 0.94312 |  0:00:12s
epoch 9  | loss: 0.27884 | train_auc: 0.95398 | valid_auc: 0.94602 |  0:00:13s
epoch 10 | loss: 0.28156 | train_auc: 0.95532 | valid_auc: 0.94706 |  0:00:15s
epoch 11 | loss: 0.27916 | train_auc: 0.95546 | valid_auc: 0.949   |  0:00:16s
epoch 12 | loss: 0.27528 | train_auc: 0.95913 | vali

In [19]:
# Predicting the Value
x_val_pred = clf.predict_proba(x_val)[:, 1]

Baseline: 84.02%


In [29]:
# Getting it into a form that'll be gradeable
x_val_pred_label = x_val_pred > 0.5
x_val_pred_label = x_val_pred_label.astype(int)
x_val_pred_label_df = pd.DataFrame(x_val_pred_label)
y_val_df = pd.DataFrame(y_val)

f1s = f1_score(y_val_df, x_val_pred_label_df)
print("Baseline: %.2f%%" % (f1s*100))

Baseline: 87.60%
