# IEEE Fraud Detection Using Catboost
The below model is based on catboost

In [1]:
import pandas as pd
import numpy as np

from catboost import CatBoostClassifier, Pool, cv
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score, f1_score, matthews_corrcoef
import seaborn as sns

import datetime, random

import hyperopt
from numpy.random import RandomState

In [2]:
pd.options.display.max_columns = None
!jupyter nbextension enable --py widgetsnbextension
DATA_DIR='../data/raw'

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [3]:
def gen_seeds(seed=0):
    '''
    Ensure seeds are set on anything that needs it. 
    Passing in the same value here would make the process deterministic
    '''
    random.seed(seed)
    np.random.seed(seed)

In [4]:
def get_x_y(df):
    df = df.replace(np.nan, '', regex=True)
    X = df.drop('isFraud', axis=1)
    y = df.isFraud    
    return X, y

# Data

In [5]:
train_identity = pd.read_csv(DATA_DIR + '/train_identity.csv')
train_transaction = pd.read_csv(DATA_DIR + '/train_transaction.csv')
test_identity = pd.read_csv(DATA_DIR + '/test_identity.csv')
test_transaction = pd.read_csv(DATA_DIR + '/test_transaction.csv')

In [6]:
train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')

# Feature Engineering

In [7]:
START_DATE = datetime.datetime.strptime('2017-11-30', '%Y-%m-%d')

In [8]:
emails = {'gmail': 'google', 'att.net': 'att', 'twc.com': 'spectrum', 'scranton.edu': 'other', 'optonline.net': 'other', 'hotmail.co.uk': 'microsoft', 'comcast.net': 'other', 'yahoo.com.mx': 'yahoo', 'yahoo.fr': 'yahoo', 'yahoo.es': 'yahoo', 'charter.net': 'spectrum', 'live.com': 'microsoft', 'aim.com': 'aol', 'hotmail.de': 'microsoft', 'centurylink.net': 'centurylink', 'gmail.com': 'google', 'me.com': 'apple', 'earthlink.net': 'other', 'gmx.de': 'other', 'web.de': 'other', 'cfl.rr.com': 'other', 'hotmail.com': 'microsoft', 'protonmail.com': 'other', 'hotmail.fr': 'microsoft', 'windstream.net': 'other', 'outlook.es': 'microsoft', 'yahoo.co.jp': 'yahoo', 'yahoo.de': 'yahoo', 'servicios-ta.com': 'other', 'netzero.net': 'other', 'suddenlink.net': 'other', 'roadrunner.com': 'other', 'sc.rr.com': 'other', 'live.fr': 'microsoft', 'verizon.net': 'yahoo', 'msn.com': 'microsoft', 'q.com': 'centurylink', 'prodigy.net.mx': 'att', 'frontier.com': 'yahoo', 'anonymous.com': 'other', 'rocketmail.com': 'yahoo', 'sbcglobal.net': 'att', 'frontiernet.net': 'yahoo', 'ymail.com': 'yahoo', 'outlook.com': 'microsoft', 'mail.com': 'other', 'bellsouth.net': 'other', 'embarqmail.com': 'centurylink', 'cableone.net': 'other', 'hotmail.es': 'microsoft', 'mac.com': 'apple', 'yahoo.co.uk': 'yahoo', 'netzero.com': 'other', 'yahoo.com': 'yahoo', 'live.com.mx': 'microsoft', 'ptd.net': 'other', 'cox.net': 'other', 'aol.com': 'aol', 'juno.com': 'other', 'icloud.com': 'apple'}
us_emails = ['gmail', 'net', 'edu']

In [9]:
def add_means(df, in_col, col_to_aggregate='TransactionAmt'):
    df[col_to_aggregate+'_to_mean_'+in_col] = df[col_to_aggregate] - df.groupby([in_col])[col_to_aggregate].transform('mean')
    df[col_to_aggregate+'_to_std_'+in_col] = df[col_to_aggregate+'_to_mean_'+in_col] / df.groupby([in_col])[col_to_aggregate].transform('std')


In [10]:
def engineer_features(df, engineer_identity_features=True):
    
    # First some date fields relative to an estimated start date
    df['Date'] = df['TransactionDT'].apply(lambda x: (START_DATE + datetime.timedelta(seconds = x)))
    df['DT_M'] = (df['Date'].dt.year-2017)*12 + df['Date'].dt.month 
    df['DT_W'] = df['Date'].dt.dayofweek
    df['DT_H'] = df['Date'].dt.hour
    df['DT_D'] = df['Date'].dt.day
    
    # Log transform the TX amount so its normally distributed. Oddly keeping both cols gives better performance..
    df['TransactionAmt'] = np.log(df['TransactionAmt'])
    
    # Bin the emails
    for c in ['P_emaildomain', 'R_emaildomain']:
        df[c + '_bin'] = df[c].map(emails)
        df[c + '_suffix'] = df[c].map(lambda x: str(x).split('.')[-1])
        df[c + '_suffix'] = df[c + '_suffix'].map(lambda x: x if str(x) not in us_emails else 'us')
    
    add_means(df, 'card1')
    add_means(df, 'card2')
    add_means(df, 'card3')
    add_means(df, 'card4')
    
    add_means(df, 'card1', col_to_aggregate='id_02')
    add_means(df, 'card4', col_to_aggregate='id_02')
    
    add_means(df, 'card1', col_to_aggregate='D15')
    add_means(df, 'card4', col_to_aggregate='D15')
    
    add_means(df, 'addr1', col_to_aggregate='D15')
    add_means(df, 'addr2', col_to_aggregate='D15')
    
    # Drop some columns. These came from EDA analysis
    useful_features = ['TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'dist1',
                   'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13',
                   'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'M2', 'M3',
                   'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V17',
                   'V19', 'V20', 'V29', 'V30', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V40', 'V44', 'V45', 'V46', 'V47', 'V48',
                   'V49', 'V51', 'V52', 'V53', 'V54', 'V56', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V69', 'V70', 'V71',
                   'V72', 'V73', 'V74', 'V75', 'V76', 'V78', 'V80', 'V81', 'V82', 'V83', 'V84', 'V85', 'V87', 'V90', 'V91', 'V92',
                   'V93', 'V94', 'V95', 'V96', 'V97', 'V99', 'V100', 'V126', 'V127', 'V128', 'V130', 'V131', 'V138', 'V139', 'V140',
                   'V143', 'V145', 'V146', 'V147', 'V149', 'V150', 'V151', 'V152', 'V154', 'V156', 'V158', 'V159', 'V160', 'V161',
                   'V162', 'V163', 'V164', 'V165', 'V166', 'V167', 'V169', 'V170', 'V171', 'V172', 'V173', 'V175', 'V176', 'V177',
                   'V178', 'V180', 'V182', 'V184', 'V187', 'V188', 'V189', 'V195', 'V197', 'V200', 'V201', 'V202', 'V203', 'V204',
                   'V205', 'V206', 'V207', 'V208', 'V209', 'V210', 'V212', 'V213', 'V214', 'V215', 'V216', 'V217', 'V219', 'V220',
                   'V221', 'V222', 'V223', 'V224', 'V225', 'V226', 'V227', 'V228', 'V229', 'V231', 'V233', 'V234', 'V238', 'V239',
                   'V242', 'V243', 'V244', 'V245', 'V246', 'V247', 'V249', 'V251', 'V253', 'V256', 'V257', 'V258', 'V259', 'V261',
                   'V262', 'V263', 'V264', 'V265', 'V266', 'V267', 'V268', 'V270', 'V271', 'V272', 'V273', 'V274', 'V275', 'V276',
                   'V277', 'V278', 'V279', 'V280', 'V282', 'V283', 'V285', 'V287', 'V288', 'V289', 'V291', 'V292', 'V294', 'V303',
                   'V304', 'V306', 'V307', 'V308', 'V310', 'V312', 'V313', 'V314', 'V315', 'V317', 'V322', 'V323', 'V324', 'V326',
                   'V329', 'V331', 'V332', 'V333', 'V335', 'V336', 'V338', 'id_01', 'id_02', 'id_03', 'id_05', 'id_06', 'id_09',
                   'id_11', 'id_12', 'id_13', 'id_14', 'id_15', 'id_17', 'id_19', 'id_20', 'id_30', 'id_31', 'id_32', 'id_33',
                   'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo']
    cols_to_drop = [col for col in train.columns if col not in useful_features]
    cols_to_drop.remove('isFraud')
    cols_to_drop.remove('TransactionID')
    df.drop(cols_to_drop, axis=1)
    #df.drop('Date',axis=1, inplace=True)
    
    if engineer_identity_features:
        for x in range(1, 12):
            df['id_'+str(x).zfill(2)] = np.log(df['id_'+str(x).zfill(2)])
        add_means(df, 'card1', col_to_aggregate='id_02')
        add_means(df, 'card4', col_to_aggregate='id_02')
        
    
    

In [11]:
engineer_features(train)
engineer_features(test)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [12]:
train.shape

(590540, 463)

In [13]:
train.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15,M1,M2,M3,M4,M5,M6,M7,M8,M9,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30,V31,V32,V33,V34,V35,V36,V37,V38,V39,V40,V41,V42,V43,V44,V45,V46,V47,V48,V49,V50,V51,V52,V53,V54,V55,V56,V57,V58,V59,V60,V61,V62,V63,V64,V65,V66,V67,V68,V69,V70,V71,V72,V73,V74,V75,V76,V77,V78,V79,V80,V81,V82,V83,V84,V85,V86,V87,V88,V89,V90,V91,V92,V93,V94,V95,V96,V97,V98,V99,V100,V101,V102,V103,V104,V105,V106,V107,V108,V109,V110,V111,V112,V113,V114,V115,V116,V117,V118,V119,V120,V121,V122,V123,V124,V125,V126,V127,V128,V129,V130,V131,V132,V133,V134,V135,V136,V137,V138,V139,V140,V141,V142,V143,V144,V145,V146,V147,V148,V149,V150,V151,V152,V153,V154,V155,V156,V157,V158,V159,V160,V161,V162,V163,V164,V165,V166,V167,V168,V169,V170,V171,V172,V173,V174,V175,V176,V177,V178,V179,V180,V181,V182,V183,V184,V185,V186,V187,V188,V189,V190,V191,V192,V193,V194,V195,V196,V197,V198,V199,V200,V201,V202,V203,V204,V205,V206,V207,V208,V209,V210,V211,V212,V213,V214,V215,V216,V217,V218,V219,V220,V221,V222,V223,V224,V225,V226,V227,V228,V229,V230,V231,V232,V233,V234,V235,V236,V237,V238,V239,V240,V241,V242,V243,V244,V245,V246,V247,V248,V249,V250,V251,V252,V253,V254,V255,V256,V257,V258,V259,V260,V261,V262,V263,V264,V265,V266,V267,V268,V269,V270,V271,V272,V273,V274,V275,V276,V277,V278,V279,V280,V281,V282,V283,V284,V285,V286,V287,V288,V289,V290,V291,V292,V293,V294,V295,V296,V297,V298,V299,V300,V301,V302,V303,V304,V305,V306,V307,V308,V309,V310,V311,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321,V322,V323,V324,V325,V326,V327,V328,V329,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,id_10,id_11,id_12,id_13,id_14,id_15,id_16,id_17,id_18,id_19,id_20,id_21,id_22,id_23,id_24,id_25,id_26,id_27,id_28,id_29,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,Date,DT_M,DT_W,DT_H,DT_D,P_emaildomain_bin,P_emaildomain_suffix,R_emaildomain_bin,R_emaildomain_suffix,TransactionAmt_to_mean_card1,TransactionAmt_to_std_card1,TransactionAmt_to_mean_card2,TransactionAmt_to_std_card2,TransactionAmt_to_mean_card3,TransactionAmt_to_std_card3,TransactionAmt_to_mean_card4,TransactionAmt_to_std_card4,id_02_to_mean_card1,id_02_to_std_card1,id_02_to_mean_card4,id_02_to_std_card4,D15_to_mean_card1,D15_to_std_card1,D15_to_mean_card4,D15_to_std_card4,D15_to_mean_addr1,D15_to_std_addr1,D15_to_mean_addr2,D15_to_std_addr2
0,2987000,0,86400,4.226834,W,13926,,150.0,discover,142.0,credit,315.0,87.0,19.0,,,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,,13.0,,,,,,,13.0,13.0,,,,0.0,T,T,T,M2,F,T,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2017-12-01 00:00:00,12,4,0,1,,,,,-1.107083,-1.051267,,,-0.253728,-0.281529,-0.695417,-0.628262,,,,,-82.441176,-0.451466,-135.030113,-0.705721,-207.477318,-0.957673,-182.991449,-0.884313
1,2987001,0,86401,3.367296,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,87.0,,,gmail.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,0.0,,,,,0.0,,,,M0,T,T,,,,,,,,,,,,,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2017-12-01 00:00:01,12,4,0,1,google,com,,,-1.46424,-1.497651,-1.474868,-1.431261,-1.113266,-1.235247,-0.95698,-0.987085,,,,,-143.879538,-0.721863,-154.154356,-0.775482,-207.555433,-0.969125,-182.991449,-0.884313
2,2987002,0,86469,4.077537,W,4663,490.0,150.0,visa,166.0,debit,330.0,87.0,287.0,,outlook.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,0.0,315.0,,,,315.0,T,T,T,M0,F,F,F,F,F,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2017-12-01 00:01:09,12,4,0,1,microsoft,com,,,-0.16926,-0.223029,-0.364999,-0.428238,-0.403024,-0.447184,-0.285551,-0.303643,,,,,189.929658,1.116073,146.182045,0.71534,119.53295,0.564071,132.008551,0.637936
3,2987003,0,86499,3.912023,W,18132,567.0,150.0,mastercard,117.0,debit,476.0,87.0,,,yahoo.com,,2.0,5.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,25.0,1.0,112.0,112.0,0.0,94.0,0.0,,,,,84.0,,,,,111.0,,,,M0,T,F,,,,,,,,,,,,,,,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,48.0,28.0,0.0,10.0,4.0,1.0,38.0,24.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,50.0,1758.0,925.0,0.0,354.0,135.0,50.0,1404.0,790.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,28.0,0.0,0.0,0.0,0.0,10.0,0.0,4.0,0.0,0.0,1.0,1.0,1.0,1.0,38.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,50.0,1758.0,925.0,0.0,354.0,0.0,135.0,0.0,0.0,0.0,50.0,1404.0,790.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2017-12-01 00:01:39,12,4,0,1,yahoo,com,,,-0.452859,-0.53718,-0.516766,-0.598398,-0.568538,-0.630834,-0.412253,-0.425222,,,,,-90.718321,-0.425421,-43.154356,-0.217091,-50.767764,-0.263735,-71.991449,-0.347901
4,2987004,0,86506,3.912023,H,4497,514.0,150.0,mastercard,102.0,credit,420.0,87.0,,,gmail.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,18.0,140.0,0.0,0.0,0.0,0.0,1803.0,49.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,15557.990234,169690.796875,0.0,0.0,0.0,515.0,5155.0,2840.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-inf,11.167431,,,,,,,,,4.60517,NotFound,,-480.0,New,NotFound,166.0,,542.0,144.0,,,,,,,,New,NotFound,Android 7.0,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M,2017-12-01 00:01:46,12,4,0,1,google,com,,,-0.44841,-0.612692,-0.87546,-0.859643,-0.568538,-0.630834,-0.412253,-0.425222,-0.199862,-0.473017,-0.452493,-0.330181,,,,,,,,


In [14]:
X_test = test.replace(np.nan, '', regex=True)
X, y = get_x_y(train)

In [15]:
cols_to_find = [ 
# TX features
 'ProductCD',
 'card1',
 'card2',
 'card3',
 'card4',
 'card5',
 'card6',
 'addr1',
 'addr2',
 'P_emaildomain',
 'R_emaildomain',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
# Identity features
  'DeviceType',
  'DeviceInfo',
  'id_12',  'id_13',  'id_14', 'id_15',  'id_16',  'id_17',  'id_18',  'id_19',
  'id_20',  'id_21',  'id_22',  'id_23',  'id_24',  'id_25',  'id_26',  'id_27',  'id_28',  'id_29',
  'id_30',  'id_31',  'id_32',  'id_33',  'id_34',  'id_35',  'id_36',  'id_37',  'id_38',
  
# Engineered features
 'P_emaildomain_bin',
 'P_emaildomain_suffix',
 'R_emaildomain_bin',
 'R_emaildomain_suffix'
]

categorical_features_indices = [X.columns.get_loc(col) for col in cols_to_find]

## LBO

First lets get a new column for months from some arbitrary start date

and now create datasets using a "leave block out (LBO)"" split

In [16]:
main_train_set = train[train['DT_M']<(train['DT_M'].max())].reset_index(drop=True)
validation_set = train[train['DT_M']==train['DT_M'].max()].reset_index(drop=True)

print ("Training shape: %s, validation shape: %s"%(main_train_set.shape, validation_set.shape))
X, y = get_x_y(main_train_set)
X_valid, y_valid = get_x_y(validation_set)
train_pool = Pool(X, y, cat_features=categorical_features_indices)
validate_pool = Pool(X_valid, y_valid, cat_features=categorical_features_indices)

Training shape: (501214, 463), validation shape: (89326, 463)


First lets check the target split

In [17]:
target_vals=train_transaction.groupby('isFraud').count()[['TransactionID']]
target_vals_pcts = target_vals.apply(lambda x: x/x.sum())
print('Train: No Fraud: %s, Fraud: %s' % (target_vals_pcts.iloc[0,0], target_vals_pcts.iloc[1,0]) )

target_vals=validation_set.groupby('isFraud').count()[['TransactionID']]
target_vals_pcts = target_vals.apply(lambda x: x/x.sum())
print('Validation: No Fraud: %s, Fraud: %s' % (target_vals_pcts.iloc[0,0], target_vals_pcts.iloc[1,0]) )
train_transaction.groupby('isFraud').count()[['TransactionID']]

Train: No Fraud: 0.9650099908558268, Fraud: 0.03499000914417313
Validation: No Fraud: 0.965138929315093, Fraud: 0.03486107068490697


Unnamed: 0_level_0,TransactionID
isFraud,Unnamed: 1_level_1
0,569877
1,20663


So we need to be scoring better than about 96.5% accuracy. If not we could simple predict no fraud for 100%..

In [None]:
params = {
    'loss_function': 'Logloss',
    'iterations': 500,
    'learning_rate': 0.3642870155360327,
    'l2_leaf_reg': 3,
    'custom_metric': ['Accuracy', 'Recall', 'F1','MCC'],
    'eval_metric': 'MCC',
    #'eval_metric': 'F1',
    'random_seed': 42,
    'logging_level': 'Silent',
    'use_best_model': False,
    'early_stopping_rounds': 30,
    #'od_type': 'Iter',
    #'od_wait': 40,
    'class_weights': [1,2],
    'depth': 8
}

In [None]:
SEED = 42

gen_seeds(SEED)
model = CatBoostClassifier(**params)

model.fit(
    train_pool,
    eval_set=validate_pool,
    #logging_level='Info' #='Verbose',
    plot=True
);

# Average prediction results over folds
preds_proba = model.predict_proba(X_valid)[:,1]
preds = model.predict(X_valid)
    
print('Accuracy', accuracy_score(y_valid, preds))
print('AUC score', roc_auc_score(y_valid, preds))
print('F1 score', f1_score(y_valid, preds.round()))
print('MCC score', matthews_corrcoef(y_valid, preds.round()))

# Hyperparameter tuning

In [22]:
def hyperopt_objective(params):
    print('Params: '+str(params))
    model = CatBoostClassifier(
        l2_leaf_reg=int(params['l2_leaf_reg']),
        learning_rate=params['learning_rate'],
        loss_function=str(params['loss_function']),
        iterations=500,
        eval_metric=str(params['eval_metric']),
        random_seed=42,
        logging_level='Silent',
        custom_metric=['F1','MCC'],
        use_best_model=True,
        #'early_stopping_rounds': 30,
        od_type= 'Iter',
        od_wait= 40,
        depth=int(params['depth'])
    )
    
    model.fit(
        train_pool,
        eval_set=validate_pool    
    );

    # Get results and scores
    preds = model.predict(X_valid)
    acc_score = accuracy_score(y_valid, preds)
    auc_score = roc_auc_score(y_valid, preds)
    f1 = f1_score(y_valid, preds.round())
    mcc = matthews_corrcoef(y_valid, preds.round())
    print("Accuracy score: %s, AUC: %s, F1: %s, MCC: %s" % (acc_score, auc_score,f1, mcc))
    
    return 1 - auc_score # as hyperopt minimises

In [23]:
params_space = {
    'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
    'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
    #'eval_metric': hyperopt.hp.choice('eval_metric',['F1', 'MCC', 'Accuracy'])
    'loss_function': hyperopt.hp.choice('loss_function',['CrossEntropy','Logloss']),
    'eval_metric': hyperopt.hp.choice('eval_metric',['MCC','AUC']),
    'depth': hyperopt.hp.quniform('depth', 4,10,1)
    
}

trials = hyperopt.Trials()

best = hyperopt.fmin(
    hyperopt_objective,
    space=params_space,
    algo=hyperopt.tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=RandomState(123)
)

print(best)

Params: {'depth': 8.0, 'eval_metric': 'AUC', 'l2_leaf_reg': 6.0, 'learning_rate': 0.38214701126985373, 'loss_function': 'CrossEntropy'}
Accuracy score: 0.9752255782191075, AUC: 0.7068849798979505, F1: 0.540776094625441, MCC: 0.554671468597713
Params: {'depth': 7.0, 'eval_metric': 'MCC', 'l2_leaf_reg': 5.0, 'learning_rate': 0.41259684270951374, 'loss_function': 'CrossEntropy'}
Accuracy score: 0.9754382822470501, AUC: 0.721078836859897, F1: 0.5596146126053794, MCC: 0.5668029790064719
Params: {'depth': 4.0, 'eval_metric': 'MCC', 'l2_leaf_reg': 4.0, 'learning_rate': 0.1876331735386023, 'loss_function': 'Logloss'}
Accuracy score: 0.9753935024516938, AUC: 0.699388463640332, F1: 0.5329366765830853, MCC: 0.5528632087646062
Params: {'depth': 5.0, 'eval_metric': 'MCC', 'l2_leaf_reg': 5.0, 'learning_rate': 0.4644615965747169, 'loss_function': 'CrossEntropy'}
Accuracy score: 0.9754718670935674, AUC: 0.7149056145176141, F1: 0.5527658705858338, MCC: 0.5633730395535738
Params: {'depth': 8.0, 'eval_me

KeyboardInterrupt: 

# A new model

In [None]:
params = {
    'loss_function': 'Logloss',
    'iterations': 500,
    'learning_rate': 0.10263297186425369,
    'l2_leaf_reg': 4,
    'custom_metric': ['Accuracy', 'Recall', 'F1','MCC'],
    'eval_metric': 'MCC',
    'random_seed': 42,
    'logging_level': 'Silent',
    'use_best_model': False,
    #'early_stopping_rounds': 30,
    'od_type': 'Iter',
    'od_wait': 40,
    #'class_weights': [1,2],
    'depth': 10
}

In [None]:
SEED = 42

gen_seeds(SEED)
model = CatBoostClassifier(**params)

model.fit(
    train_pool,
    eval_set=validate_pool,
    #logging_level='Info' #='Verbose',
    plot=True
);

preds_proba = model.predict_proba(X_valid)[:,1]
preds = model.predict(X_valid)
    
print('Accuracy', accuracy_score(y_valid, preds))
print('AUC score', roc_auc_score(y_valid, preds))
print('F1 score', f1_score(y_valid, preds.round()))
print('MCC score', matthews_corrcoef(y_valid, preds.round()))

In [None]:
model.save_model( '/Users/lsmith/Projects/Kaggle/ieee-fraud-detection/models/model_v5.cbm')

# Predictions

In [None]:
predictions_probs = model.predict_proba(X_test)[:,1]
print(list(predictions_probs[:10]) )

Lets clip the preds as logloss penalise incorrect strong predictions (eg 0 or 1), so clipping will avoid this.

In [None]:
predictions_probs = np.clip(predictions_probs, 0.05, 0.95)
predictions_probs = np.around(predictions_probs, 6)

print(list(predictions_probs[:10]) )

In [None]:
df_sub = pd.read_csv(DATA_DIR + '/sample_submission.csv')
df_sub.shape

In [None]:
df_sub['isFraud'] = predictions_probs
df_sub.head()

In [None]:
df_sub.to_csv(DATA_DIR + '/submission_v1.csv', index=False)