https://www.kaggle.com/kyakovlev/ieee-data-minification

In [1]:
for name in dir():
 if not name.startswith("_"):
   del globals()[name]

In [2]:
#Import necessary packages
import numpy as np
import pandas as pd
import datetime as dt
import os
import matplotlib.pyplot as plt
import graphviz
import itertools
import pickle
import random

from scipy.stats import ks_2samp
from scipy.stats import chi2_contingency
from scipy.stats import chi2

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from sklearn.metrics import roc_curve, auc
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from scipy import interp
import warnings

warnings.filterwarnings('ignore')
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from joblib import Parallel, delayed

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
########################### Helpers
#################################################################################
## -------------------
## Seeder
# :seed to make all processes deterministic     # type: int
def seed_everything(seed=0):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
## ------------------- 

## -------------------
## Memory Reducer
# :df pandas dataframe to reduce size             # type: pd.DataFrame()
# :verbose                                        # type: bool
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        print(col)
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df
## -------------------

In [4]:
########################### Vars
#################################################################################
SEED = 42
seed_everything(SEED)
LOCAL_TEST = False

In [57]:
%%time
########################### Base Minification
#################################################################################
if 0:
    df_train_transaction = pd.read_csv('./data/train_transaction.csv', header='infer')
    df_test_transaction = pd.read_csv('./data/test_transaction.csv', header='infer')
    df_test_transaction['isFraud'] = 0
    df_train_identity = pd.read_csv('./data/train_identity.csv', header='infer')
    df_test_identity = pd.read_csv('./data/test_identity.csv', header='infer')

    df_train_transaction = reduce_mem_usage(df_train_transaction)
    df_test_transaction  = reduce_mem_usage(df_test_transaction)
    df_train_identity = reduce_mem_usage(df_train_identity)
    df_test_identity  = reduce_mem_usage(df_test_identity)
    
    df_train_transaction.to_pickle('./data/pickles/approach3/df_train_transaction.pkl')
    df_test_transaction.to_pickle('./data/pickles/approach3/df_test_transaction.pkl')
    df_train_identity.to_pickle('./data/pickles/approach3/df_train_identity.pkl')
    df_test_identity.to_pickle('./data/pickles/approach3/df_test_identity.pkl')
    
else:
    df_train_transaction = pd.read_pickle('./data/pickles/approach3/df_train_transaction.pkl')
    df_test_transaction = pd.read_pickle('./data/pickles/approach3/df_test_transaction.pkl')
    df_train_identity = pd.read_pickle('./data/pickles/approach3/df_train_identity.pkl')
    df_test_identity = pd.read_pickle('./data/pickles/approach3/df_test_identity.pkl')

CPU times: user 779 ms, sys: 361 ms, total: 1.14 s
Wall time: 1.16 s


In [58]:
print(df_train_transaction.shape)
print(df_test_transaction.shape)
print(df_train_identity.shape)
print(df_test_identity.shape)

(590540, 394)
(506691, 394)
(144233, 41)
(141907, 41)


In [59]:
df_train_transaction_transformed = df_train_transaction.copy()
df_test_transaction_transformed = df_test_transaction.copy()
df_train_identity_transformed = df_train_identity.copy()
df_test_identity_transformed = df_test_identity.copy()

In [60]:
%%time
df_train_full = pd.merge(df_train_transaction, df_train_identity, how='left', on='TransactionID')

CPU times: user 7.27 s, sys: 1.06 s, total: 8.33 s
Wall time: 8.33 s


In [61]:
print(df_train_full.shape)
df_train_full.head()

(590540, 434)


Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15,M1,M2,M3,M4,M5,M6,M7,M8,M9,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30,V31,V32,V33,V34,V35,V36,V37,V38,V39,V40,V41,V42,V43,V44,V45,V46,V47,V48,V49,V50,V51,V52,V53,V54,V55,V56,V57,V58,V59,V60,V61,V62,V63,V64,V65,V66,V67,V68,V69,V70,V71,V72,V73,V74,V75,V76,V77,V78,V79,V80,V81,V82,V83,V84,V85,V86,V87,V88,V89,V90,V91,V92,V93,V94,V95,V96,V97,V98,V99,V100,V101,V102,V103,V104,V105,V106,V107,V108,V109,V110,V111,V112,V113,V114,V115,V116,V117,V118,V119,V120,V121,V122,V123,V124,V125,V126,V127,V128,V129,V130,V131,V132,V133,V134,V135,V136,V137,V138,V139,V140,V141,V142,V143,V144,V145,V146,V147,V148,V149,V150,V151,V152,V153,V154,V155,V156,V157,V158,V159,V160,V161,V162,V163,V164,V165,V166,V167,V168,V169,V170,V171,V172,V173,V174,V175,V176,V177,V178,V179,V180,V181,V182,V183,V184,V185,V186,V187,V188,V189,V190,V191,V192,V193,V194,V195,V196,V197,V198,V199,V200,V201,V202,V203,V204,V205,V206,V207,V208,V209,V210,V211,V212,V213,V214,V215,V216,V217,V218,V219,V220,V221,V222,V223,V224,V225,V226,V227,V228,V229,V230,V231,V232,V233,V234,V235,V236,V237,V238,V239,V240,V241,V242,V243,V244,V245,V246,V247,V248,V249,V250,V251,V252,V253,V254,V255,V256,V257,V258,V259,V260,V261,V262,V263,V264,V265,V266,V267,V268,V269,V270,V271,V272,V273,V274,V275,V276,V277,V278,V279,V280,V281,V282,V283,V284,V285,V286,V287,V288,V289,V290,V291,V292,V293,V294,V295,V296,V297,V298,V299,V300,V301,V302,V303,V304,V305,V306,V307,V308,V309,V310,V311,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321,V322,V323,V324,V325,V326,V327,V328,V329,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,id_10,id_11,id_12,id_13,id_14,id_15,id_16,id_17,id_18,id_19,id_20,id_21,id_22,id_23,id_24,id_25,id_26,id_27,id_28,id_29,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,315.0,87.0,19.0,,,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,,13.0,,,,,,,13.0,13.0,,,,0.0,T,T,T,M2,F,T,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,87.0,,,gmail.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,0.0,,,,,0.0,,,,M0,T,T,,,,,,,,,,,,,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,330.0,87.0,287.0,,outlook.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,0.0,315.0,,,,315.0,T,T,T,M0,F,F,F,F,F,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,476.0,87.0,,,yahoo.com,,2.0,5.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,25.0,1.0,112.0,112.0,0.0,94.0,0.0,,,,,84.0,,,,,111.0,,,,M0,T,F,,,,,,,,,,,,,,,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,48.0,28.0,0.0,10.0,4.0,1.0,38.0,24.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,50.0,1758.0,925.0,0.0,354.0,135.0,50.0,1404.0,790.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,28.0,0.0,0.0,0.0,0.0,10.0,0.0,4.0,0.0,0.0,1.0,1.0,1.0,1.0,38.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,50.0,1758.0,925.0,0.0,354.0,0.0,135.0,0.0,0.0,0.0,50.0,1404.0,790.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,420.0,87.0,,,gmail.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,18.0,140.0,0.0,0.0,0.0,0.0,1803.0,49.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,15560.0,169690.796875,0.0,0.0,0.0,515.0,5155.0,2840.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,70787.0,,,,,,,,,100.0,NotFound,,-480.0,New,NotFound,166.0,,542.0,144.0,,,,,,,,New,NotFound,Android 7.0,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


In [62]:
df_train_full.dtypes

TransactionID       int32
isFraud              int8
TransactionDT       int32
TransactionAmt    float16
ProductCD          object
card1               int16
card2             float16
card3             float16
card4              object
card5             float16
card6              object
addr1             float16
addr2             float16
dist1             float16
dist2             float16
P_emaildomain      object
R_emaildomain      object
C1                float16
C2                float16
C3                float16
C4                float16
C5                float16
C6                float16
C7                float16
C8                float16
C9                float16
C10               float16
C11               float16
C12               float16
C13               float16
C14               float16
D1                float16
D2                float16
D3                float16
D4                float16
D5                float16
D6                float16
D7                float16
D8          

In [14]:
def check_missing(df):
    df_missing = pd.concat([df.nunique(), df.dtypes, df.isnull().sum()], axis=1)
    df_missing.columns = ['nunique', 'dtype', 'isnull_count']
    df_missing['missing'] = df_missing['isnull_count'] / float(len(df))
    return (df_missing)

In [15]:
%%time
if 0:
    df0 = df_train_full[ df_train_full['isFraud'] == 0 ]
    df0_missing = check_missing(df0)

    df1 = df_train_full[ df_train_full['isFraud'] == 1 ]
    df1_missing = check_missing(df1)
    df_missing = pd.merge(df0_missing, df1_missing, how="left", left_index=True, right_index=True)
    df_missing.to_pickle('./data/pickles/approach3/df_missing.pkl')
else:
    df_missing = pd.read_pickle('./data/pickles/approach3/df_missing.pkl')

CPU times: user 1.92 ms, sys: 1.55 ms, total: 3.47 ms
Wall time: 3.3 ms


In [16]:
print(df_missing.shape)
df_missing

(434, 8)


Unnamed: 0,nunique_x,dtype_x,isnull_count_x,missing_x,nunique_y,dtype_y,isnull_count_y,missing_y
TransactionID,569877,int32,0,0.0,20663,int32,0,0.0
isFraud,1,int8,0,0.0,1,int8,0,0.0
TransactionDT,553717,int32,0,0.0,20638,int32,0,0.0
TransactionAmt,8134,float16,0,0.0,2126,float16,0,0.0
ProductCD,5,object,0,0.0,5,object,0,0.0
card1,13350,int16,0,0.0,1740,int16,0,0.0
card2,500,float16,8510,0.014933,327,float16,423,0.020471
card3,106,float16,1526,0.002678,62,float16,39,0.001887
card4,4,object,1536,0.002695,4,object,41,0.001984
card5,118,float16,4049,0.007105,49,float16,210,0.010163


In [63]:
########################### card4, card6, ProductCD
#################################################################################
# Converting Strings to ints(or floats if nan in column) using frequency encoding
# We will be able to use these columns as category or as numerical feature

for col in ['card4', 'card6', 'ProductCD']:
    print('Encoding', col)
    temp_df = pd.concat([df_train_transaction_transformed[[col]], df_test_transaction_transformed[[col]]])
    col_encoded = temp_df[col].value_counts().to_dict()   
    df_train_transaction_transformed[col] = df_train_transaction_transformed[col].map(col_encoded)
    df_test_transaction_transformed[col]  = df_test_transaction_transformed[col].map(col_encoded)
    print(col_encoded)

Encoding card4
{'visa': 719649, 'mastercard': 347386, 'american express': 16009, 'discover': 9524}
Encoding card6
{'debit': 824959, 'credit': 267648, 'debit or credit': 30, 'charge card': 16}
Encoding ProductCD
{'W': 800657, 'C': 137785, 'R': 73346, 'H': 62397, 'S': 23046}


In [64]:
########################### M columns
#################################################################################
# Converting Strings to ints(or floats if nan in column)

for col in ['M1','M2','M3','M5','M6','M7','M8','M9']:
    df_train_transaction_transformed[col] = df_train_transaction_transformed[col].map({'T':1, 'F':0})
    df_test_transaction_transformed[col]  = df_test_transaction_transformed[col].map({'T':1, 'F':0})

for col in ['M4']:
    print('Encoding', col)
    temp_df = pd.concat([df_train_transaction_transformed[[col]], df_test_transaction_transformed[[col]]])
    col_encoded = temp_df[col].value_counts().to_dict()   
    df_train_transaction_transformed[col] = df_train_transaction_transformed[col].map(col_encoded)
    df_test_transaction_transformed[col]  = df_test_transaction_transformed[col].map(col_encoded)
    print(col_encoded)

Encoding M4
{'M0': 357789, 'M2': 122947, 'M1': 97306}


In [65]:
col = 'id_33'
print(df_train_full[col].dtypes)
print(df_train_full[col].value_counts())
#print(df_train_transaction_transformed[col].dtypes)

object
1920x1080    16874
1366x768      8605
1334x750      6447
2208x1242     4900
1440x900      4384
1600x900      3510
2048x1536     3482
1280x800      2149
2560x1600     2093
2560x1440     1865
2880x1800     1756
1280x1024     1743
1680x1050     1727
1136x640      1712
2436x1125     1484
1280x720      1318
1920x1200     1056
2001x1125     1020
1024x768       824
2220x1080      544
5120x2880      397
3360x2100      380
2732x2048      354
1366x767       335
3840x2160      263
855x480        250
2736x1824      241
1360x768       224
4096x2304      208
2224x1668      194
2220x1081      176
2160x1440      168
2961x1442      157
3200x1800      124
2560x1080      103
2562x1442       99
1600x1200       97
1280x768        90
3000x2000       88
1024x600        88
1280x1025       82
1152x864        79
3440x1440       76
801x480         64
2400x1350       62
960x540         58
2672x1440       58
1919x1079       57
1365x768        52
1919x1080       51
2048x1152       51
2960x1440       47
1920x

In [66]:
########################### Identity columns
#################################################################################

def minify_identity_df(df):

    df['id_12'] = df['id_12'].map({'Found':1, 'NotFound':0})
    df['id_15'] = df['id_15'].map({'New':2, 'Found':1, 'Unknown':0})
    df['id_16'] = df['id_16'].map({'Found':1, 'NotFound':0})

    df['id_23'] = df['id_23'].map({'TRANSPARENT':4, 'IP_PROXY':3, 'IP_PROXY:ANONYMOUS':2, 'IP_PROXY:HIDDEN':1})

    df['id_27'] = df['id_27'].map({'Found':1, 'NotFound':0})
    df['id_28'] = df['id_28'].map({'New':2, 'Found':1})

    df['id_29'] = df['id_29'].map({'Found':1, 'NotFound':0})

    df['id_35'] = df['id_35'].map({'T':1, 'F':0})
    df['id_36'] = df['id_36'].map({'T':1, 'F':0})
    df['id_37'] = df['id_37'].map({'T':1, 'F':0})
    df['id_38'] = df['id_38'].map({'T':1, 'F':0})

    df['id_34'] = df['id_34'].fillna(':0')
    df['id_34'] = df['id_34'].apply(lambda x: x.split(':')[1]).astype(np.int8)
    df['id_34'] = np.where(df['id_34']==0, np.nan, df['id_34'])
    
    df['id_33'] = df['id_33'].fillna('0x0')
    df['id_33_0'] = df['id_33'].apply(lambda x: x.split('x')[0]).astype(int)
    df['id_33_1'] = df['id_33'].apply(lambda x: x.split('x')[1]).astype(int)
    df['id_33'] = np.where(df['id_33']=='0x0', np.nan, df['id_33'])

    df['DeviceType'] = df['DeviceType'].map({'desktop':1, 'mobile':0})
    return df

df_train_identity_transformed = minify_identity_df(df_train_identity_transformed)
df_test_identity_transformed = minify_identity_df(df_test_identity_transformed)

for col in ['id_33']:
    df_train_identity_transformed[col] = df_train_identity_transformed[col].fillna('unseen_before_label')
    df_test_identity_transformed[col]  = df_test_identity_transformed[col].fillna('unseen_before_label')
    
    le = LabelEncoder()
    le.fit(list(df_train_identity_transformed[col])+list(df_test_identity_transformed[col]))
    df_train_identity_transformed[col] = le.transform(df_train_identity_transformed[col])
    df_test_identity_transformed[col]  = le.transform(df_test_identity_transformed[col])
    

In [67]:
df_train_identity_transformed = reduce_mem_usage(df_train_identity_transformed)
df_test_identity_transformed  = reduce_mem_usage(df_test_identity_transformed)

df_train_identity_transformed = reduce_mem_usage(df_train_identity_transformed)
df_test_identity_transformed  = reduce_mem_usage(df_test_identity_transformed)

TransactionID
id_01
id_02
id_03
id_04
id_05
id_06
id_07
id_08
id_09
id_10
id_11
id_12
id_13
id_14
id_15
id_16
id_17
id_18
id_19
id_20
id_21
id_22
id_23
id_24
id_25
id_26
id_27
id_28
id_29
id_30
id_31
id_32
id_33
id_34
id_35
id_36
id_37
id_38
DeviceType
DeviceInfo
id_33_0
id_33_1
Mem. usage decreased to 14.72 Mb (47.5% reduction)
TransactionID
id_01
id_02
id_03
id_04
id_05
id_06
id_07
id_08
id_09
id_10
id_11
id_12
id_13
id_14
id_15
id_16
id_17
id_18
id_19
id_20
id_21
id_22
id_23
id_24
id_25
id_26
id_27
id_28
id_29
id_30
id_31
id_32
id_33
id_34
id_35
id_36
id_37
id_38
DeviceType
DeviceInfo
id_33_0
id_33_1
Mem. usage decreased to 14.48 Mb (47.5% reduction)
TransactionID
id_01
id_02
id_03
id_04
id_05
id_06
id_07
id_08
id_09
id_10
id_11
id_12
id_13
id_14
id_15
id_16
id_17
id_18
id_19
id_20
id_21
id_22
id_23
id_24
id_25
id_26
id_27
id_28
id_29
id_30
id_31
id_32
id_33
id_34
id_35
id_36
id_37
id_38
DeviceType
DeviceInfo
id_33_0
id_33_1
Mem. usage decreased to 14.72 Mb (0.0% reduction)
Transact

In [69]:
%%time
df_train_transaction_transformed.to_pickle('./data/pickles/approach3/df_train_transaction_transformed.pkl')
df_test_transaction_transformed.to_pickle('./data/pickles/approach3/df_test_transaction_transformed.pkl')                                        
       
df_train_identity_transformed.to_pickle('./data/pickles/approach3/df_train_identity_transformed.pkl')
df_test_identity_transformed.to_pickle('./data/pickles/approach3/df_test_identity_transformed.pkl')                                           

CPU times: user 589 ms, sys: 726 ms, total: 1.32 s
Wall time: 1.55 s


In [70]:
print(df_train_identity.shape)
print(df_train_identity_transformed.shape)

(144233, 41)
(144233, 43)


In [71]:
df_train_identity_transformed.head()

Unnamed: 0,TransactionID,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,id_10,id_11,id_12,id_13,id_14,id_15,id_16,id_17,id_18,id_19,id_20,id_21,id_22,id_23,id_24,id_25,id_26,id_27,id_28,id_29,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,id_33_0,id_33_1
0,2987004,0.0,70787.0,,,,,,,,,100.0,0,,-480.0,2.0,0.0,166.0,,542.0,144.0,,,,,,,,2.0,0.0,Android 7.0,samsung browser 6.2,32.0,267,2.0,1.0,0.0,1.0,1.0,0.0,SAMSUNG SM-G892A Build/NRD90M,2220,1080
1,2987008,-5.0,98945.0,,,0.0,-5.0,,,,,100.0,0,49.0,-300.0,2.0,0.0,166.0,,621.0,500.0,,,,,,,,2.0,0.0,iOS 11.1.2,mobile safari 11.0,32.0,79,1.0,1.0,0.0,0.0,1.0,0.0,iOS Device,1334,750
2,2987010,-5.0,191631.0,0.0,0.0,0.0,0.0,,,0.0,0.0,100.0,0,52.0,,1.0,1.0,121.0,,410.0,142.0,,,,,,,,1.0,1.0,,chrome 62.0,,460,,0.0,0.0,1.0,1.0,1.0,Windows,0,0
3,2987011,-5.0,221832.0,,,0.0,-6.0,,,,,100.0,0,52.0,,2.0,0.0,225.0,,176.0,507.0,,,,,,,,2.0,0.0,,chrome 62.0,,460,,0.0,0.0,1.0,1.0,1.0,,0,0
4,2987016,0.0,7460.0,0.0,0.0,1.0,0.0,,,0.0,0.0,100.0,0,,-300.0,1.0,1.0,166.0,15.0,529.0,575.0,,,,,,,,1.0,1.0,Mac OS X 10_11_6,chrome 62.0,24.0,67,2.0,1.0,0.0,1.0,1.0,1.0,MacOS,1280,800


In [74]:
col = 'id_33'
print(df_train_identity_transformed[col].value_counts())
print(df_train_full[col].value_counts())

460    70950
215    16874
97      8605
79      6447
266     4900
122     4384
155     3510
242     3482
67      2149
315     2093
312     1865
352     1756
56      1743
170     1727
19      1712
296     1484
61      1318
219     1056
233     1020
9        824
267      544
430      397
388      380
335      354
96       335
408      263
448      250
341      241
88       224
416      208
269      194
268      176
260      168
360      157
376      124
308      103
323       99
150       97
65        90
6         88
365       88
57        82
28        79
390       76
447       64
291       62
455       58
327       58
206       57
93        52
207       51
239       51
357       47
229       42
139       42
223       42
70        36
27        32
411       29
250       27
351       26
182       25
133       24
91        19
241       19
55        18
147       18
214       17
293       17
244       17
76        15
7         14
387       14
457       12
154       12
350       12
391       11