In this notebook I will use permutation importance to identify important numeric features to aggregate in the feature-creation notebook. 

It will also help inform as to whether or not there might be any benefit in dropping any variables.

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import os 
import time

os.environ['KMP_DUPLICATE_LIB_OK']='True'
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [2]:
train_transaction = pd.read_csv('Data/train_transaction.csv')
fraud = train_transaction['isFraud']
train_transaction.drop('isFraud',axis=1,inplace=True)

In [3]:
#Creating function to deal with NAs by shuffling and forward filling.

def ffill(df):
    
    t0 = time.time()
    
    na_count = df.isna().sum().sum()
    while na_count>0:
        df = df.sample(frac=1)
        df = df.fillna(method='ffill',limit=10)
        na_count = df.isna().sum().sum()

    
    df = df.sort_index()
    t1 = time.time()

    return(df)
    print(t1-t0)

In [4]:
numerics = train_transaction.select_dtypes(exclude='object')
numerics = ffill(numerics)

In [9]:
numerics_train, numerics_test, fraud_train, fraud_test = train_test_split(numerics, fraud, test_size=0.8)

numerics_val, numerics_test, fraud_val, fraud_test = train_test_split(numerics_test, fraud_test, test_size = 0.75)

print(numerics_train.shape, numerics_val.shape)

(118108, 379) (118108, 379)


In [16]:
model = xgb.XGBClassifier(
    learning_rate = 0.2,
    n_estimators = 300,
    max_depth = 10,
    objective = 'binary:logistic'
)

eval_set = [(numerics_val, fraud_val)]

model.fit(
    numerics_train, 
    fraud_train,
    early_stopping_rounds = 5,
    eval_set = eval_set,
    eval_metric = 'auc'
)

[0]	validation_0-auc:0.824952
Will train until validation_0-auc hasn't improved in 5 rounds.
[1]	validation_0-auc:0.842145
[2]	validation_0-auc:0.849876
[3]	validation_0-auc:0.858019
[4]	validation_0-auc:0.859138
[5]	validation_0-auc:0.861019
[6]	validation_0-auc:0.864642
[7]	validation_0-auc:0.86633
[8]	validation_0-auc:0.867723
[9]	validation_0-auc:0.870634
[10]	validation_0-auc:0.874254
[11]	validation_0-auc:0.875376
[12]	validation_0-auc:0.877162
[13]	validation_0-auc:0.880175
[14]	validation_0-auc:0.881749
[15]	validation_0-auc:0.883307
[16]	validation_0-auc:0.88484
[17]	validation_0-auc:0.886631
[18]	validation_0-auc:0.889888
[19]	validation_0-auc:0.891647
[20]	validation_0-auc:0.893236
[21]	validation_0-auc:0.896534
[22]	validation_0-auc:0.897545
[23]	validation_0-auc:0.898049
[24]	validation_0-auc:0.898689
[25]	validation_0-auc:0.899445
[26]	validation_0-auc:0.900222
[27]	validation_0-auc:0.901336
[28]	validation_0-auc:0.902412
[29]	validation_0-auc:0.903271
[30]	validation_0-a

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.2,
       max_delta_step=0, max_depth=10, min_child_weight=1, missing=None,
       n_estimators=300, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [13]:
import eli5
from eli5.sklearn import PermutationImportance

In [28]:
perm_numerics = PermutationImportance(model, random_state=1).fit(numerics_val.iloc[:50000,:], fraud_val.iloc[:50000])
perm_numerics_df = eli5.explain_weights_df(perm_numerics, feature_names = numerics_val.columns.tolist())
perm_numerics_df['standardised_weight'] = perm_numerics_df['weight']/perm_numerics_df['std']
perm_numerics_df.sort_values('standardised_weight', ascending=False)

Unnamed: 0,feature,weight,std,standardised_weight
92,V255,0.000020,0.000000,inf
38,V67,0.000100,0.000000,inf
91,V289,0.000020,0.000000,inf
94,V39,0.000020,0.000000,inf
90,V115,0.000020,0.000000,inf
93,V319,0.000020,0.000000,inf
89,V7,0.000020,0.000000,inf
9,TransactionID,0.000920,0.000025,36.366193
16,V62,0.000348,0.000010,35.517601
25,V82,0.000208,0.000010,21.228911


In [30]:
perm_numerics_df['standardised_weight'] = perm_numerics_df['weight']/perm_numerics_df['std']
out_df = perm_numerics_df.sort_values('standardised_weight', ascending=False)
out_df.to_csv('Data/numerics_rankings.csv',index=False) 