In this notebook I will use permutation importance to identify important numeric features to aggregate in the feature-creation notebook. 

It will also help inform as to whether or not there might be any benefit in dropping any variables.

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import os 
import time

os.environ['KMP_DUPLICATE_LIB_OK']='True'
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [2]:
train_transaction = pd.read_csv('Data/train_transaction.csv')
fraud = train_transaction['isFraud']
train_transaction.drop(['isFraud','TransactionID'],axis=1,inplace=True)

In [3]:
#Creating function to deal with NAs by shuffling and forward filling.

def ffill(df):
    
    t0 = time.time()
    
    na_count = df.isna().sum().sum()
    while na_count>0:
        df = df.sample(frac=1)
        df = df.fillna(method='ffill',limit=10)
        na_count = df.isna().sum().sum()

    
    df = df.sort_index()
    t1 = time.time()

    return(df)
    print(t1-t0)

In [4]:
numerics = train_transaction.select_dtypes(exclude='object')
numerics = ffill(numerics)

In [5]:
numerics_train, numerics_test, fraud_train, fraud_test = train_test_split(numerics, fraud, test_size=0.8)

numerics_val, numerics_test, fraud_val, fraud_test = train_test_split(numerics_test, fraud_test, test_size = 0.75)

print(numerics_train.shape, numerics_val.shape)

(118108, 378) (118108, 378)


In [6]:
model = xgb.XGBClassifier(
    learning_rate = 0.2,
    n_estimators = 300,
    max_depth = 10,
    objective = 'binary:logistic'
)

eval_set = [(numerics_val, fraud_val)]

model.fit(
    numerics_train, 
    fraud_train,
    early_stopping_rounds = 5,
    eval_set = eval_set,
    eval_metric = 'auc'
)

[0]	validation_0-auc:0.816559
Will train until validation_0-auc hasn't improved in 5 rounds.
[1]	validation_0-auc:0.84244
[2]	validation_0-auc:0.849993
[3]	validation_0-auc:0.854918
[4]	validation_0-auc:0.860353
[5]	validation_0-auc:0.862264
[6]	validation_0-auc:0.865374
[7]	validation_0-auc:0.868906
[8]	validation_0-auc:0.870958
[9]	validation_0-auc:0.872244
[10]	validation_0-auc:0.876194
[11]	validation_0-auc:0.878211
[12]	validation_0-auc:0.880306
[13]	validation_0-auc:0.882946
[14]	validation_0-auc:0.886636
[15]	validation_0-auc:0.888655
[16]	validation_0-auc:0.891289
[17]	validation_0-auc:0.892409
[18]	validation_0-auc:0.895087
[19]	validation_0-auc:0.895738
[20]	validation_0-auc:0.897704
[21]	validation_0-auc:0.898366
[22]	validation_0-auc:0.900254
[23]	validation_0-auc:0.901409
[24]	validation_0-auc:0.90231
[25]	validation_0-auc:0.902834
[26]	validation_0-auc:0.902876
[27]	validation_0-auc:0.903888
[28]	validation_0-auc:0.904745
[29]	validation_0-auc:0.904516
[30]	validation_0-a

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.2,
       max_delta_step=0, max_depth=10, min_child_weight=1, missing=None,
       n_estimators=300, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [7]:
import eli5
from eli5.sklearn import PermutationImportance

In [8]:
perm_numerics = PermutationImportance(model, random_state=1).fit(numerics_val.iloc[:100000,:], fraud_val.iloc[:100000])
perm_numerics_df = eli5.explain_weights_df(perm_numerics, feature_names = numerics_val.columns.tolist())
perm_numerics_df['standardised_weight'] = perm_numerics_df['weight']/perm_numerics_df['std']
perm_numerics_df.sort_values('standardised_weight', ascending=False)

Unnamed: 0,feature,weight,std,standardised_weight
131,V163,0.000010,0.000000,inf
132,V108,0.000010,0.000000,inf
133,V278,0.000010,0.000000,inf
134,V188,0.000010,0.000000,inf
135,V153,0.000010,0.000000,inf
136,V63,0.000010,0.000000,inf
137,V198,0.000010,0.000000,inf
138,V129,0.000010,0.000000,inf
139,V260,0.000010,0.000000,inf
140,V333,0.000010,0.000000,inf


In [9]:
perm_numerics_df['standardised_weight'] = perm_numerics_df['weight']/perm_numerics_df['std']
out_df = perm_numerics_df.sort_values('standardised_weight', ascending=False)
out_df.to_csv('Data/numerics_rankings.csv',index=False) 