In [9]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats.stats import pearsonr
import matplotlib.pyplot as plt
# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Read the identity table first, then the transaction table, from the local Data/ folder.
train_identity, train_transaction = (
    pd.read_csv(csv_path)
    for csv_path in ('Data/train_identity.csv', 'Data/train_transaction.csv')
)

In [10]:
def _split_and_encode(frame):
    """Split `frame` by dtype and one-hot encode its object columns.

    Returns a tuple ``(categorical, numerics, dummies)`` where `categorical`
    holds the object-dtype columns, `numerics` holds every other column, and
    `dummies` is ``pd.get_dummies(categorical, dummy_na=True)`` -- i.e. each
    object column is expanded into indicator columns, including one for NaN.
    """
    categorical = frame.select_dtypes(include='object')
    numerics = frame.select_dtypes(exclude='object')
    dummies = pd.get_dummies(categorical, dummy_na=True)
    return categorical, numerics, dummies


# Identity data: separate strings from numerics, build the dummies, then join
# the dummies back together with the numerics (numerics first).
identity_categorical, identity_numerics, identity_dummies = _split_and_encode(train_identity)
train_identity = pd.concat([identity_numerics, identity_dummies], axis=1)

# Transaction data: same treatment as the identity data.
transaction_categorical, transaction_numerics, transaction_dummies = _split_and_encode(train_transaction)
train_transaction = pd.concat([transaction_numerics, transaction_dummies], axis=1)

In this notebook, I'm going to play around with an alternative way of filling the NaNs. The quantile approach seems to be too computationally burdensome. As a workaround, I'll try randomly shuffling the rows of the dataframe, applying the forward-fill method, and repeating until all NaNs have been removed. I believe this will provide a faster way to fill the missing data with randomly chosen observed values.

In [11]:
import time

# Seed for the row shuffles, so the random fill (and everything computed from
# it afterwards) is reproducible from one run to the next.
SHUFFLE_SEED = 0
# Maximum number of consecutive NaNs a single forward-fill pass may fill.
FFILL_LIMIT = 10

t0 = time.time()

# Forward fill can only copy values that already exist in a column. A column
# made up entirely of NaN would therefore never be filled and the loop below
# would spin forever, so fail loudly instead.
nan_per_column = train_transaction.isna().sum()
n_rows = len(train_transaction)
unfillable = nan_per_column.index[(nan_per_column > 0) & (nan_per_column == n_rows)]
if len(unfillable) > 0:
    raise ValueError(
        "Cannot forward-fill columns that contain only NaN: "
        + ", ".join(map(str, unfillable))
    )

# One shared RandomState: its state advances on every call, so each pass uses
# a different (but reproducible) row permutation.
shuffle_rng = np.random.RandomState(SHUFFLE_SEED)

# Shuffle the rows, forward-fill, and repeat until no NaN is left. After a
# shuffle, the value above a gap is a random observed value of that column.
na_count = nan_per_column.sum()
while na_count > 0:
    train_transaction = train_transaction.sample(frac=1, random_state=shuffle_rng)
    # DataFrame.ffill replaces fillna(method='ffill'), whose `method` argument
    # is deprecated in recent pandas releases.
    train_transaction = train_transaction.ffill(limit=FFILL_LIMIT)
    na_count = train_transaction.isna().sum().sum()

# Undo the shuffling: put the rows back in TransactionID order.
train_transaction = train_transaction.sort_values(by=['TransactionID'])

t1 = time.time()

# Elapsed wall-clock time in seconds for the whole fill.
print(t1-t0)

109.07085609436035


In [12]:
# Correlate every column from the third one onward with the fraud label.
# NOTE(review): presumably the two skipped columns are TransactionID and isFraud -- confirm.
fraud_label = train_transaction['isFraud']
candidate_features = train_transaction.iloc[:, 2:]
corrs = candidate_features.corrwith(fraud_label)

# Rank the features by correlation strength, ignoring the sign.
ranked_strength = corrs.abs().sort_values(ascending=False)
print(ranked_strength)

V86                             0.206479
V87                             0.205757
V45                             0.199827
V44                             0.184848
V52                             0.170547
ProductCD_C                     0.161442
V79                             0.158742
V51                             0.158580
V94                             0.154554
V33                             0.154109
V17                             0.153773
card3                           0.153724
V18                             0.153644
V74                             0.153253
V34                             0.151538
V40                             0.150856
R_emaildomain_gmail.com         0.149974
V15                             0.149480
V81                             0.148887
V93                             0.147949
V92                             0.147882
V80                             0.147722
V16                             0.146446
M4_M2                           0.143940
V39             

In [8]:
# Show the correlations held in `corrs`, strongest first, ignoring the sign.
print(corrs.abs().sort_values(ascending=False))

V86                             0.210023
V87                             0.207766
V45                             0.197757
V44                             0.183268
V52                             0.169442
ProductCD_C                     0.161442
V79                             0.158838
V51                             0.158107
V94                             0.154913
V33                             0.154563
V74                             0.154125
card3                           0.153794
V17                             0.153655
V18                             0.153491
V34                             0.151994
V81                             0.150182
R_emaildomain_gmail.com         0.149974
V40                             0.149932
V15                             0.149577
V80                             0.148586
V92                             0.148316
V93                             0.148240
V16                             0.146455
V73                             0.144206
M4_M2           

In [43]:
# Time one shuffle + forward-fill pass and count the NaNs that remain after it.
t0 = time.time()

train_transaction = train_transaction.sample(frac=1)
# DataFrame.ffill replaces fillna(method='ffill'), whose `method` argument is
# deprecated in recent pandas releases; the fill behaviour is the same.
train_transaction = train_transaction.ffill(limit=10)
na_count = train_transaction.isna().sum().sum()

t1 = time.time()

# Remaining NaN total, then elapsed seconds for the single pass.
print(na_count,t1-t0)

0 34.564347982406616


In [None]:
# True if at least one NaN is left anywhere in the frame.
# The built-in any() must not be applied to the DataFrame directly: iterating
# a DataFrame yields its column labels, so any(frame) reports whether a column
# *name* is truthy, not whether a cell is missing. Reduce the mask instead:
# first per column, then across columns.
bool(train_transaction.isna().any().any())

In [47]:
# Display the most recently computed total of missing values in train_transaction.
na_count

0