# Initial EDA

In [1]:
import pandas as pd
import numpy as np
import os
import os.path

In [6]:
pd.options.display.max_rows = 4000


In [7]:
DATA_DIR = '../data'

In [101]:
train_df = pd.read_pickle(os.path.join(DATA_DIR, 'interim', 'train_reduced.pkl')).sample(frac=0.25,random_state=13)
train_df.shape

(147635, 434)

In [102]:
train_df['isFraud'].describe()

count    147635.000000
mean          0.035093
std           0.184016
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: isFraud, dtype: float64

In [103]:
train_df['TransactionDT'].describe()

count    1.476350e+05
mean     7.366157e+06
std      4.615224e+06
min      8.646900e+04
25%      3.021557e+06
50%      7.271472e+06
75%      1.124650e+07
max      1.581109e+07
Name: TransactionDT, dtype: float64

In [104]:
# get only predictor variables
predictors = list(set(train_df.columns) - set(['TransactionID', 'isFraud']))

In [105]:
predictors

['V135',
 'V35',
 'V280',
 'V180',
 'V142',
 'V289',
 'V287',
 'id_04',
 'V330',
 'V190',
 'V246',
 'V184',
 'V332',
 'V179',
 'V5',
 'D2',
 'D9',
 'V239',
 'V107',
 'V309',
 'V200',
 'D10',
 'card4',
 'V335',
 'id_12',
 'id_29',
 'V95',
 'V265',
 'V315',
 'V113',
 'V105',
 'V133',
 'id_20',
 'V278',
 'V247',
 'V96',
 'V3',
 'V290',
 'id_19',
 'V204',
 'id_06',
 'V78',
 'V80',
 'V212',
 'C13',
 'V115',
 'V136',
 'V236',
 'V205',
 'V132',
 'V6',
 'V41',
 'V243',
 'V277',
 'V21',
 'V70',
 'C1',
 'V163',
 'id_18',
 'D4',
 'V206',
 'V276',
 'V166',
 'V26',
 'V116',
 'V270',
 'V92',
 'id_37',
 'V221',
 'V312',
 'V83',
 'V14',
 'V37',
 'V174',
 'id_35',
 'V102',
 'V338',
 'C14',
 'V97',
 'V225',
 'addr1',
 'V144',
 'D1',
 'id_34',
 'V264',
 'V120',
 'V127',
 'V295',
 'V297',
 'V161',
 'V321',
 'V13',
 'C5',
 'V258',
 'V328',
 'M3',
 'V146',
 'V20',
 'V333',
 'V109',
 'V248',
 'V210',
 'V262',
 'V84',
 'V52',
 'V294',
 'V275',
 'M8',
 'V54',
 'V55',
 'id_16',
 'V44',
 'D12',
 'V233',
 'V324',

In [106]:
float_predictors = [(x, train_df[x].dtype.name) for x in predictors if train_df[x].dtype.name[:5] == 'float']

In [107]:
train_df.dtypes

TransactionID      uint32
isFraud             uint8
TransactionDT      uint32
TransactionAmt    float32
ProductCD          object
card1              uint16
card2              uint16
card3               uint8
card4              object
card5               uint8
card6              object
addr1              uint16
addr2               uint8
dist1              uint16
dist2              uint16
P_emaildomain      object
R_emaildomain      object
C1                 uint16
C2                 uint16
C3                  uint8
C4                 uint16
C5                 uint16
C6                 uint16
C7                 uint16
C8                 uint16
C9                  uint8
C10                uint16
C11                uint16
C12                uint16
C13                uint16
C14                uint16
D1                 uint16
D2                 uint16
D3                 uint16
D4                  int16
D5                 uint16
D6                  int16
D7                 uint16
D8          

In [108]:
df = train_df.iloc[:5,:5]

In [109]:
df

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD
57903,3044903,0,1349089,108.5,W
400901,3387901,0,10106096,312.950012,W
235736,3222736,0,5586574,490.119995,W
200779,3187779,0,4570886,17.950001,W
226515,3213515,0,5351581,114.949997,W


In [110]:
df['new_var'] = 123

In [111]:
df

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,new_var
57903,3044903,0,1349089,108.5,W,123
400901,3387901,0,10106096,312.950012,W,123
235736,3222736,0,5586574,490.119995,W,123
200779,3187779,0,4570886,17.950001,W,123
226515,3213515,0,5351581,114.949997,W,123


In [112]:
def extract_df(col):
    df = train_df[['isFraud']].copy()
    df['value'] = train_df[col].copy()
    df['var_name'] = col
    return df

In [113]:
ll= [extract_df(col[0]) for col in float_predictors]

In [114]:
df = pd.concat(ll,ignore_index=True)

In [115]:
df.shape

(15058770, 3)

In [120]:
df[df.var_name == 'V135']

Unnamed: 0,isFraud,value,var_name
0,0,0.000000,V135
1,0,0.000000,V135
2,0,0.000000,V135
3,0,61.849998,V135
4,0,0.000000,V135
5,0,0.000000,V135
6,0,0.000000,V135
7,0,0.000000,V135
8,0,0.000000,V135
9,0,0.000000,V135


In [119]:
float_predictors

[('V135', 'float32'),
 ('id_04', 'float64'),
 ('V332', 'float32'),
 ('D9', 'float32'),
 ('V309', 'float32'),
 ('V335', 'float32'),
 ('V265', 'float32'),
 ('V315', 'float32'),
 ('V133', 'float32'),
 ('id_20', 'float64'),
 ('V278', 'float32'),
 ('id_19', 'float64'),
 ('V204', 'float32'),
 ('id_06', 'float64'),
 ('V212', 'float32'),
 ('V136', 'float32'),
 ('V205', 'float32'),
 ('V132', 'float32'),
 ('V277', 'float32'),
 ('V163', 'float32'),
 ('id_18', 'float64'),
 ('V206', 'float32'),
 ('V276', 'float32'),
 ('V166', 'float32'),
 ('V270', 'float32'),
 ('V312', 'float32'),
 ('V338', 'float32'),
 ('V264', 'float32'),
 ('V127', 'float32'),
 ('V161', 'float32'),
 ('V321', 'float32'),
 ('V333', 'float32'),
 ('V210', 'float32'),
 ('V275', 'float32'),
 ('V311', 'float32'),
 ('V331', 'float32'),
 ('id_03', 'float64'),
 ('id_05', 'float64'),
 ('V129', 'float32'),
 ('V137', 'float32'),
 ('V318', 'float32'),
 ('V165', 'float32'),
 ('id_21', 'float64'),
 ('V339', 'float32'),
 ('id_14', 'float64'),
 ('