# Overall Statsitical Properties

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import os.path
from functools import reduce


In [28]:
pd.options.display.max_rows = 1000


In [29]:
DATA_DIR='../data'

In [30]:
train_samp = pd.read_pickle(os.path.join(DATA_DIR,'interim','train_reduced.pkl')).sample(frac=0.50, random_state=13)

In [31]:
train_samp.shape

(295270, 434)

In [32]:
train_samp = train_samp.drop(['TransactionID'],axis=1)

## Split numeric and character variables

In [33]:
char_predictors = [col for col in train_samp.columns if train_samp[col].dtype.name == 'object']

In [34]:
num_predictors = [col for col in train_samp.columns if train_samp[col].dtype.name != 'object']

## Dataframe for numeric perdictors

In [35]:
num_df = train_samp[num_predictors]
num_df.shape

(295270, 402)

In [36]:
def calc_pct_missing(df):
    return df.isnull().sum()/df.shape[0] * 100

def calc_overall_stats(df):
    desc = df.describe().T
    skew = pd.DataFrame(df.skew(), columns=['skew'])
    nunique = pd.DataFrame(df.nunique(), columns=['nunique'])
    pct_missing = pd.DataFrame(df.isnull().sum()/train_samp.shape[0] * 100, columns=['pct_missing']) 
    dtype = pd.DataFrame(df.dtypes, columns=['dtype'])

    return reduce(lambda left, right: pd.merge(left, right, left_index=True, right_index=True), 
          [dtype, nunique, pct_missing, skew, desc])
    
def calc_isFraud_stats(df):
    desc = df.groupby('isFraud').describe().stack(0).swaplevel(0,1).sort_index()
    skew = pd.DataFrame(df.groupby('isFraud').skew().stack(0).swaplevel(0,1).sort_index(), columns=['skew'])
    nunique = pd.DataFrame(df.groupby('isFraud').nunique().stack(0).swaplevel(0,1).sort_index(), columns=['nunique'])

    return reduce(lambda left, right: pd.merge(left, right, left_index=True, right_index=True), 
          [ nunique, skew, desc])[['nunique','skew', 'count', 'mean', 'std','min', '25%', '50%', '75%', 'max']]
# pct_missing,

In [37]:
numeric_stats = calc_overall_stats(num_df)
numeric_stats

Unnamed: 0,dtype,nunique,pct_missing,skew,count,mean,std,min,25%,50%,75%,max
isFraud,uint8,2,0.0,5.046689,295270.0,0.0351712,0.1842127,0.0,0.0,0.0,0.0,1.0
TransactionDT,uint32,290920,0.0,0.134336,295270.0,7365475.0,4618203.0,86401.0,3025518.0,7265135.5,11243660.0,15811130.0
TransactionAmt,float32,14437,0.0,14.264346,295270.0,134.9959,239.8896,0.251,43.261,68.5,125.0,31937.39
card1,uint16,11221,0.0,-0.037667,295270.0,9881.126,4901.541,1001.0,6019.0,9633.0,14170.0,18395.0
card2,uint16,501,0.0,-0.183831,295270.0,358.6557,159.8724,99.0,204.0,360.0,512.0,600.0
card3,uint8,105,0.0,1.624558,295270.0,153.077,11.69561,99.0,150.0,150.0,150.0,231.0
card5,uint8,105,0.0,-1.200781,295270.0,198.5546,41.94752,99.0,166.0,226.0,226.0,237.0
addr1,uint16,253,0.0,0.273546,295270.0,269.1642,113.304,99.0,184.0,272.0,327.0,540.0
addr2,uint8,63,0.0,-2.431662,295270.0,78.10898,24.63737,9.0,87.0,87.0,87.0,102.0
dist1,uint16,2358,0.0,-0.39708,295270.0,39193.42,32083.95,0.0,13.0,65535.0,65535.0,65535.0


In [38]:
numeric_stats_by_isFraud = calc_isFraud_stats(num_df)
numeric_stats_by_isFraud

Unnamed: 0_level_0,Unnamed: 1_level_0,nunique,skew,count,mean,std,min,25%,50%,75%,max
Unnamed: 0_level_1,isFraud,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
C1,0,1034,24.695639,284885.0,13.30356,128.1389,0.0,1.0,1.0,3.0,4685.0
C1,1,373,13.637097,10385.0,35.08801,234.989,0.0,1.0,2.0,6.0,4682.0
C10,0,769,25.990862,284885.0,4.778465,91.92431,0.0,0.0,0.0,0.0,3257.0
C10,1,177,13.982958,10385.0,18.92961,170.8753,0.0,0.0,1.0,2.0,3254.0
C11,0,953,22.997689,284885.0,9.738379,90.56159,0.0,1.0,1.0,2.0,3188.0
C11,1,255,13.081791,10385.0,23.27578,165.0705,0.0,1.0,2.0,4.0,3186.0
C12,0,608,28.273214,284885.0,3.617597,83.16613,0.0,0.0,0.0,0.0,3188.0
C12,1,215,14.796937,10385.0,18.12932,156.0667,0.0,0.0,0.0,2.0,3186.0
C13,0,1285,8.73556,284885.0,32.71815,127.9859,0.0,1.0,3.0,13.0,2918.0
C13,1,281,11.948761,10385.0,24.08888,160.8442,0.0,1.0,1.0,6.0,2915.0


In [39]:
df.sort_values(['std'])

Unnamed: 0,dtype,nunique,pct_missing,skew,count,mean,std,min,25%,50%,75%,max
V305,uint8,3,0.0,0.0,88581.0,1.0,0.00475168,0.0,1.0,1.0,1.0,2.0
C3,uint8,10,0.0,81.545347,88581.0,0.005565528,0.1332275,0.0,0.0,0.0,0.0,19.0
isFraud,uint8,2,0.0,5.06626,88581.0,0.03492848,0.1835997,0.0,0.0,0.0,0.0,1.0
D9,float32,25,0.0,2.416471,88581.0,-0.8008797,0.5336168,-1.0,-1.0,-1.0,-1.0,0.958333
id_24,float64,10,75.54103,9.264696,21666.0,10.08968,0.6448415,10.0,10.0,10.0,10.0,24.0
V290,uint8,35,0.0,34.509578,88581.0,1.102516,0.7311108,0.0,1.0,1.0,1.0,66.0
V286,uint8,9,0.0,277.161994,88581.0,0.03405922,0.8773397,0.0,0.0,0.0,0.0,255.0
V284,uint8,12,0.0,237.736048,88581.0,0.09289803,0.9234688,0.0,0.0,0.0,0.0,255.0
V302,uint8,13,0.0,196.096556,88581.0,0.2557095,0.9842226,0.0,0.0,0.0,0.0,255.0
V304,uint8,14,0.0,182.630122,88581.0,0.2683307,1.008101,0.0,0.0,0.0,0.0,255.0


## Dataframe for character predictors

In [40]:
char_df = train_samp[['isFraud'] + char_predictors]
char_df.head()

Unnamed: 0,isFraud,ProductCD,card4,card6,P_emaildomain,R_emaildomain,M1,M2,M3,M4,...,id_30,id_31,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
57903,0,W,visa,credit,aol.com,,,,,M0,...,,,,,,,,,,
400901,0,W,visa,debit,yahoo.com,,T,T,T,,...,,,,,,,,,,
235736,0,W,visa,credit,,,,,,,...,,,,,,,,,,
200779,0,W,visa,debit,gmail.com,,,,,,...,,,,,,,,,,
226515,0,W,visa,debit,sbcglobal.net,,,,,,...,,,,,,,,,,


In [41]:
def describe_char_var(df):
    
    count = pd.DataFrame(df.count(), columns=['count'])
    nunique = pd.DataFrame(df.nunique(), columns=['nunique'])
    pct_missing = pd.DataFrame(df.isnull().sum()/df.shape[0] * 100, columns=['pct_missing'])
    
    value_dict = dict()
    for c in char_df.columns:
        vc = char_df[c].value_counts()
        ll = list(zip(vc.index,vc))
        value_dict.update({c:ll[:5]})
        
    top_values = pd.DataFrame(pd.Series(value_dict), columns=['top_values'])
    
    return  reduce(lambda left, right: pd.merge(left, right, left_index=True, right_index=True), 
          [ count, nunique, pct_missing, top_values])

In [42]:
char_stats= describe_char_var(char_df)
char_stats

Unnamed: 0,count,nunique,pct_missing,top_values
isFraud,295270,2,0.0,"[(0, 284885), (1, 10385)]"
ProductCD,295270,5,0.0,"[(W, 219630), (C, 34413), (R, 18883), (H, 1659..."
card4,294439,4,0.281437,"[(visa, 192440), (mastercard, 94467), (america..."
card6,294446,4,0.279067,"[(debit, 219908), (credit, 74521), (debit or c..."
P_emaildomain,248173,59,15.950486,"[(gmail.com, 114045), (yahoo.com, 50521), (hot..."
R_emaildomain,68826,60,76.690487,"[(gmail.com, 28638), (hotmail.com, 13865), (an..."
M1,159467,2,45.99282,"[(T, 159457), (F, 10)]"
M2,159467,2,45.99282,"[(T, 142433), (F, 17034)]"
M3,159467,2,45.99282,"[(T, 125536), (F, 33931)]"
M4,154723,3,47.599485,"[(M0, 98322), (M2, 30079), (M1, 26322)]"
