# Initial EDA

In [1]:
import pandas as pd
import numpy as np
import os
import os.path
import sys
import seaborn as sns
import mlflow.tracking
import tempfile
import scipy.stats as stats

In [2]:
# Make the parent directory importable. The project-local `utils` package imported
# below is presumably located there -- TODO confirm against the repository layout.
sys.path.append('..')
from utils.kaggle import get_global_parameters, calc_contest_metric
from utils.preprocessing import encode_mean_level
# Settings object returned by the project helper; its contents are not shown in this notebook.
global_parms = get_global_parameters()


## Retrieve the MLflow artifact for the sample data set

In [3]:
# Run Id for sample data
RUN_ID = '4b20d2c58c6f44beb36528a1542f9551'

# Download everything logged under the run into a fresh scratch directory.
# NOTE(review): the directory created by mkdtemp() is never removed.
client = mlflow.tracking.MlflowClient()
tmpdir = tempfile.mkdtemp()
client.download_artifacts(RUN_ID, '.', tmpdir)

# NOTE(review): unpickling can execute arbitrary code; only load artifacts from trusted runs.
sample_path = os.path.join(tmpdir, 'sample.pkl')
train_df = pd.read_pickle(sample_path)
train_df.shape

(118108, 434)

In [4]:
# overall fraud rate
train_df['isFraud'].mean()

0.03501879635587767

## Partition predictors as numeric and categorical

In [5]:
# separate numeric vs categorical attributes
# Columns that are never model inputs: the target and the row identifier.
NON_PREDICTORS = ('isFraud', 'TransactionID')

# Keep the frame's own column order. Building this list from a set difference made
# its order depend on string hashing, so it changed from one kernel start to the next.
predictors = [col for col in train_df.columns if col not in NON_PREDICTORS]

# Object-dtype columns are treated as categorical; everything else as numeric.
column_dtypes = train_df.dtypes
cat_predictors = sorted(col for col in predictors if column_dtypes[col] == 'object')
cat_lookup = set(cat_predictors)
num_predictors = sorted(col for col in predictors if col not in cat_lookup)

In [6]:
cat_predictors

['DeviceInfo',
 'DeviceType',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'P_emaildomain',
 'ProductCD',
 'R_emaildomain',
 'card4',
 'card6',
 'id_12',
 'id_15',
 'id_16',
 'id_23',
 'id_27',
 'id_28',
 'id_29',
 'id_30',
 'id_31',
 'id_33',
 'id_34',
 'id_35',
 'id_36',
 'id_37',
 'id_38']

In [7]:
num_predictors

['C1',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'D1',
 'D10',
 'D11',
 'D12',
 'D13',
 'D14',
 'D15',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D8',
 'D9',
 'TransactionAmt',
 'TransactionDT',
 'V1',
 'V10',
 'V100',
 'V101',
 'V102',
 'V103',
 'V104',
 'V105',
 'V106',
 'V107',
 'V108',
 'V109',
 'V11',
 'V110',
 'V111',
 'V112',
 'V113',
 'V114',
 'V115',
 'V116',
 'V117',
 'V118',
 'V119',
 'V12',
 'V120',
 'V121',
 'V122',
 'V123',
 'V124',
 'V125',
 'V126',
 'V127',
 'V128',
 'V129',
 'V13',
 'V130',
 'V131',
 'V132',
 'V133',
 'V134',
 'V135',
 'V136',
 'V137',
 'V138',
 'V139',
 'V14',
 'V140',
 'V141',
 'V142',
 'V143',
 'V144',
 'V145',
 'V146',
 'V147',
 'V148',
 'V149',
 'V15',
 'V150',
 'V151',
 'V152',
 'V153',
 'V154',
 'V155',
 'V156',
 'V157',
 'V158',
 'V159',
 'V16',
 'V160',
 'V161',
 'V162',
 'V163',
 'V164',
 'V165',
 'V166',
 'V167',
 'V168',
 'V169',
 'V17',
 'V170',
 'V171',
 'V172',
 'V173',
 'V174',
 'V

In [8]:
def summarize_categorical(cat_df, target=None):
    """Summarise the target rate and row count for every level of every column.

    Parameters
    ----------
    cat_df : pandas.DataFrame
        Categorical predictors; each column is summarised independently.
    target : pandas.Series, optional
        Target values, matched to ``cat_df`` by index. When omitted, the
        notebook-level ``train_df['isFraud']`` is used, which keeps the original
        one-argument call working unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per (column, level) pair, indexed by a MultiIndex named
        ``("var", "level")``, with two columns: ``isFraud`` (mean of the target
        within the level) and ``count`` (number of non-null target values within
        the level, kept as integers). Rows whose level is missing are excluded
        because the group-by drops NaN keys. An empty frame with the same columns
        and index names is returned when there is nothing to summarise.
    """
    if target is None:
        target = train_df['isFraud']
    # Normalise the name so the aggregation below can always select 'isFraud'.
    target = target.rename('isFraud')

    summaries = []
    for col in cat_df.columns:
        paired = pd.concat([cat_df[col], target], axis=1)

        # A single pass yields both statistics and keeps the count as an integer.
        per_level = paired.groupby(col)['isFraud'].agg(['mean', 'count'])
        if per_level.empty:
            # Column with no non-missing levels contributes no rows.
            continue

        per_level.columns = ['isFraud', 'count']
        per_level.index = pd.MultiIndex.from_arrays(
            [[col] * len(per_level), per_level.index],
            names=["var", "level"],
        )
        summaries.append(per_level)

    if not summaries:
        empty_index = pd.MultiIndex.from_arrays([[], []], names=["var", "level"])
        return pd.DataFrame(columns=['isFraud', 'count'], index=empty_index)

    return pd.concat(summaries)

In [9]:
# Work on an independent copy of the categorical columns so train_df stays untouched.
cat_df = train_df.loc[:, cat_predictors].copy()

In [10]:
summarize_categorical(cat_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,isFraud,count
var,level,Unnamed: 2_level_1,Unnamed: 3_level_1
DeviceInfo,0PAJ5,0.000000,1.0
DeviceInfo,1016S,0.000000,1.0
DeviceInfo,2PS64 Build/NRD90M,0.000000,1.0
DeviceInfo,2PZC5,0.000000,1.0
DeviceInfo,4003A,0.000000,1.0
DeviceInfo,4013M Build/KOT49H,0.000000,2.0
DeviceInfo,4034G,0.000000,1.0
DeviceInfo,4047G Build/NRD90M,0.000000,4.0
DeviceInfo,5010G Build/MRA58K,0.000000,13.0
DeviceInfo,5010S Build/MRA58K,0.000000,3.0


In [11]:
# Replace missing values with an explicit marker level so they are counted like any other level.
cat_df = cat_df.fillna('__NA__')

In [12]:
cat_df.head()

Unnamed: 0,DeviceInfo,DeviceType,M1,M2,M3,M4,M5,M6,M7,M8,...,id_28,id_29,id_30,id_31,id_33,id_34,id_35,id_36,id_37,id_38
57903,__NA__,__NA__,__NA__,__NA__,__NA__,M0,T,F,__NA__,__NA__,...,__NA__,__NA__,__NA__,__NA__,__NA__,__NA__,__NA__,__NA__,__NA__,__NA__
400901,__NA__,__NA__,T,T,T,__NA__,__NA__,F,F,F,...,__NA__,__NA__,__NA__,__NA__,__NA__,__NA__,__NA__,__NA__,__NA__,__NA__
235736,__NA__,__NA__,__NA__,__NA__,__NA__,__NA__,__NA__,F,F,F,...,__NA__,__NA__,__NA__,__NA__,__NA__,__NA__,__NA__,__NA__,__NA__,__NA__
200779,__NA__,__NA__,__NA__,__NA__,__NA__,__NA__,__NA__,F,__NA__,__NA__,...,__NA__,__NA__,__NA__,__NA__,__NA__,__NA__,__NA__,__NA__,__NA__,__NA__
226515,__NA__,__NA__,__NA__,__NA__,__NA__,__NA__,__NA__,T,__NA__,__NA__,...,__NA__,__NA__,__NA__,__NA__,__NA__,__NA__,__NA__,__NA__,__NA__,__NA__


## Count the unique levels of each categorical predictor, including the missing-value marker '\_\_NA\_\_'

In [13]:
# Number of distinct levels in every categorical column, indexed by column name.
cat_df.nunique()

DeviceInfo       1091
DeviceType          3
M1                  3
M2                  3
M3                  3
M4                  4
M5                  3
M6                  3
M7                  3
M8                  3
M9                  3
P_emaildomain      60
ProductCD           5
R_emaildomain      60
card4               5
card6               5
id_12               3
id_15               4
id_16               3
id_23               4
id_27               3
id_28               3
id_29               3
id_30              73
id_31             113
id_33             154
id_34               4
id_35               3
id_36               3
id_37               3
id_38               3
dtype: int64

## Combine low-count categorical levels

In [14]:
MAX_CATEGORICAL_LEVELS=100
TOP_CATEGORICAL_LEVELS=10
# Marker for missing values. It matches the value cat_df is filled with, so the
# result has one missing level whether or not cat_df was filled beforehand.
MISSING_LEVEL = '__NA__'
# Marker that replaces every level outside the most frequent ones.
OTHER_LEVEL = '__OTHER__'
cat2_df = pd.DataFrame()

for c in cat_predictors:

    # take one categorical column; fillna returns a new Series, so cat_df is not modified
    all_cat = cat_df[c].fillna(MISSING_LEVEL)

    # determine number of unique levels (the missing marker counts as a level)
    number_of_levels = all_cat.nunique()
    print('Predictor: ',c,' levels ',number_of_levels)
    if number_of_levels > MAX_CATEGORICAL_LEVELS:
        # too many levels to be useful even after collapsing; leave the column out
        print("    By passing")
        continue

    # handle situation where number of unique levels exceed threshold
    if number_of_levels > TOP_CATEGORICAL_LEVELS:
        # value_counts is sorted by descending frequency
        counts_by_level = all_cat.value_counts()

        # get level values for those not in the top ranks
        low_count_levels = counts_by_level.index[TOP_CATEGORICAL_LEVELS:]

        # never fold the empty string or the missing marker into the catch-all level
        levels_to_other = [x for x in low_count_levels
                           if x != '' and x != MISSING_LEVEL]

        # set less frequent levels to special valid value (vectorised membership test)
        all_cat = all_cat.mask(all_cat.isin(levels_to_other), OTHER_LEVEL)

    cat2_df[c] = all_cat

Predictor:  DeviceInfo  levels  1091
    By passing
Predictor:  DeviceType  levels  3
Predictor:  M1  levels  3
Predictor:  M2  levels  3
Predictor:  M3  levels  3
Predictor:  M4  levels  4
Predictor:  M5  levels  3
Predictor:  M6  levels  3
Predictor:  M7  levels  3
Predictor:  M8  levels  3
Predictor:  M9  levels  3
Predictor:  P_emaildomain  levels  60
Predictor:  ProductCD  levels  5
Predictor:  R_emaildomain  levels  60
Predictor:  card4  levels  5
Predictor:  card6  levels  5
Predictor:  id_12  levels  3
Predictor:  id_15  levels  4
Predictor:  id_16  levels  3
Predictor:  id_23  levels  4
Predictor:  id_27  levels  3
Predictor:  id_28  levels  3
Predictor:  id_29  levels  3
Predictor:  id_30  levels  73
Predictor:  id_31  levels  113
    By passing
Predictor:  id_33  levels  154
    By passing
Predictor:  id_34  levels  4
Predictor:  id_35  levels  3
Predictor:  id_36  levels  3
Predictor:  id_37  levels  3
Predictor:  id_38  levels  3


In [15]:
cat2_df['P_emaildomain'].value_counts()

gmail.com        45683
yahoo.com        20183
__NA__           18828
hotmail.com       9062
anonymous.com     7347
__OTHER__         6615
aol.com           5724
comcast.net       1542
icloud.com        1287
outlook.com        993
att.net            844
Name: P_emaildomain, dtype: int64

In [16]:
pd.options.display.max_rows = 1000


In [17]:
summarize_categorical(cat2_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,isFraud,count
var,level,Unnamed: 2_level_1,Unnamed: 3_level_1
DeviceType,__NA__,0.021678,89998.0
DeviceType,desktop,0.063273,16990.0
DeviceType,mobile,0.09982,11120.0
M1,F,0.0,3.0
M1,T,0.020404,63763.0
M1,__NA__,0.05217,54342.0
M2,F,0.033908,6842.0
M2,T,0.018779,56924.0
M2,__NA__,0.05217,54342.0
M3,F,0.032129,13446.0


## Try mean-level encoding

In [18]:
from sklearn.model_selection import StratifiedKFold

In [19]:
# encode_mean_level is a project helper (utils.preprocessing). Its two return values are
# unpacked as the encoded frame and a {variable: {level: value}} mapping.
# NOTE(review): in the displayed output the encoded value for one level differs slightly
# between rows, so the encoding is presumably computed out-of-fold -- confirm in the helper.
df, test_mean_level_mapping = encode_mean_level(cat2_df, train_df['isFraud'])
# Put the target next to the encoded predictors so both can be inspected and modelled together.
df2 = pd.concat([train_df['isFraud'],df], axis=1)
df2.head(10)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat(ll1, axis=1).mean(axis=1)


Unnamed: 0,isFraud,DeviceType_ml,M1_ml,M2_ml,M3_ml,M4_ml,M5_ml,M6_ml,M7_ml,M8_ml,...,id_23_ml,id_27_ml,id_28_ml,id_29_ml,id_30_ml,id_34_ml,id_35_ml,id_36_ml,id_37_ml,id_38_ml
2,0,0.021576,0.020399,0.018751,0.017376,0.036848,0.026486,0.023781,0.020901,0.023864,...,0.034626,0.034626,0.021567,0.021567,0.033405,0.033356,0.021553,0.021553,0.021553,0.021553
6,0,0.021576,0.020399,0.018751,0.017376,0.036848,0.026486,0.023781,0.01893,0.015049,...,0.034626,0.034626,0.021567,0.021567,0.033405,0.033356,0.021553,0.021553,0.021553,0.021553
20,0,0.021709,0.020302,0.018742,0.017286,0.037568,0.027055,0.02452,0.020379,0.023281,...,0.03453,0.03453,0.021685,0.021685,0.033639,0.033537,0.021672,0.021672,0.021672,0.021672
24,0,0.021709,0.052271,0.052271,0.052271,0.018723,0.03681,0.02452,0.045406,0.045406,...,0.03453,0.03453,0.021685,0.021685,0.033639,0.033537,0.021672,0.021672,0.021672,0.021672
25,0,0.021743,0.052153,0.052153,0.052153,0.018871,0.036887,0.024596,0.045147,0.045147,...,0.034599,0.034599,0.021721,0.021721,0.033558,0.033445,0.021722,0.021722,0.021722,0.021722
30,0,0.021576,0.052122,0.052122,0.052122,0.019236,0.037277,0.023781,0.045068,0.045068,...,0.034626,0.034626,0.021567,0.021567,0.033405,0.033356,0.021553,0.021553,0.021553,0.021553
34,0,0.021743,0.020401,0.018824,0.017052,0.018871,0.036887,0.017399,0.02054,0.023382,...,0.034599,0.034599,0.021721,0.021721,0.033558,0.033445,0.021722,0.021722,0.021722,0.021722
41,0,0.021674,0.020234,0.018772,0.031582,0.036765,0.039121,0.024589,0.045354,0.045354,...,0.034615,0.034615,0.021651,0.021651,0.033616,0.033564,0.021638,0.021638,0.021638,0.021638
52,0,0.021576,0.020399,0.034172,0.031758,0.036848,0.026486,0.018019,0.045068,0.045068,...,0.034626,0.034626,0.021567,0.021567,0.033405,0.033356,0.021553,0.021553,0.021553,0.021553
57,0,0.100752,0.052122,0.052122,0.052122,0.019236,0.037277,0.069476,0.045068,0.045068,...,0.034626,0.034626,0.050772,0.049943,0.04661,0.038799,0.045665,0.08006,0.080969,0.060114


In [20]:
# Target vector and model inputs.
# NOTE(review): 'card6_ml' is excluded from the inputs without a stated reason -- confirm why.
y = df2['isFraud']
X = df2.drop(columns=['isFraud', 'card6_ml']).copy()

In [21]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows, stratified on the target, with a fixed seed.
train_X, test_X, train_y, test_y = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    shuffle=True,
    random_state=13,
)


In [22]:
from sklearn.ensemble import RandomForestClassifier

In [23]:

# Baseline forest on the mean-level encoded predictors; seeded for repeatable results.
# NOTE(review): no class weighting is configured although the overall fraud rate is about 3.5%.
rf = RandomForestClassifier(
    n_estimators=1000,
    random_state=37,
    n_jobs=-1,
)
rf.fit(train_X, train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=37, verbose=0, warm_start=False)

In [24]:
pred_proba = rf.predict_proba(test_X)

In [25]:
# Score the held-out rows with the project's contest metric helper, using column 1 of predict_proba.
# NOTE(review): column 1 is presumably P(isFraud == 1), which holds when rf.classes_ is [0, 1] -- confirm.
calc_contest_metric(test_y, pred_proba[:,1])

0.724095925701265

In [26]:
df2.columns

Index(['isFraud', 'DeviceType_ml', 'M1_ml', 'M2_ml', 'M3_ml', 'M4_ml', 'M5_ml',
       'M6_ml', 'M7_ml', 'M8_ml', 'M9_ml', 'P_emaildomain_ml', 'ProductCD_ml',
       'R_emaildomain_ml', 'card4_ml', 'card6_ml', 'id_12_ml', 'id_15_ml',
       'id_16_ml', 'id_23_ml', 'id_27_ml', 'id_28_ml', 'id_29_ml', 'id_30_ml',
       'id_34_ml', 'id_35_ml', 'id_36_ml', 'id_37_ml', 'id_38_ml'],
      dtype='object')

In [27]:
list(zip(X.columns,rf.feature_importances_))

[('DeviceType_ml', 0.031333846141666365),
 ('M1_ml', 0.003372764025436983),
 ('M2_ml', 0.013635390525196103),
 ('M3_ml', 0.013654834931222215),
 ('M4_ml', 0.07141379912756071),
 ('M5_ml', 0.01879745471469331),
 ('M6_ml', 0.03686813336194605),
 ('M7_ml', 0.014749527995616834),
 ('M8_ml', 0.01582474096126253),
 ('M9_ml', 0.015665925441246),
 ('P_emaildomain_ml', 0.20734132387156767),
 ('ProductCD_ml', 0.0391975048996522),
 ('R_emaildomain_ml', 0.09762212258410352),
 ('card4_ml', 0.11640429219892627),
 ('id_12_ml', 0.03226413499587967),
 ('id_15_ml', 0.020308031813055714),
 ('id_16_ml', 0.018111751349169453),
 ('id_23_ml', 0.014310938762632019),
 ('id_27_ml', 0.01027652794323194),
 ('id_28_ml', 0.014321048597051975),
 ('id_29_ml', 0.018250822603813843),
 ('id_30_ml', 0.05975410361892401),
 ('id_34_ml', 0.02421743399789362),
 ('id_35_ml', 0.01732078516259026),
 ('id_36_ml', 0.01496886632693325),
 ('id_37_ml', 0.024165622676984872),
 ('id_38_ml', 0.03584827137174323)]

In [28]:
# Pair every model input with the importance reported by the fitted forest.
var_imp = pd.DataFrame(
    {'var': X.columns, 'importance': rf.feature_importances_},
    columns=['var', 'importance'],
)

In [29]:
# Most important predictors first.
var_imp.sort_values(by='importance', ascending=False)

Unnamed: 0,var,importance
10,P_emaildomain_ml,0.207341
13,card4_ml,0.116404
12,R_emaildomain_ml,0.097622
4,M4_ml,0.071414
21,id_30_ml,0.059754
11,ProductCD_ml,0.039198
6,M6_ml,0.036868
26,id_38_ml,0.035848
14,id_12_ml,0.032264
0,DeviceType_ml,0.031334


In [30]:
test_mean_level_mapping

{'DeviceType': {'__NA__': 0.0216782684499747,
  'desktop': 0.0632734547219568,
  'mobile': 0.09982020877530554},
 'M1': {'F': 0.0, 'T': 0.0204036769090933, '__NA__': 0.052169642174424304},
 'M2': {'F': 0.033908109246380866,
  'T': 0.01877941603870687,
  '__NA__': 0.052169642174424304},
 'M3': {'F': 0.032127478880118285,
  'T': 0.0172695036313017,
  '__NA__': 0.052169642174424304},
 'M4': {'M0': 0.0371781945145729,
  'M1': 0.026860830550202018,
  'M2': 0.11075886150944314,
  '__NA__': 0.018878776262239234},
 'M5': {'F': 0.02659500598297742,
  'T': 0.038953378309035135,
  '__NA__': 0.03702908202463019},
 'M6': {'F': 0.024431815067927715,
  'T': 0.017494834902512767,
  '__NA__': 0.06925468427497854},
 'M7': {'F': 0.020615896957300782,
  'T': 0.019475382649189205,
  '__NA__': 0.0452039950335705},
 'M8': {'F': 0.02346349837436928,
  'T': 0.015273108034389563,
  '__NA__': 0.0452039950335705},
 'M9': {'F': 0.03143666534922142,
  'T': 0.018409410525389667,
  '__NA__': 0.0452039950335705},
 'P_