In [1]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pylab as plt
from ipdb import set_trace


# Root directory of the dataset files; '~' is expanded to the current user's home directory.
_data_pth_ = os.path.expanduser('~/ieee-fraud-detection')

### Process nominal (categorical) columns: fill NaN with the mode, then integer-encode

In [2]:
def transform_categorical(dat: pd.DataFrame):
    """Integer-encode every non-numeric column of ``dat`` in place.

    For each column whose dtype is not numeric, missing values are replaced
    by the column's most frequent value, and every distinct value is then
    mapped to an integer code. Numeric columns are left untouched.

    Args:
        dat: Frame to encode. It is modified in place.

    Returns:
        None.
    """
    for c in dat.columns:
        col = dat[c]
        # check categorical: decide from the dtype rather than from whether
        # ``std()`` silently drops the column, which raises on string data in
        # recent pandas versions.
        if pd.api.types.is_numeric_dtype(col):
            continue

        # fill na with the mode category. ``mode()`` returns a Series (several
        # entries on ties, empty when the column is all-NaN), so take its first
        # entry as a scalar; handing the Series/DataFrame itself to ``fillna``
        # aligns on the index and only fills the first row(s).
        modes = col.mode()
        if len(modes):
            col = col.fillna(modes.iloc[0])

        # Sort the categories so the codes are reproducible between runs
        # (iteration order of a set of strings is not).
        categories = sorted(col.dropna().unique(), key=str)
        codes = {value: code for code, value in enumerate(categories)}
        dat[c] = col.map(codes)

### Normalize numerical columns: remove ID, fill NaN, clip outliers, standardize

In [3]:
def normalize(dat: pd.DataFrame):
    """Standardize every numeric column of ``dat`` in place.

    The ``TransactionID`` column is dropped if present. Each remaining numeric
    column then has its NaNs replaced by 0, is clipped to within three
    standard deviations of its mean, and is shifted/scaled with that same
    (pre-clipping) mean and standard deviation. Non-numeric columns are left
    untouched. Columns whose standard deviation is zero or undefined
    (constant or single-row columns) are only centered, so no NaN/inf values
    are produced by a division by zero.

    NOTE(review): every numeric column is standardized, including any target
    or key column other than ``TransactionID`` that the frame may carry --
    confirm callers expect that.

    Args:
        dat: Frame to normalize. It is modified in place.

    Returns:
        None.
    """
    if 'TransactionID' in dat.columns:
        dat.drop(columns=['TransactionID'], inplace=True)
    for c in dat.columns:
        # check numerical: decide from the dtype; calling ``std()`` on a
        # string column raises in recent pandas versions.
        if not pd.api.types.is_numeric_dtype(dat[c]):
            continue

        col = dat[c].fillna(0).astype(float)
        _mean, _std = col.mean(), col.std()

        if np.isfinite(_std) and _std > 0:
            # clip outliers (3 std away)
            col = col.clip(_mean - 3 * _std, _mean + 3 * _std)
            # normalize
            col = (col - _mean) / _std
        else:
            # Constant (or single-row) column: there is no spread to scale by.
            col = col - _mean

        dat[c] = col

### preprocess and save

In [6]:
# Create the output directory (and any missing parents); a no-op when it
# already exists, so there is no check-then-create race.
processed_dir = f'{_data_pth_}/processed'
os.makedirs(processed_dir, exist_ok=True)

# NOTE(review): each file is encoded and standardized on its own, so category
# codes and mean/std are not shared between the train and test files --
# confirm this is intended.
for x in ['train_transaction.csv', 'test_identity.csv', 'test_transaction.csv', 'train_identity.csv']:
    out_pth = f'{processed_dir}/{x}'
    # Skip files that a previous run already produced.
    if os.path.exists(out_pth):
        continue

    mat = pd.read_csv(f'{_data_pth_}/{x}')
    transform_categorical(mat)
    normalize(mat)

    # Write to a temporary name and rename once complete: an interrupted run
    # then never leaves a truncated CSV that the existence check above would
    # mistake for a finished result.
    tmp_pth = f'{out_pth}.tmp'
    mat.to_csv(tmp_pth, index=False)
    os.replace(tmp_pth, out_pth)
    del mat  # release the frame before loading the next file
    


In [None]:
# NOTE(review): leftover interactive post-mortem debugger; consider removing this cell before sharing.
%debug

In [None]:
# Display the resolved dataset root as the cell output.
_data_pth_