In [None]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pylab as plt
# from ipdb import set_trace

import utils

### Join two datasets

In [None]:
# Left-join each identity table onto its transaction table by TransactionID and
# cache the result under <data>/joined_dataset/. Files that already exist are
# not rebuilt.
# makedirs(exist_ok=True) creates missing parent directories and does not fail
# when the directory is already there.
os.makedirs(f'{utils._data_pth_}/joined_dataset/', exist_ok=True)

for (transaction_csv, identity_csv, joined_csv) in [
        ('train_transaction.csv', 'train_identity.csv', 'train_joined.csv'),
        ('test_transaction.csv', 'test_identity.csv', 'test_joined.csv')]:
    joined_pth = f'{utils._data_pth_}/joined_dataset/{joined_csv}'
    if os.path.exists(joined_pth):
        continue  # already built on a previous run

    transactions = pd.read_csv(f'{utils._data_pth_}/{transaction_csv}')
    identities = pd.read_csv(f'{utils._data_pth_}/{identity_csv}')
    # how='left' keeps every transaction row; transactions without an identity
    # record get NaN in the identity columns.
    joined_mat = transactions.set_index('TransactionID').join(
        identities.set_index('TransactionID'), how='left')

    # Write to a temporary name and rename afterwards, so an interrupted run
    # never leaves a truncated CSV that the exists-check above would accept.
    tmp_pth = f'{joined_pth}.tmp'
    joined_mat.to_csv(tmp_pth, index=True)
    os.replace(tmp_pth, joined_pth)

### Convert nominal (categorical) features to integer codes

In [None]:
def transform_categorical(dat: pd.DataFrame):
    """Integer-encode every non-numeric column of ``dat`` in place.

    For each non-numeric column, missing values are replaced with the column's
    most frequent value, and each distinct value is then mapped to an integer
    code ``0..k-1``. Codes follow the sorted order of the values (compared by
    their string form), so the mapping is reproducible from run to run.
    Numeric columns are left untouched.

    Note: the mapping is derived from the frame that is passed in, so two
    frames with different sets of categories can receive different codes for
    the same value.

    Args:
        dat: Frame to encode; modified in place.

    Returns:
        None.
    """
    for c in dat.columns:
        col = dat[c]
        # Explicit dtype check instead of probing ``std()``, which raises on
        # non-numeric data in recent pandas versions.
        if pd.api.types.is_numeric_dtype(col):
            continue

        modes = col.mode(dropna=True)
        if modes.empty:
            # Column holds nothing but missing values: a single category.
            dat[c] = 0
            continue

        # Fill with the scalar mode. Passing the DataFrame/Series returned by
        # ``mode()`` would align on the index and only fill matching rows.
        filled = col.fillna(modes.iloc[0])

        # Sort (by string form, so mixed types are comparable) to get a
        # deterministic value -> code mapping; set iteration order is not.
        categories = sorted(filled.unique(), key=str)
        codes = {value: code for code, value in enumerate(categories)}
        dat[c] = filled.map(codes).astype('int64')

### Normalize numerical features: fill NaN, clip outliers, standardize

In [None]:
def normalize(dat: pd.DataFrame, n_std: float = 3.0):
    """Standardize every numeric column of ``dat`` in place.

    For each numeric column:
      1. missing values are replaced with 0;
      2. mean and standard deviation are computed on the filled values;
      3. values are clipped to ``mean +/- n_std * std``;
      4. the clipped values are shifted and scaled with the mean and std from
         step 2 (they are not recomputed after clipping).

    Columns with zero (or undefined) standard deviation are set to 0.0 rather
    than being divided by zero. Non-numeric columns are left untouched.

    Args:
        dat: Frame to normalize; modified in place.
        n_std: Half-width of the clipping window, in standard deviations.

    Returns:
        None.
    """
    for c in dat.columns:
        # Explicit dtype check instead of probing ``std()``, which raises on
        # non-numeric data in recent pandas versions.
        if not pd.api.types.is_numeric_dtype(dat[c]):
            continue

        values = dat[c].astype('float64').fillna(0.0)
        mean, std = values.mean(), values.std()

        if pd.isna(std) or std == 0:
            # Constant (or single-row/empty) column: every value equals the
            # mean, so the centered result is 0; dividing would yield NaN.
            dat[c] = 0.0
            continue

        # clip outliers (n_std standard deviations away from the mean)
        clipped = values.clip(mean - n_std * std, mean + n_std * std)
        # standardize
        dat[c] = (clipped - mean) / std

### preprocess and save

In [None]:
# Encode and normalize each joined dataset and cache the result under
# <data>/processed/. Files that already exist are not rebuilt.
# makedirs(exist_ok=True) creates missing parent directories and does not fail
# when the directory is already there.
os.makedirs(f'{utils._data_pth_}/processed/', exist_ok=True)

for x in ['train_joined.csv', 'test_joined.csv']:
    processed_pth = f'{utils._data_pth_}/processed/{x}'
    if os.path.exists(processed_pth):
        continue  # already built on a previous run

    mat = pd.read_csv(f'{utils._data_pth_}/joined_dataset/{x}')

    # ID and label columns bypass encoding/scaling and are re-attached in
    # front of the features afterwards. Only columns actually present in THIS
    # file are collected, so a file without 'isFraud' can neither pick up the
    # labels of the previously processed file nor hit an undefined name.
    passthrough_cols = [c for c in ('TransactionID', 'isFraud') if c in mat.columns]
    passthrough = mat[passthrough_cols]
    features = mat.drop(columns=passthrough_cols)

    # NOTE(review): encoding runs first, so the integer category codes are
    # numeric by the time normalize() sees them and are standardized as well.
    # Confirm that is intended.
    transform_categorical(features)
    normalize(features)
    mat = pd.concat((passthrough, features), axis=1)

    # Write to a temporary name and rename afterwards, so an interrupted run
    # never leaves a truncated CSV that the exists-check above would accept.
    tmp_pth = f'{processed_pth}.tmp'
    mat.to_csv(tmp_pth, index=False)
    os.replace(tmp_pth, processed_pth)
    # Release the large frames before loading the next file.
    del mat, features, passthrough

In [None]:
# Load the raw training transaction table (not the joined or processed copy)
# so it can be inspected in the next cell.
mat = pd.read_csv(f'{utils._data_pth_}/train_transaction.csv')

In [None]:
# Print a concise summary of the frame loaded above.
mat.info()