In [None]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pylab as plt
from ipdb import set_trace

import utils

### load joined data

In [None]:
# Unpack the four objects returned by the project helper for the "standard" data type.
# X_train is consumed by the covariance analysis further down in this notebook.
X_train, X_test, Y_train, Y_test = utils.load_standard_data(data_type="standard")

# joined dataset

In [None]:
# Output directory for the joined CSVs. makedirs(exist_ok=True) also creates
# missing parent directories and does not fail if the directory already exists,
# which the previous exists()/mkdir() pair could not guarantee.
joined_dir = f'{utils._data_pth_}/joined_dataset/'
os.makedirs(joined_dir, exist_ok=True)

# For each split: left-join the identity table onto the transaction table by
# TransactionID, so every transaction row is kept even without identity data.
for (x1, x2, x3) in [('train_transaction.csv', 'train_identity.csv', 'train_joined.csv'),
                     ('test_transaction.csv', 'test_identity.csv', 'test_joined.csv')]:
    mat1 = pd.read_csv(f'{utils._data_pth_}/{x1}')
    mat2 = pd.read_csv(f'{utils._data_pth_}/{x2}')
    joined_mat = mat1.set_index('TransactionID').join(mat2.set_index('TransactionID'), how='left')
    # index=True writes TransactionID (the index after set_index) back as a column.
    joined_mat.to_csv(f'{utils._data_pth_}/joined_dataset/{x3}', index=True)

# preprocessed dataset

### convert nominal

In [None]:
def transform_categorical(dat: pd.DataFrame):
    """Encode every non-numeric column of ``dat`` as integer codes, in place.

    For each column whose dtype is not numeric:
      1. missing values are replaced by the column's most frequent value
         (the first one in sorted order when several values tie);
      2. each distinct value is mapped to an integer code, assigned in the
         sorted order of ``str(value)`` so the encoding is reproducible
         between runs.

    Numeric columns are left untouched. A non-numeric column that contains
    only missing values has no mode and is left as all-NaN.

    Note: codes are derived from the values present in ``dat`` only, so two
    frames with different category sets will not share an encoding.

    Args:
        dat: frame to modify in place.

    Returns:
        None. ``dat`` is mutated.
    """
    for c in dat.columns:
        col = dat[c]
        # Explicit dtype test: calling .std() on a non-numeric column to detect
        # "categorical" raises on recent pandas instead of returning empty.
        if pd.api.types.is_numeric_dtype(col):
            continue
        modes = col.mode(dropna=True)
        if len(modes):
            # Fill with the scalar mode. Passing the mode *frame* to fillna
            # aligns on the index and would only fill row 0.
            col = col.fillna(modes.iloc[0])
        # Sorted (by string form) rather than set() order -> deterministic codes.
        categories = sorted(col.dropna().unique(), key=str)
        codes = {value: code for code, value in enumerate(categories)}
        dat[c] = col.map(codes)

### normalize numerical: fill nan, clip outliers (3 std), standardize

In [None]:
def normalize(dat: pd.DataFrame):
    """Standardize every numeric column of ``dat`` in place.

    For each numeric column, in this order:
      1. missing values are replaced by 0;
      2. mean and (sample) standard deviation are computed on the filled data;
      3. values are clipped to ``mean +/- 3 * std``;
      4. the clipped values are shifted by the mean and divided by the std
         (both taken from step 2, i.e. before clipping).

    Columns with zero or undefined standard deviation (constant columns, or a
    single-row frame) are set to 0.0 instead of being divided by zero.
    Non-numeric columns are left untouched.

    Args:
        dat: frame to modify in place.

    Returns:
        None. ``dat`` is mutated.
    """
    for c in dat.columns:
        # Explicit dtype test instead of probing with .std(), which raises on
        # non-numeric columns in recent pandas.
        if not pd.api.types.is_numeric_dtype(dat[c]):
            continue
        col = dat[c].astype(float).fillna(0)
        _mean, _std = col.mean(), col.std()
        if not np.isfinite(_std) or _std == 0:
            # No spread: every value equals the mean, so the centred value is 0.
            # Dividing by _std here would turn the whole column into NaN.
            dat[c] = 0.0
            continue
        # clip outliers (3 std away)
        lo, hi = _mean - 3 * _std, _mean + 3 * _std
        col = col.clip(lo, hi)
        # normalize
        dat[c] = (col - _mean) / _std

### preprocess and save

In [None]:
# makedirs(exist_ok=True) creates missing parents and tolerates an existing directory.
os.makedirs(f'{utils._data_pth_}/processed/', exist_ok=True)

for x in ['train_joined.csv', 'test_joined.csv']:
    mat = pd.read_csv(f'{utils._data_pth_}/joined_dataset/{x}')
    # Pull the ID / label columns out so they are not encoded or standardized.
    # They are collected per file: a file that lacks one of them (presumably the
    # test split has no isFraud - confirm) must not inherit the column from the
    # previously processed file, which would also be index-misaligned.
    kept = [mat.pop(name) for name in ('TransactionID', 'isFraud') if name in mat.columns]
    transform_categorical(mat)
    normalize(mat)
    # Re-attach whatever was set aside, in the original ID-then-label order.
    mat = pd.concat((*kept, mat), axis=1)
    mat.to_csv(f'{utils._data_pth_}/processed/{x}', index=False)
    # Free the frame before loading the next file.
    del mat

# missing value dataset

In [None]:
def cnt_missing(df):
    """Print how many cells of ``df`` equal 0.0 and what fraction of all cells that is.

    A cell is counted when it compares equal to ``0.0``; the fraction is the
    count divided by ``rows * columns``. Nothing is returned.
    """
    # Per-column zero counts, summed over all columns.
    zero_cells = sum((df[name] == 0.0).sum() for name in df.columns)
    n_rows, n_cols = df.shape[0], df.shape[1]
    print(zero_cells)
    print("Missing value percent before modification: ", zero_cells / (n_rows * n_cols))
        
# Single seeded generator for all files so the masked datasets are reproducible.
# Assumes utils._random_seed_ is an integer seed - confirm.
_mask_rng = np.random.default_rng(utils._random_seed_)

for (x1,x2) in [('train_joined.csv','train_joined_missing_value.csv'), ('test_joined.csv','test_joined_missing_value.csv')]:
    mat = pd.read_csv(f'{utils._data_pth_}/processed/{x1}')
    # Remove ID / label columns so they are neither counted nor masked.
    # NOTE(review): these columns are not written back to the output file
    # (unlike the redundant-value dataset) - confirm downstream loaders expect that.
    mat = mat.drop(columns=[name for name in ('TransactionID', 'isFraud') if name in mat.columns])

    # Report the zero-cell share before masking.
    cnt_missing(mat)
    # modify the data: each cell is independently replaced by 0.0 with probability 0.3
    mask = _mask_rng.random(mat.shape) < 0.3
    mat = mat.mask(mask, other=0.0)
    mat.to_csv(f'{utils._data_pth_}/processed/{x2}', index=False)

### test missing value percent

In [None]:
# Re-read each masked file and print its zero-cell count and share.
csv_pairs = [('train_joined.csv','train_joined_missing_value.csv'), ('test_joined.csv','test_joined_missing_value.csv')]
for source_csv, masked_csv in csv_pairs:
    masked_df = pd.read_csv(f'{utils._data_pth_}/processed/{masked_csv}')
    cnt_missing(masked_df)
    # Drop the frame before loading the next file.
    del masked_df

### Covariance analysis 

In [None]:
from sklearn.covariance import empirical_covariance as emp_cov

# Empirical covariance of X_train, computed with sklearn's empirical_covariance
# (imported above under the alias emp_cov).
cov_mat = emp_cov(X_train)

In [None]:
from matplotlib import pyplot as plt

# Wrap the covariance result in a DataFrame so per-column quantiles can be taken.
cov_mat = pd.DataFrame(cov_mat)


def view_cov(cov_mat, q=0.75):
    """Show a histogram of the q-th quantile of each column of ``cov_mat``.

    One quantile value is computed per column; the histogram uses as many bins
    as there are columns. The figure is displayed and nothing is returned.
    """
    per_column_quantile = cov_mat.quantile(q=q, axis=0)
    values_desc = sorted(per_column_quantile, reverse=True)
    fig, ax = plt.subplots()
    ax.hist(values_desc, bins=len(values_desc))
    ax.set_xlabel(f"{q*100} percentile covariance", fontsize="xx-large")
    ax.set_ylabel("count", fontsize="xx-large")
    plt.show()


# Upper- and lower-quartile views of the same matrix.
for quantile_level in (0.75, 0.25):
    view_cov(cov_mat, q=quantile_level)

### PCA outlook

# redundant value dataset

In [None]:
from numpy.random import random, randint
from sklearn.decomposition import PCA

# Release the standard split before loading the full processed CSV.
# Each name is removed independently: with a single try/except around four
# `del` statements, the first missing name aborted the remaining deletions.
for _split_name in ('X_train', 'Y_train', 'X_test', 'Y_test'):
    globals().pop(_split_name, None)

# Fixed random_state so any randomized solver PCA selects gives repeatable components.
# Assumes utils._random_seed_ is an integer seed - confirm.
pca = PCA(n_components=10, random_state=utils._random_seed_)

def add_augment(df, pcs, dup=0, rng=None):
    """Insert noisy linear copies of each column of ``pcs`` into ``df``, in place.

    For every column ``i`` of ``pcs`` a new column named ``f"{dup}_PC{i}"`` is
    inserted at a random position. Its values are the original column scaled by
    a random factor in [0.5, 1.5) and shifted by a random offset in
    [-0.5, 0.5) times the column's standard deviation.

    Args:
        df: frame that receives the new columns (mutated in place).
        pcs: 2-D array indexed as ``pcs[:, i]``; its row count must match ``df``.
        dup: tag used as the prefix of the new column names.
        rng: optional ``numpy.random.Generator`` for reproducible output. When
            omitted, the module-level ``random`` / ``randint`` imported from
            ``numpy.random`` are used, as before.

    Returns:
        None. ``df`` is mutated.
    """
    for i in range(pcs.shape[1]):
        col = pcs[:, i]
        std = col.std()
        # Draw order (scale, shift, position) is kept fixed so a seeded run is stable.
        if rng is None:
            scale = 0.5 + random()
            shift = random() - 0.5
            # Upper bound is exclusive: ncols + 1 allows appending after the last
            # column and keeps the call valid when df has no columns yet.
            position = randint(0, df.shape[1] + 1)
        else:
            scale = 0.5 + rng.random()
            shift = rng.random() - 0.5
            position = int(rng.integers(0, df.shape[1] + 1))
        col = col * scale + std * shift
        df.insert(position, f"{dup}_PC{i}", col)

for (x1,x2) in [('train_joined.csv','train_joined_redundant_value.csv')]:
    mat = pd.read_csv(f'{utils._data_pth_}/processed/{x1}')

    # Set the ID / label columns aside so they take no part in the PCA.
    # Collected per file, so a missing column cannot fall back to a stale
    # variable left behind by an earlier cell.
    kept = [mat.pop(name) for name in ('TransactionID', 'isFraud') if name in mat.columns]

    # Seed numpy's global RNG, which add_augment draws from by default, so the
    # generated columns and their positions are reproducible.
    # Assumes utils._random_seed_ is an integer seed - confirm.
    np.random.seed(utils._random_seed_)

    # Project the features onto the principal components once, then add 15
    # independently perturbed copies of every component as redundant features.
    pcs = pca.fit_transform(mat.values)
    for i in range(15):
        add_augment(mat, pcs, dup=i)
    mat = pd.concat((*kept, mat), axis=1)
    mat.to_csv(f'{utils._data_pth_}/processed/{x2}', index=False)

    

# undersample dataset

In [None]:
for (x1,x2) in [('train_joined.csv','train_joined_undersampled_value.csv')]:
    mat = pd.read_csv(f'{utils._data_pth_}/processed/{x1}')

    # Split by label and draw as many negatives as there are positives.
    pos, neg = mat[mat.isFraud == 1], mat[mat.isFraud == 0]
    neg = neg.sample(n=len(pos), random_state=utils._random_seed_)
    under_mat = pd.concat((pos, neg), axis=0)
    # Shuffle rows so the classes are interleaved; seeded like the draw above so
    # the written file is reproducible.
    under_mat = under_mat.sample(frac=1, random_state=utils._random_seed_)
    # (The interactive debugger breakpoint that used to sit here was removed so
    # the notebook can run unattended.)
    under_mat.to_csv(f'{utils._data_pth_}/processed/{x2}', index=False)
