### Clean the data
The dataset originally came as two dataframes: a transaction table and an identity table. The transaction table describes each transaction, including its time, place, card information, and the product/service involved. <br>
The identity table contains network and digital information associated with the transactions.<br>
The two tables are merged on the unique key 'TransactionID', and the merged data is used for training and testing.<br>
#### Procedures
1. Import and split the dataset, check data types
2. Impute missing *numeric* values<br>
    method 1. Impute missing *numeric* values with median<br>
    method 2. Impute missing *numeric* values with Bayesian Ridge Regression from sklearn.linear_model package
3. Impute missing *categorical* values with modes.
4. Merge the imputed numeric and categorical columns and convert categorical variables into dummies

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from ipynb.fs.full.helper_functions import * # Custom function to create dummy variables

# Raise the pandas display limits to 500 rows and 500 columns.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

# Make better use of Jupyter Notebook cell width
# `IPython.display` is the public import location for `display` and `HTML`;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

#### 1. Import the data, split it into training, validation, and test sets, and check data types

In [2]:
"""

train_identity = pd.read_csv('ieee-fraud-detection/train_identity.csv')
train_transaction = pd.read_csv('ieee-fraud-detection/train_transaction.csv')

df = train_transaction.merge(train_identity, on='TransactionID', how='left')
X, y = df.drop('isFraud', axis=1), df.isFraud

X.loc[:, 'TransactionID'] = X.loc[:, 'TransactionID'].astype('object')
X_train.loc[:, 'card1':'addr2'] = X_train.loc[:, 'card1':'addr2'].astype('object')
X.loc[:, 'id_12':'id_38'] = X.loc[:, 'id_12':'id_38'].astype('object')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, stratify = y, random_state = 1)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size = 0.5, stratify = y_test, random_state = 2)

# pd.to_pickle(X_train, 'data/X_train.pkl')
# pd.to_pickle(X_val, 'data/X_val.pkl')
# pd.to_pickle(X_test, 'data/X_test.pkl')
# pd.to_pickle(y_train, 'data/y_train.pkl')
# pd.to_pickle(y_val, 'data/y_val.pkl')
# pd.to_pickle(y_test, 'data/y_test.pkl')

print(f"Num of train set = {len(X_train)}")
print(f"Num of test set(including validation) = {len(X_test)}")

""";

From the second run of this notebook onward, read the splits from the saved pickle files.

In [3]:
# Load the six train / validation / test splits from their pickle files under data/.
# NOTE: pickle files can execute code on load; only read files you produced yourself.
X_train = pd.read_pickle('data/X_train.pkl')
X_val = pd.read_pickle('data/X_val.pkl')
X_test = pd.read_pickle('data/X_test.pkl')
y_train = pd.read_pickle('data/y_train.pkl')
y_val = pd.read_pickle('data/y_val.pkl')
y_test = pd.read_pickle('data/y_test.pkl')

#### 2. Impute missing numeric values on training and validation sets

In [4]:
# Keep every column whose dtype is not 'object'; these are handled as numeric features.
X_train_numeric, X_val_numeric = (
    frame.loc[:, frame.dtypes != 'object'] for frame in (X_train, X_val)
)

1-1) Impute numeric values with median for training and validation set

In [5]:
# Per-column medians are computed on the training set only and then reused to fill
# the validation set, so validation statistics never enter the imputation values.
imputer_median = X_train_numeric.median()
imp_num_median_train, imp_num_median_val = (
    frame.fillna(imputer_median) for frame in (X_train_numeric, X_val_numeric)
)

1-2) Or impute numeric values with Bayesian ridge regression (takes about 11 hours to fit the model and about 1 hour to transform on a machine with a 2.5 GHz Quad-Core Intel Core i7 and 16 GB of 1600 MHz DDR3 memory)

In [6]:
"""
imputer_bayesian = IterativeImputer(BayesianRidge())

%time imputer_bayesian.fit(X_train_numeric)
print(f"Imputer has been fit.")
pd.to_pickle(imputer_bayesian, 'imputers/imputer_bayesian_train.pkl')

%time imp_num_bayesian_train = imputer_bayesian.transform(X_train_numeric)
imp_num_bayesian_train = pd.DataFrame(imp_num_bayesian_train, columns = X_train_numeric.columns)

X_val_numeric = X_val.loc[:, X_val.dtypes != 'object']
imp_num_bayesian_val = pd.DataFrame(imputer_bayesian.transform(X_val_numeric), columns = X_val_numeric.columns)

""";

#### 3. Impute missing categorical values on training and validation sets

In [7]:
# Keep only the object-dtype columns; these are handled as categorical features.
X_train_cat, X_val_cat = (
    frame.loc[:, frame.dtypes == 'object'] for frame in (X_train, X_val)
)

In [8]:
# DataFrame.mode() returns one row per tied mode; the first row gives one mode per
# column. Modes come from the training set only and are reused for validation.
imputer_mode = X_train_cat.mode(dropna=True).iloc[0]
imp_cat_mode_train, imp_cat_mode_val = (
    frame.fillna(imputer_mode).astype('object') for frame in (X_train_cat, X_val_cat)
)

#### 4. Merge the numeric columns and categorical columns and convert the categorical columns into indicator variables using a custom function

In [9]:
# Re-assemble each split column-wise: imputed numeric columns first, then imputed
# categorical columns. Both halves get a fresh 0..n-1 index so rows align by position.
merged_train = pd.concat(
    [
        imp_num_median_train.reset_index(drop=True),
        imp_cat_mode_train.reset_index(drop=True),
    ],
    axis=1,
)
merged_val = pd.concat(
    [
        imp_num_median_val.reset_index(drop=True),
        imp_cat_mode_val.reset_index(drop=True),
    ],
    axis=1,
)

# create_dummies comes from helper_functions (not shown here). Its second argument is
# the list of non-object columns -- presumably the columns to leave un-encoded; TODO confirm.
imp_train = create_dummies(
    merged_train,
    merged_train.loc[:, merged_train.dtypes != 'object'].columns,
)
imp_val = create_dummies(
    merged_val,
    merged_val.loc[:, merged_val.dtypes != 'object'].columns,
)

#### 5. Save the training set and validation set for modeling

In [10]:
# Persist the processed training and validation frames for the modeling notebooks.
for processed_frame, pickle_path in (
    (imp_train, 'data/imp_train.pkl'),
    (imp_val, 'data/imp_val.pkl'),
):
    pd.to_pickle(processed_frame, pickle_path)

#### *Normally, we would combine the training set and validation set before evaluating on the test set, but due to the calibration and XGBoost modeling process, we impute the test data using only the training set's medians and modes.*

In [11]:
# Split the test features by dtype: non-object columns are numeric, object columns categorical.
X_test_numeric = X_test.loc[:, X_test.dtypes != 'object']
X_test_cat = X_test.loc[:, X_test.dtypes == 'object']

# Fill missing values with the values held in imputer_median / imputer_mode.
imp_num_median_test = X_test_numeric.fillna(imputer_median)
imp_cat_mode_test = X_test_cat.fillna(imputer_mode).astype('object')

# Put numeric and categorical columns back side by side on a fresh positional index.
merged_test = pd.concat(
    [
        imp_num_median_test.reset_index(drop=True),
        imp_cat_mode_test.reset_index(drop=True),
    ],
    axis=1,
)

# Encode with the project helper, passing the non-object columns as its second argument,
# then store the result.
imp_test = create_dummies(
    merged_test,
    merged_test.loc[:, merged_test.dtypes != 'object'].columns,
)
pd.to_pickle(imp_test, 'data/imp_test.pkl')

In [12]:
# Disabled cell: the code below is wrapped in a string literal so it does not execute.
# Repeat the same process (steps 2 to 5) with the combined training + validation set and the test set.
# Combine the training set and validation set and compute the medians and the modes on them.
# Then, apply the same imputation values to the test set.
"""

X_train_val = pd.concat([X_train.reset_index(drop=True), X_val.reset_index(drop=True)], axis = 0).reset_index(drop=True)

X_train_val_numeric = X_train_val.loc[:, X_train_val.dtypes != 'object']
X_test_numeric = X_test.loc[:, X_test.dtypes != 'object']

imputer_median = X_train_val_numeric.median()
imp_num_median_train_val = X_train_val_numeric.fillna(imputer_median)
imp_num_median_test = X_test_numeric.fillna(imputer_median)

X_train_val_cat = X_train_val.loc[:, X_train_val.dtypes == 'object']
X_test_cat = X_test.loc[:, X_test.dtypes == 'object']

imputer_mode = X_train_val_cat.mode(dropna=True).iloc[0,:]
imp_cat_mode_train_val = X_train_val_cat.fillna(imputer_mode).astype('object')
imp_cat_mode_test = X_test_cat.fillna(imputer_mode).astype('object')

merged_train_val = pd.concat([imp_num_median_train_val.reset_index(drop=True), imp_cat_mode_train_val.reset_index(drop=True)], axis = 1)
merged_test = pd.concat([imp_num_median_test.reset_index(drop=True), imp_cat_mode_test.reset_index(drop=True)], axis = 1)

imp_train_val = create_dummies(merged_train_val, merged_train_val.loc[:, merged_train_val.dtypes != 'object'].columns)
imp_test = create_dummies(merged_test, merged_test.loc[:, merged_test.dtypes != 'object'].columns)

# pd.to_pickle(imp_train_val, 'data/imputation_train+val/imp_train_val.pkl')
# pd.to_pickle(imp_test, 'data/imputation_train+val/imp_test.pkl')

""";