# Overview

In this competition, we benchmark machine learning models on a challenging large-scale dataset. The data comes from Vesta's real-world e-commerce transactions and contains a wide range of features, from device type to product features. We also have the opportunity to create new features to improve our results.

See the [README](README.md) for further details.

## Approach

In this notebook, we leverage a custom API, [JLpyUtils](https://pypi.org/project/JLpyUtils/), which the author ([John Leonard](https://www.linkedin.com/in/johntleonard/)) has developed to streamline exploratory data analysis, feature engineering, and model selection tasks. Furthermore, we heavily utilize the [dask](https://dask.org) library, as it is much more efficient at managing large datasets such as those used in this analysis. To further improve our memory resource management, we often call Python's manual garbage collection function ```gc.collect()``` to clear deleted objects out of memory.

# Install Libs

In [None]:
!pip install JLpyUtils==0.2.9
!pip install tensorflow==1.14.0
!pip install tensorflow-gpu==1.14.0
!pip install dask_ml

import IPython.display
IPython.display.clear_output()

# Import Libs

In [None]:
import numpy as np
import pandas as pd
import sys, os, importlib, gc
import matplotlib as mpl
import matplotlib.pyplot as plt
import warnings

import dask
import dask_xgboost
import dask_ml, dask_ml.model_selection

In [None]:
# let pandas show up to 1000 columns when displaying a frame
pd.options.display.max_columns = 1000
# default font size for matplotlib figures
mpl.rcParams['font.size']=14

In [None]:
# development-mode switch read by later cells (e.g. to import the local copy of JLpyUtils)
dev = True

In [None]:
# base directory; the local JLpyUtils checkout is looked up underneath it
# NOTE(review): the trailing '.' in this path looks unintentional — confirm
path_desktop = '/mr3_boltprod_john_t_leonard/Data_Science_Projects.'
if dev:
    print('Running in dev mode. Using local copy of JLpyUtils')
    path_dev_repo = os.path.join(path_desktop,'JLpyUtils')
    # position 0 so the import below searches this directory before any other sys.path entry
    sys.path.insert(0, path_dev_repo)

import JLpyUtils

In [None]:
# show which version of JLpyUtils was actually imported
JLpyUtils.__version__

# Download Raw Data

This competition uses train and test sets that are >100 MB, which is GitHub's standard file-size limit, thus we don't store the dataset directly in the repo. Below, we download the data from the Kaggle URL specified on the competition home page.

In [None]:
# directory the raw competition CSV files are read from
path_raw_data_dir = 'ieee-fraud-detection'

In [None]:
# from kaggle.api.kaggle_api_extended import KaggleApi

# api = KaggleApi()
# api.authenticate()

# files = api.competition_download_files("ieee-fraud-detection", path = path_raw_data_dir)

# import zipfile
# for file in os.listdir(path_raw_data_dir):
#     if 'zip' in file:
#         with zipfile.ZipFile(os.path.join(path_raw_data_dir, file) , 'r') as zip_ref:
#             zip_ref.extractall(path_raw_data_dir)
#         os.remove(os.path.join(path_raw_data_dir, file))

# Load Data

In [None]:
def load_df(path_raw_data_dir,
            train_test_id = 'train'):
    """
    Build the combined train or test dataframe.

    Reads '<train_test_id>_transaction.csv' and '<train_test_id>_identity.csv'
    from path_raw_data_dir with dask, then left-joins the identity data onto
    the transaction data on the 'TransactionID' header.

    Arguments:
    ---------
        path_raw_data_dir: directory containing the raw csv files
        train_test_id: file-name prefix of the set to load. Default: 'train'

    Returns:
    --------
        df: the merged dask dataframe
    """
    import gc
    import dask.dataframe as dd

    def _read(suffix):
        # every raw file is named <train_test_id><suffix> inside the raw data directory
        return dd.read_csv(os.path.join(path_raw_data_dir, train_test_id + suffix))

    transactions = _read('_transaction.csv')
    identities = _read('_identity.csv')

    merged = dd.merge(transactions, identities, how='left', on='TransactionID')

    # drop the local references to the two inputs and run a collection pass before returning
    del transactions, identities
    gc.collect()
    return merged

# load the training set and inspect it: column info, first rows and shape
df = load_df(path_raw_data_dir, train_test_id = 'train')
display(df.info())
display(df.head(), df.shape)

# Define Feature and Label headers

In [None]:
def fetch_headers_dict(frame=None):
    """
    Build the dictionary of column-header groups used throughout the notebook.

    Arguments:
    ---------
        frame: dataframe whose columns are grouped. Default: None, in which case
            the notebook-level `df` is used (the behavior of the zero-argument call).

    Returns:
    --------
        headers_dict: dict with the keys
            'labels': ['isFraud']
            'UID': 'TransactionID'
            'features': every column of the frame except the labels and the UID
            'categorical features': 'ProductCD', followed by every feature whose name
                contains 'card', 'addr', 'emaildomain', 'M' or 'Device', followed by
                'id_12' ... 'id_38'
            'continuous features': the features that are not listed as categorical
    """
    if frame is None:
        # fall back to the notebook-level training frame
        frame = df

    headers_dict = {'labels': ['isFraud'],
                    'UID': 'TransactionID'}

    non_feature_headers = headers_dict['labels'] + [headers_dict['UID']]
    headers_dict['features'] = list(frame.drop(columns=non_feature_headers).columns)

    # The UID is already dropped from the features, so no explicit UID exclusion
    # is needed here (the old `and header != UID` clause only applied to the
    # 'Device' test and could never be False).
    categorical_markers = ('card', 'addr', 'emaildomain', 'M', 'Device')
    name_matched = [header for header in headers_dict['features']
                    if any(marker in header for marker in categorical_markers)]
    id_headers = ['id_' + str(int_) for int_ in range(12, 39)]
    headers_dict['categorical features'] = ['ProductCD'] + name_matched + id_headers

    # set membership instead of scanning the categorical list once per feature
    categorical = set(headers_dict['categorical features'])
    headers_dict['continuous features'] = [feature for feature in headers_dict['features']
                                           if feature not in categorical]
    return headers_dict

headers_dict = fetch_headers_dict()
# print every header group, each preceded by a blank line
for key, headers in headers_dict.items():
    print('\n', key, ':', headers)

# Define X and Y

In [None]:
# split the frame into the feature columns (X) and the label column(s) (y)
X = df[headers_dict['features']]
y = df[headers_dict['labels']]

# Basic Feature Cleaning

In [None]:
def basic_feat_cleaner(X):
    """
    Apply the basic, hand-written cleaning rules to the feature frame.

    Currently a single rule: missing values in the 'M1' column are replaced by 'F'.

    Arguments:
    ---------
        X: dataframe containing an 'M1' column

    Returns:
    --------
        X: a new frame with 'M1' filled; the frame that was passed in is not modified
    """
    import warnings

    # catch_warnings restores whatever filters were active before the call
    # (previously the filters were unconditionally reset to 'default')
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        # assign builds a new frame rather than writing into X, which is
        # typically a column slice of a larger frame
        X = X.assign(M1=X['M1'].fillna('F'))
    return X

# apply the basic cleaning rules to the features
X = basic_feat_cleaner(X)

# Plot Subset of Data

In [None]:
# keep only the first 1000 rows of the features and labels for the plots below
X = X.head(1000)
y = y.head(1000)

In [None]:
# plot the feature columns; presumably bar charts for the categorical headers
# and histograms for the rest — confirm against JLpyUtils.plot.hist_or_bar
JLpyUtils.plot.hist_or_bar(X, categorical_headers= headers_dict['categorical features'])

In [None]:
# peek at the first label rows
y.head()

In [None]:
# plot the label column, treated as categorical, in a single plot column
JLpyUtils.plot.hist_or_bar(y, categorical_headers= ['isFraud'], n_plot_columns=1)

# Feature Engineering

## Load the Training Set

In [None]:
# re-execute JLpyUtils and the submodules used below, parent packages first;
# presumably so that edits to the local dev copy are picked up without restarting the kernel
importlib.reload(JLpyUtils)
importlib.reload(JLpyUtils.file_utils)
importlib.reload(JLpyUtils.ML)
importlib.reload(JLpyUtils.ML.preprocessing)
importlib.reload(JLpyUtils.ML.preprocessing.LabelEncode)
importlib.reload(JLpyUtils.ML.preprocessing.Scale)
importlib.reload(JLpyUtils.ML.preprocessing.Impute)
importlib.reload(JLpyUtils.ML.preprocessing.OneHotEncode)

# reload the training frame from the raw csv files
df = load_df(path_raw_data_dir, train_test_id = 'train')

In [None]:
# slice out a single partition for code development only; when dev is False
# the full training set is kept, matching how the field data is sliced
if dev:
    df = df.partitions[0]

In [None]:
# split the training frame into features (X) and labels (y)
X = df[headers_dict['features']]
y = df[headers_dict['labels']]

X = basic_feat_cleaner(X)

display(X.head())

# the combined frame is no longer needed once X and y exist
del df
gc.collect()
# end the cell on None so the count returned by gc.collect() is not echoed as cell output
None

## Instantiate The Feature Engineering Pipe

The ```JLpyUtils.ML.preprocessing.feat_eng_pipe``` class is designed to streamline & automate running various feature engineering operations. The feature engineering sequence is:
1. LabelEncode.categorical_features
2. Scale.continuous_features
    * for Scaler_ID in Scalers_dict.keys()
3. Impute.categorical_features
    * for Imputer_cat_ID in Imputer_categorical_dict.keys():
        * for Imputer_iter_class_ID in Imputer_categorical_dict[Imputer_cat_ID].keys():
4. Impute.continuous_features
    * for Imputer_cont_ID in Imputer_continuous_dict.keys():
        * for Imputer_iter_reg_ID in Imputer_continuous_dict[Imputer_cont_ID].keys():
5. OneHotEncode
6. CorrCoeffThreshold

For many of these operations, there are various hyperparameters that could be varied to perform similar but different types of feature engineering. The default settings in the class are set up to allow one to ultimately perform model training on data sets that have different types of scaling, imputation, etc. applied. However, for simplicity, and because this dataset is quite large, we will just focus on one feature engineering pipe scenario.

In [None]:
# instantiate the feature engineering pipe, using path_desktop as its report directory
# NOTE(review): overwrite=False presumably reuses previously saved outputs — confirm against JLpyUtils
feat_eng_pipe = JLpyUtils.ML.preprocessing.feat_eng_pipe(path_report_dir = path_desktop, 
                                                              verbose=1, 
                                                              overwrite=False)

## Define Single Feature engineering case to evaluate

We could iterate through all possible feature engineering scenarios, but this dataset is quite large and that would take quite a bit of time, so we will just evaluate one promising scenario.

In [None]:
import sklearn.preprocessing
import sklearn.linear_model

# restrict every stage of the pipe to a single option so only one scenario is evaluated
feat_eng_pipe.Scalers_dict = {'MinMaxScaler': sklearn.preprocessing.MinMaxScaler()}
feat_eng_pipe.Imputer_categorical_dict = {'most_frequent': {None: None}}
# median imputation is used; the iterative BayesianRidge alternative is left commented out
feat_eng_pipe.Imputer_continuous_dict = {'median':{None:None}}#{ 'iterative': {'BayesianRidge': sklearn.linear_model.BayesianRidge()}}
feat_eng_pipe.OneHot_cases = [True]
# NOTE(review): a threshold of 1 presumably drops no feature for correlation — confirm
feat_eng_pipe.AbsCorrCoeff_thresholds = [1]

## Fit Feat Eng Pipe on X

In [None]:
# fit the pipe on the training features, requesting the 'csv' format
feat_eng_pipe.fit(X, headers_dict=headers_dict, format_='csv')

# X is reloaded from disk in the model selection section, so release it here
del X
gc.collect()
# end the cell on None so the count returned by gc.collect() is not echoed as cell output
None

## Transform X_field using Feat Eng Pipe

We call Kaggle's "test" data "field" data, since usually you actually have labels in the "test" data for machine learning problems. Since we don't actually have access to the labels here, it's more like testing our model on field data and getting feedback later on how well or badly it did.

In [None]:
# load Kaggle's "test" files as the field data and keep only the feature columns
df_field = load_df(path_raw_data_dir, train_test_id = 'test')
X_field = df_field[headers_dict['features']]

X_field = basic_feat_cleaner(X_field)

del df_field
gc.collect()
# end the cell on None so the count returned by gc.collect() is not echoed as cell output
None

In [None]:
# in dev mode only the partition at index 1 of the field data is transformed
if dev: #slice out 1 partition of the data for development
    X_field = X_field.partitions[1]

In [None]:
# run the fitted pipe on the field features; the return value is not kept
feat_eng_pipe.transform(X_field)

del X_field
gc.collect()
# end the cell on None so the count returned by gc.collect() is not echoed as cell output
None

# Model Selection

## Load The Necessary Input Data

In [None]:
#for feat_eng_case_dir in feat_eng_pipe.path_feat_eng_dirs:
feat_eng_case_dir = os.path.join(feat_eng_pipe.path_feat_eng_root_dir, 
                                 'LabelEncode/Scaler_ID[MinMaxScaler]',
                                 'Imputer_categorical_ID[most_frequent]',
                                 'Imputer_iterator_classifier_ID[None]',
                                 'Imputer_continuous_ID[median]',
                                 'Imputer_iterator_regressor_ID[None]',
                                 'OneHot_case[True]',
                                 'CorrCoeffThreshold[1]')
feat_eng_case_dir

In [None]:
# replace the in-memory headers_dict with the json one stored in the feature engineering case directory
headers_dict = JLpyUtils.file_utils.load('headers_dict','json',feat_eng_case_dir)
headers_dict.keys()

In [None]:
# load the engineered features saved as csv in the feature engineering case directory
X = JLpyUtils.file_utils.load('X','csv', feat_eng_case_dir, headers=headers_dict['headers after CorrCoeffThreshold'])

# ensure the column headers are formatted appropriately for xgboost:
# '[' -> '(', ']' -> ')' and '<' -> 'less'
columns_reformatted = [col.replace('[','(').replace(']',')').replace('<','less') for col in X.columns]
X.columns = columns_reformatted

display(X.head())

# ensure X & y have a consistent number of partitions
y = y.repartition(npartitions=X.npartitions)
display(y.head())

## Train Test Split

In [None]:
# join features and labels on their index so both are split together below
Xy = X.merge(y, left_index = True, right_index=True)

In [None]:
# random 70% / 30% train/test split; the fixed random_state makes it repeatable
Xy_train, Xy_test = Xy.random_split([0.7, 0.3],
                                random_state=0)

In [None]:
# separate each split back into its feature columns ...
X_train = Xy_train[X.columns]
X_test = Xy_test[X.columns]

# ... and its label column(s)
y_train = Xy_train[y.columns]
y_test = Xy_test[y.columns]

## Class Balance

In [None]:
# to be added at later time

## Train the Models

In [None]:
# dask.distributed is a submodule of dask: import it explicitly instead of
# relying on another package having imported it as a side effect
import dask.distributed

params = {'objective': 'binary:logistic',
          'max_depth': 4, 'eta': 0.01, 'subsample': 0.5,
          'min_child_weight': 0.5}

# Silence warnings only while this cell runs; the previously active warning
# filters are restored on exit, even when training raises.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    # try shutting down a client left over from an earlier run of this cell,
    # to ensure we don't start 2 clients
    try:
        client.close()
    except Exception:
        # best effort: NameError on the first run (no client exists yet), or
        # the old client can no longer be closed cleanly
        pass

    #start client server
    client = dask.distributed.Client()
    display(client)

    # run training and always close the client, whether or not training succeeds;
    # any exception from training propagates unchanged
    try:
        model = dask_xgboost.train(client, params, X_train, y_train, num_boost_round=1)
    finally:
        client.close()

In [None]:
# show the feature scores reported by the trained model
model.get_fscore()

In [None]:
# xgboost is never imported elsewhere in the notebook (only dask_xgboost is),
# so import it here before using its plotting helper
import xgboost

# plot the feature importances of the trained model, without grid lines
ax = xgboost.plot_importance(model)
ax.grid(which='both', visible=False)
plt.show()