# Introduction

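This notebook loads the fraud-detection workbook, splits the data in sheet 1 (the `Data` sheet) into a training and a validation set, and standardizes the feature columns of every dataset (training, validation, `KnownFraud` and `Test`) with a scaler fitted on the training set. The scaled features are saved as Parquet files.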

# Setup

Navigate to the root folder:

In [1]:
# Move the working directory up one level so the relative "./data/..." paths
# used in later cells resolve from there.
# NOTE(review): not idempotent — re-running this cell without restarting the
# kernel moves up one more directory each time.
%cd ..

/tf


# Libraries

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Data

In [3]:
data_file = "./data/FraudDetection_Dataset.xlsx"

# One frame per workbook sheet, read in this order: "Data" (split into
# train/validation further down), "KnownFraud" and "Test".
df_unknown, df_fraud, df_test = (
    pd.read_excel(data_file, sheet_name=sheet)
    for sheet in ("Data", "KnownFraud", "Test")
)

In [4]:
# Preview the first five rows read from the "Data" sheet.
df_unknown.head(n=5)

Unnamed: 0,CustomerID,Behavior1,Behavior2,Behavior3,Behavior4,Behavior5,Behavior6,Behavior7,Behavior8,Behavior9,Behavior10,Behavior11
0,1,6.653114,13.689702,17.552845,-11.933497,-5.756409,15.435921,-12.136752,8.224123,-4.884024,5.349335,17.351372
1,2,-5.630542,-18.937667,18.644624,-6.155825,-25.988864,15.409553,-16.595516,9.454654,-2.877173,-6.00942,19.210636
2,3,9.721525,-9.982489,15.571434,-32.224266,-5.595799,24.321383,0.832774,10.794555,0.727575,10.392975,7.085995
3,4,-4.635766,-10.71604,20.356571,-9.002097,-4.920569,-4.261958,-1.305829,0.94998,-8.634107,0.198445,0.006771
4,5,-12.841044,-20.388313,9.491936,-7.0385,-9.921087,-3.716105,-9.38473,-2.325534,-13.518562,11.007029,14.00321


In [5]:
# Preview the first five rows read from the "KnownFraud" sheet.
df_fraud.head(n=5)

Unnamed: 0,CustomerID,Behavior1,Behavior2,Behavior3,Behavior4,Behavior5,Behavior6,Behavior7,Behavior8,Behavior9,Behavior10,Behavior11,Marker
0,1001,18.271283,-12.359268,5.00097,1.611472,1.088592,29.31441,-8.313531,36.438338,-9.476679,-25.9282,7.108628,FRAUD
1,1002,12.953176,-10.350333,19.857858,-24.424608,3.266105,30.44529,-11.125842,12.815629,9.351355,-6.11362,17.62738,FRAUD
2,1003,16.564435,15.684168,17.543187,-12.700243,0.537667,9.147008,-8.376931,20.664013,6.553167,35.36919,-14.3517,FRAUD
3,1004,6.717709,-31.671866,14.436564,-19.513431,-9.096539,-12.313,-19.86087,26.672534,9.316382,-9.75309,-0.32457,FRAUD
4,1005,28.0862,-12.64082,23.794977,-0.02583,-5.80931,-0.60488,-3.439889,6.395132,-5.536944,13.93349,4.453282,FRAUD


In [6]:
# Preview the first five rows read from the "Test" sheet.
df_test.head(n=5)

Unnamed: 0,CustomerID,Behavior1,Behavior2,Behavior3,Behavior4,Behavior5,Behavior6,Behavior7,Behavior8,Behavior9,Behavior10,Behavior11,Marker
0,2001,-3.19307,-19.705821,6.528556,-21.570858,3.050556,23.477503,-1.13342,12.688409,-10.968301,-1.198666,1.908441,
1,2002,-5.256678,-12.91872,25.435141,-7.231568,3.647838,12.36655,-6.512076,8.750902,-0.768755,-3.261094,13.729474,
2,2003,3.763819,-5.728187,11.400169,-17.555297,-6.246576,16.155969,-9.724758,16.996616,-5.168654,0.187994,11.401234,
3,2004,-8.701132,-5.139959,21.4042,-18.08427,-9.109825,6.903168,-3.486003,0.273855,-7.548916,9.057031,2.890973,
4,2005,2.278052,-11.677949,21.587127,-24.207612,0.506218,-5.678402,14.149213,-3.416211,-1.982982,16.541657,4.778478,


# Split

Split `df_unknown` into train and validation set:

In [7]:
# Hold out 20% of the "Data" rows as the validation set; the fixed
# random_state keeps the split the same from run to run.
df_train, df_valid = train_test_split(
    df_unknown,
    test_size=0.2,
    random_state=123,
)

# Preprocessing

Define the feature columns:

In [8]:
# Column names of the model inputs: "Behavior1" through "Behavior11".
feature_cols = ["Behavior{}".format(number) for number in range(1, 12)]
feature_cols

['Behavior1',
 'Behavior2',
 'Behavior3',
 'Behavior4',
 'Behavior5',
 'Behavior6',
 'Behavior7',
 'Behavior8',
 'Behavior9',
 'Behavior10',
 'Behavior11']

Standardize the feature columns of all datasets, using a scaler fitted on the training set only:

In [9]:
# Fit the scaler on the training split only; the other frames are
# transformed with these same fitted statistics in the next cell.
scaler = StandardScaler().fit(df_train[feature_cols])
scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

In [10]:
# Apply the already-fitted scaler to the feature columns of every dataset,
# in the order train, validation, test, known fraud.
train_feat, valid_feat, test_feat, fraud_feat = (
    scaler.transform(frame[feature_cols])
    for frame in (df_train, df_valid, df_test, df_fraud)
)

In [11]:
# Wrap the scaled feature matrices back into labelled frames.
# The regular DataFrame constructor is used instead of DataFrame.from_records:
# from_records targets record-like input (structured arrays, tuples, dicts) and
# walks a plain 2-D array row by row, while the constructor takes the array
# directly. The resulting frames are the same: the feature columns only, with
# a fresh 0..n-1 index.
# NOTE(review): assumes scaler.transform returned plain 2-D NumPy arrays
# (the scikit-learn default) — confirm if an output container is configured.
df_train_feat = pd.DataFrame(train_feat, columns=feature_cols)
df_valid_feat = pd.DataFrame(valid_feat, columns=feature_cols)
df_test_feat  = pd.DataFrame(test_feat, columns=feature_cols)
df_fraud_feat = pd.DataFrame(fraud_feat, columns=feature_cols)

Add the customer id to `df_test_feat`:

In [12]:
# Re-attach the identifier column from the raw test sheet. The assignment
# aligns on the index; both frames show a default 0..n-1 index, so rows pair up.
customer_ids = df_test['CustomerID']
df_test_feat['CustomerID'] = customer_ids
df_test_feat.head(n=5)

Unnamed: 0,Behavior1,Behavior2,Behavior3,Behavior4,Behavior5,Behavior6,Behavior7,Behavior8,Behavior9,Behavior10,Behavior11,CustomerID
0,-1.04955,-1.397937,-0.989136,-1.213002,1.343242,1.406939,0.651082,0.461592,-0.868932,-0.41005,-0.92228,2001
1,-1.312116,-0.458093,1.524306,0.355186,1.417673,0.224037,-0.066818,0.042204,0.980246,-0.657391,0.740002,2002
2,-0.164379,0.537615,-0.341504,-0.773848,0.184664,0.627469,-0.495621,0.920462,0.182544,-0.243752,0.412604,2003
3,-1.750377,0.61907,0.988432,-0.831698,-0.172145,-0.35761,0.337078,-0.860693,-0.248997,0.819886,-0.784115,2004
4,-0.353423,-0.286278,1.012751,-1.501365,1.026175,-1.697078,2.690885,-1.253725,0.760107,1.717496,-0.518693,2005


Save the results:

In [13]:
# Write each scaled frame to its own Parquet file under ./data,
# in the order train, validation, test, known fraud.
parquet_targets = (
    (df_train_feat, './data/train_feat.parquet'),
    (df_valid_feat, './data/valid_feat.parquet'),
    (df_test_feat, './data/test_feat.parquet'),
    (df_fraud_feat, './data/fraud_feat.parquet'),
)
for frame, target in parquet_targets:
    frame.to_parquet(target)