# 4. Preprocessing and Training Data Development

## 4.1 Imports

In [56]:
# Import numpy, pandas, matplotlib.pyplot, and seaborn and os.
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import os

## 4.2 Load previously processed data

In [57]:
# Intermediate data was saved as '../data/baf_after_eda.csv.gz' previously.
baf_data = pd.read_csv('../data/baf_after_eda.csv.gz', compression='gzip')

In [58]:
# Validate data types and non-null values.
baf_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 993563 entries, 0 to 993562
Data columns (total 32 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   fraud_bool                        993563 non-null  int64  
 1   email_is_free                     993563 non-null  int64  
 2   phone_home_valid                  993563 non-null  int64  
 3   phone_mobile_valid                993563 non-null  int64  
 4   has_other_cards                   993563 non-null  int64  
 5   foreign_request                   993563 non-null  int64  
 6   keep_alive_session                993563 non-null  int64  
 7   payment_type                      993563 non-null  object 
 8   employment_status                 993563 non-null  object 
 9   housing_status                    993563 non-null  object 
 10  source                            993563 non-null  object 
 11  device_os                         993563 non-null  o

In [59]:
# Get categorical features list
categorical_list = list(baf_data.columns)[0:13]
print(categorical_list)

['fraud_bool', 'email_is_free', 'phone_home_valid', 'phone_mobile_valid', 'has_other_cards', 'foreign_request', 'keep_alive_session', 'payment_type', 'employment_status', 'housing_status', 'source', 'device_os', 'intended_balcon_amount_negative']


## 4.3 Create dummy features

In [60]:
# Check categorical feature unique value counts
baf_data[categorical_list].describe()

Unnamed: 0,fraud_bool,email_is_free,phone_home_valid,phone_mobile_valid,has_other_cards,foreign_request,keep_alive_session,intended_balcon_amount_negative
count,993563.0,993563.0,993563.0,993563.0,993563.0,993563.0,993563.0,993563.0
mean,0.011066,0.529484,0.417285,0.889641,0.223341,0.025271,0.577733,0.741533
std,0.104613,0.49913,0.493111,0.313337,0.416485,0.156946,0.493921,0.437792
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0
75%,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


We can see that `payment_type`, `employment_status`, `housing_status`, `source` and `device_os` have string values and >= 2 unique values. We can convert them into dummy features to prepare for modeling.

In [61]:
# Create dummy features
dummy_list = list(baf_data.columns)[7:12]
print(dummy_list)
dummy = pd.get_dummies(baf_data[dummy_list], dtype='int64')
dummy.head()

['payment_type', 'employment_status', 'housing_status', 'source', 'device_os']


Unnamed: 0,payment_type_AA,payment_type_AB,payment_type_AC,payment_type_AD,payment_type_AE,employment_status_CA,employment_status_CB,employment_status_CC,employment_status_CD,employment_status_CE,...,housing_status_BE,housing_status_BF,housing_status_BG,source_INTERNET,source_TELEAPP,device_os_linux,device_os_macintosh,device_os_other,device_os_windows,device_os_x11
0,1,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,0,1,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,0,0,1,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,1,0,0
3,0,1,0,0,0,1,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
4,0,1,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0


In [62]:
# Concatenate dummy features with the rest of categorical features
baf_cat = pd.concat([baf_data.iloc[:, 0:7], dummy, baf_data.iloc[:, 12]], axis=1)
baf_cat.head()

Unnamed: 0,fraud_bool,email_is_free,phone_home_valid,phone_mobile_valid,has_other_cards,foreign_request,keep_alive_session,payment_type_AA,payment_type_AB,payment_type_AC,...,housing_status_BF,housing_status_BG,source_INTERNET,source_TELEAPP,device_os_linux,device_os_macintosh,device_os_other,device_os_windows,device_os_x11,intended_balcon_amount_negative
0,1,0,1,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,1,0,1
1,1,1,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,1,0,1
2,1,1,0,1,0,0,0,0,0,1,...,0,0,1,0,0,0,1,0,0,1
3,1,1,0,1,1,0,0,0,1,0,...,0,0,1,0,1,0,0,0,0,1
4,1,1,1,0,0,0,1,0,1,0,...,0,0,1,0,0,1,0,0,0,1


In [63]:
# Concatenate baf_cat with numeric features
# baf_data_new = pd.concat([baf_cat, baf_data.iloc[:, 12:33]], axis=1)
# baf_data_new.head().T

## 4.4 Scale standardization of numeric features

In [64]:
# Subset the numeric features of baf_data to baf_num
baf_num = baf_data.iloc[:, 13:33]
baf_num.head().T

Unnamed: 0,0,1,2,3,4
income,0.9,0.9,0.9,0.9,0.9
name_email_similarity,0.166828,0.296286,0.044985,0.159511,0.596414
prev_address_months_count,,,,,
current_address_months_count,88.0,144.0,132.0,22.0,218.0
customer_age,50.0,50.0,40.0,50.0,50.0
days_since_request,0.020925,0.005418,3.108549,0.019079,0.004441
intended_balcon_amount,,,,,
zip_count_4w,769.0,366.0,870.0,810.0,890.0
velocity_6h,10650.765523,534.047319,4048.534263,3457.064063,5020.341679
velocity_24h,3134.31963,2670.918292,2893.621498,4054.908412,2728.237159


As the vaules of numeric features varies a lot between different features, we can standardize them by removing the mean and scaling to unit variance.

In [65]:
# Fit and transform numeric features with StandardScaler.
scaler = StandardScaler()
baf_num_scaled_np = scaler.fit_transform(baf_num)
baf_num_scaled = pd.DataFrame(baf_num_scaled_np, columns=scaler.get_feature_names_out())
baf_num_scaled.head()

Unnamed: 0,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,zip_count_4w,velocity_6h,velocity_24h,velocity_4w,bank_branch_count_8w,date_of_birth_distinct_emails_4w,credit_risk_score,bank_months_count,proposed_credit_limit,session_length_in_minutes,device_distinct_emails_8w,month
0,1.161537,-1.131217,,0.011773,1.356171,-0.185996,,-0.800194,1.655666,-1.10626,-1.080448,-0.399474,-0.695638,0.773974,0.792364,-0.033371,-0.457093,-0.116803,1.680733
1,1.161537,-0.683479,,0.645179,1.356171,-0.18889,,-1.200927,-1.706146,-1.419625,-1.884751,1.158412,-1.291593,1.835418,0.011521,2.016662,3.019439,-0.116803,1.680733
2,1.161537,-1.552615,,0.509449,0.524462,0.390118,,-0.699762,-0.538273,-1.269027,-1.846358,-0.399474,0.893573,0.659223,,-0.648381,-0.35239,-0.116803,1.680733
3,1.161537,-1.156522,,-0.73474,1.356171,-0.186341,,-0.759425,-0.73482,-0.483733,-1.995752,3.772271,-0.695638,-0.301814,1.399686,-0.648381,-0.686308,-0.116803,1.680733
4,1.161537,0.354529,,1.482178,1.356171,-0.189072,,-0.679875,-0.215339,-1.380865,-1.924596,3.922193,-1.490244,2.351796,1.399686,2.016662,-0.471154,-0.116803,1.680733


In [66]:
# Combine baf_cat and bat_num_scaled into new dataframe baf_new.
baf_new = pd.concat([baf_cat, baf_num_scaled], axis=1)
baf_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 993563 entries, 0 to 993562
Data columns (total 53 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   fraud_bool                        993563 non-null  int64  
 1   email_is_free                     993563 non-null  int64  
 2   phone_home_valid                  993563 non-null  int64  
 3   phone_mobile_valid                993563 non-null  int64  
 4   has_other_cards                   993563 non-null  int64  
 5   foreign_request                   993563 non-null  int64  
 6   keep_alive_session                993563 non-null  int64  
 7   payment_type_AA                   993563 non-null  int64  
 8   payment_type_AB                   993563 non-null  int64  
 9   payment_type_AC                   993563 non-null  int64  
 10  payment_type_AD                   993563 non-null  int64  
 11  payment_type_AE                   993563 non-null  i

There are still missing values in 3 numeric columns. While their distribution was quite skewed, we will try different methods of imputation during modeling. Next we are going to split data into training and testing subsets first.

## 4.5 Split data into training and testing subsets

In [67]:
# split dataset into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(baf_new.drop(columns='fraud_bool'), baf_new.fraud_bool, test_size=0.3, random_state=47)

In [68]:
X_train.shape, X_test.shape

((695494, 52), (298069, 52))

In [69]:
y_train.shape, y_test.shape

((695494,), (298069,))

In [70]:
# Check dtypes of X_train
X_train.dtypes

email_is_free                         int64
phone_home_valid                      int64
phone_mobile_valid                    int64
has_other_cards                       int64
foreign_request                       int64
keep_alive_session                    int64
payment_type_AA                       int64
payment_type_AB                       int64
payment_type_AC                       int64
payment_type_AD                       int64
payment_type_AE                       int64
employment_status_CA                  int64
employment_status_CB                  int64
employment_status_CC                  int64
employment_status_CD                  int64
employment_status_CE                  int64
employment_status_CF                  int64
employment_status_CG                  int64
housing_status_BA                     int64
housing_status_BB                     int64
housing_status_BC                     int64
housing_status_BD                     int64
housing_status_BE               

The X_train features are all numeric. The dataset is preprocessed and ready for initial training.