## Import data and packages

In [215]:
import pandas as pd 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)
import numpy as np 
import json
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder



# import data
df_base = pd.read_csv('data/Base.csv') 

# import feature data types
with open('python_scripts/data_types.json') as f:
    data_types = json.load(f) 

# encoding source {'INTERNET', 'TELEAPP'} into source_is_internet_not_teleapp
df_base['source_is_internet_not_teleapp'] = (df_base['source'] == 'INTERNET').astype(int)
df_base.drop('source', axis=1, inplace=True)


# recording which variables have missing values recorded as -1
missing_values = ['prev_address_months_count',
                  'current_address_months_count',
                  'bank_months_count',
                  'session_length_in_minutes',
                  'device_distinct_emails_8w']

### Divide dataset into training set and testing set

In [216]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df_base.drop('fraud_bool', axis=1), df_base.fraud_bool, test_size=0.2, random_state=0)


### Deal with missing values

In [217]:
# device_distinct_emails_8w - impute missing values with mode (since it is categories, dont necessarily want middle value, just the most represented)
device_distinct_emails_8w_mode = X_train.device_distinct_emails_8w.value_counts().sort_values(ascending=False).index[0]

X_train.loc[X_train.device_distinct_emails_8w == -1, 'device_distinct_emails_8w'] = device_distinct_emails_8w_mode
X_test.loc[X_test.device_distinct_emails_8w == -1, 'device_distinct_emails_8w'] = device_distinct_emails_8w_mode


In [218]:
# session_length_in_minutes - impute missing values with median
session_length_in_minutes_median_position = int(np.ceil((len(X_train[X_train.session_length_in_minutes != -1])/2)-1))
session_length_in_minutes_median = sorted(X_train[X_train.session_length_in_minutes != -1].session_length_in_minutes)[session_length_in_minutes_median_position]

X_train.loc[X_train.session_length_in_minutes == -1, 'session_length_in_minutes'] = session_length_in_minutes_median
X_test.loc[X_test.session_length_in_minutes == -1, 'session_length_in_minutes'] = session_length_in_minutes_median


In [219]:
# drop bank_months_count, create binary feature for value provided or not
X_train['bank_months_count_provided'] = [0 if i == -1 else 1 for i in X_train.bank_months_count]
X_train.drop('bank_months_count', axis=1, inplace=True)

X_test['bank_months_count_provided'] = [0 if i == -1 else 1 for i in X_test.bank_months_count]
X_test.drop('bank_months_count', axis=1, inplace=True)

In [220]:
# drop prev_address_months_count (too many missing values), create binary feature for value provided or not
X_train['prev_address_months_count_provided'] = [0 if i == -1 else 1 for i in X_train.prev_address_months_count]
X_train.drop('prev_address_months_count', axis=1, inplace=True)

X_test['prev_address_months_count_provided'] = [0 if i == -1 else 1 for i in X_test.prev_address_months_count]
X_test.drop('prev_address_months_count', axis=1, inplace=True)

In [221]:
# current_address_months_count - impute missing values with median
X_train['current_address_months_count_provided'] = [0 if i == -1 else 1 for i in X_train.current_address_months_count]
X_test['current_address_months_count_provided'] = [0 if i == -1 else 1 for i in X_test.current_address_months_count]

# and create binary features for value provided or not

current_address_months_count_median_position = int(np.ceil((len(X_train[X_train['current_address_months_count'] != -1])/2)-1))
current_address_months_count_median = sorted(X_train[X_train['current_address_months_count'] != -1]['current_address_months_count'])[current_address_months_count_median_position]

X_train.loc[X_train['current_address_months_count'] == -1, 'current_address_months_count'] = current_address_months_count_median
X_test.loc[X_test['current_address_months_count'] == -1, 'current_address_months_count'] = current_address_months_count_median

### Encode and simplfy categorical features

In [222]:
# drop device_fraud_count (single value)
X_train.drop('device_fraud_count', axis=1, inplace=True)
X_test.drop('device_fraud_count', axis=1, inplace=True)

In [223]:
# simplying proposed credit limit to 5 categories
categories_to_keep = X_train.proposed_credit_limit.value_counts().index[:4] # taking 4 largest represented proposed limits
X_train['proposed_credit_limit'] = X_train['proposed_credit_limit'].where(X_train['proposed_credit_limit'].isin(categories_to_keep), 'other').astype(str)
X_test['proposed_credit_limit'] = X_test['proposed_credit_limit'].where(X_test['proposed_credit_limit'].isin(categories_to_keep), 'other').astype(str)

In [224]:
# one hot encoding
categories_to_encode = ['proposed_credit_limit', 'payment_type', 'employment_status', 'housing_status', 'device_os']
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first') 
encoded_array = ohe.fit_transform(X_train[categories_to_encode])
encoded_df = pd.DataFrame(encoded_array, columns=ohe.get_feature_names_out())
X_train = pd.concat([X_train.drop(columns=categories_to_encode).reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)

encoded_array_test = ohe.transform(X_test[categories_to_encode])
encoded_df_test = pd.DataFrame(encoded_array_test, columns=ohe.get_feature_names_out())
X_test = pd.concat([X_test.drop(columns=categories_to_encode).reset_index(drop=True), encoded_df_test.reset_index(drop=True)], axis=1)


In [225]:
X_train

Unnamed: 0,income,name_email_similarity,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,zip_count_4w,velocity_6h,velocity_24h,velocity_4w,bank_branch_count_8w,date_of_birth_distinct_emails_4w,credit_risk_score,email_is_free,phone_home_valid,phone_mobile_valid,has_other_cards,foreign_request,session_length_in_minutes,keep_alive_session,device_distinct_emails_8w,month,source_is_internet_not_teleapp,bank_months_count_provided,prev_address_months_count_provided,current_address_months_count_provided,proposed_credit_limit_1500.0,proposed_credit_limit_200.0,proposed_credit_limit_500.0,proposed_credit_limit_other,payment_type_AB,payment_type_AC,payment_type_AD,payment_type_AE,employment_status_CB,employment_status_CC,employment_status_CD,employment_status_CE,employment_status_CF,employment_status_CG,housing_status_BB,housing_status_BC,housing_status_BD,housing_status_BE,housing_status_BF,housing_status_BG,device_os_macintosh,device_os_other,device_os_windows,device_os_x11
0,0.1,0.192523,54,20,0.022779,-0.810577,718,3784.358473,2912.626248,4206.784786,9,10,137,0,0,1,0,0,8.906903,1,1,5,1,1,0,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.9,0.238720,47,40,0.027388,-1.416575,923,9712.032941,4863.710966,6384.777776,1,5,50,1,0,1,0,0,6.791985,1,1,0,1,0,0,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.8,0.204426,92,30,0.018037,48.983949,1480,2765.174801,5248.159541,4237.976176,6,14,123,0,0,1,0,0,4.689416,0,1,5,1,1,0,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.6,0.904621,149,40,0.004834,-0.844309,938,6147.912418,4278.845661,5455.794500,16,8,191,1,1,1,0,0,11.227696,1,1,1,1,1,0,1,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.6,0.521190,44,30,0.011968,52.410502,1653,3098.367323,5289.339754,6455.672584,0,14,163,1,1,1,0,0,4.309028,1,1,0,1,1,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
799995,0.7,0.615751,43,40,0.004316,49.874242,2097,4359.403360,2741.768532,6433.794944,2,15,109,0,1,1,0,0,1.813194,1,1,7,1,1,0,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
799996,0.4,0.789709,36,20,0.008571,12.828529,1762,6920.945581,6972.470268,5870.108156,12,10,189,0,0,1,0,0,9.506439,1,1,0,1,1,0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
799997,0.7,0.405628,13,50,0.003374,-0.294296,1301,750.357870,2989.495690,4991.307731,1,8,219,1,0,1,0,0,5.243137,0,1,3,1,0,0,1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
799998,0.4,0.591697,50,50,1.265285,35.793431,672,12353.087248,7375.660608,5359.857894,2,6,96,0,0,1,0,0,4.447468,0,1,2,1,1,0,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


### Scaling for numerical features

In [294]:
# test for skewness for possible log transformation
from scipy.stats import skew
numericals = data_types['numerical_continuous_bounded']+data_types['numerical_continuous_unbounded']+data_types['numerical_discrete']+['customer_age']
to_drop = ['prev_address_months_count', 'bank_months_count']
features = []
skews = []
min_values = []
for i in numericals:
    if i not in to_drop:
        features.append(i)
        skews.append(skew(X_train[i]))
        min_values.append(min(X_train[i]))
skews_df = pd.DataFrame({'Feature':features, 'Skewness':skews, 'Min':min_values}).sort_values('Skewness', ascending=False).set_index('Feature')
skews_df


Unnamed: 0_level_0,Skewness,Min
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1
days_since_request,9.278334,4.03686e-09
session_length_in_minutes,3.315531,0.0008720274
bank_branch_count_8w,2.746926,0.0
intended_balcon_amount,2.505943,-15.53055
zip_count_4w,1.45484,1.0
current_address_months_count,1.389416,0.0
date_of_birth_distinct_emails_4w,0.703265,0.0
velocity_6h,0.563249,-155.4307
customer_age,0.479187,10.0
velocity_24h,0.329749,1300.307


In [296]:
significant_skews = list(skews_df[skews_df.Skewness >= 1].index)
# shift variables to push negative values above 0
shifted_df = {}
for i in significant_skews:
    min_value = skews_df.loc[i].Min 
    if min_value <1 :
        shifted_df[i] = X_train[i] + 1 + abs(min_value)
    else:
        shifted_df[i] = X_train[i]

# check if range of values spans 3 orders of magnitude (largest value is over 10^3 time the smallest value)
features = []
range_orders_of_magnitude = []
for i,j in shifted_df.items():
    log_min = np.log10(min(j))
    log_max = np.log10(max(j))
    range_orders_of_magnitude.append(log_max - log_min)
    features.append(i)
magnitude_of_ranges_df = pd.DataFrame({'Feature': features, 
                                       'range_orders_of_magnitude':range_orders_of_magnitude}).sort_values('range_orders_of_magnitude', ascending=False).set_index('Feature')
magnitude_of_ranges_df = magnitude_of_ranges_df.join(skews_df.drop('Min', axis=1), on='Feature', how='left')
magnitude_of_ranges_df

Unnamed: 0_level_0,range_orders_of_magnitude,Skewness
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1
zip_count_4w,3.826075,1.45484
bank_branch_count_8w,3.37767,2.746926
current_address_months_count,2.62941,1.389416
intended_balcon_amount,2.112228,2.505943
session_length_in_minutes,1.938263,3.315531
days_since_request,1.900132,9.278334


`current_address_months_count` displays moderate skew and range below 3 orders of magnitude, so I will not log transform this variable

In [300]:
# executing log transformation
to_transform = list(magnitude_of_ranges_df.index.drop(['current_address_months_count']))


In [230]:
# apply log transformations to skewed data

t = ''' 
# Apply log transformation to the feature (use log1p to handle zero and negative values)
X_train['feature_name'] = np.log1p(X_train['feature_name'])  # log(x + 1)
X_test['feature_name'] = np.log1p(X_test['feature_name'])


After Transformation: After applying the log transformation, it’s a good idea to re-check the skewness and distribution. 
You should see that the distribution has become more symmetric and less skewed.
'''

In [233]:
# apply robust scaling
from sklearn.preprocessing import RobustScaler

# Initialize the RobustScaler
scaler = RobustScaler()

# Scale all numerical features in both train and test sets
# X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
# X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

In [234]:
# ready for training

