In [332]:
import pandas as pd 
import numpy as np 
import json
pd.set_option('display.max_columns', None)
# import custom packages for EDA
from EDA import FeatureSignificance, FeatureVisualisation 

# load the base dataset
df_base = pd.read_csv('data/Base.csv')

# load the feature data-type groups (keyed by type name, e.g. 'numerical_discrete')
with open('data_types.json') as f:
    data_types = json.load(f)

# binary-encode source {'INTERNET', 'TELEAPP'}: 1 where source is 'INTERNET', else 0,
# then drop the original string column in favour of the encoded one
df_base = (
    df_base
    .assign(
        source_is_internet_not_teleapp=lambda frame: frame['source'].eq('INTERNET').astype(int)
    )
    .drop(columns='source')
)

In [333]:
# build the project EDA helpers on the encoded frame, both with fraud_bool as the target
# fs: significance-testing helper used by the cells below
fs = FeatureSignificance(df_base, target='fraud_bool')
# fv: visualisation helper (not called in the cells shown in this section)
fv = FeatureVisualisation(df_base, target='fraud_bool')

In [334]:
# overview: which significance tests are listed for each feature data type
fs.calculations()

temporal                        [Spearman Correlation, Chi-Square Test (Goodness of Fit)]
ordinal                           [Spearman Correlation, Logistic Regression Coefficient]
numerical_discrete                                   [Point-Biserial Correlation, T-Test]
numerical_continuous                             [Spearman Correlation, T-Test, K-S Test]
nominal_multi_category        [Chi-Square Test of Independence (Contingency), Cramér’s V]
nominal_binary            [Chi-Square Test of Independence (Contingency), Point-Biserial]
dtype: object


In [335]:
# discrete numerical features: point-biserial correlation and t-test results
# NOTE(review): device_fraud_count comes back NaN for every statistic in the output -
# presumably the column is constant; confirm and consider excluding it
fs.numerical_discrete(data_types['numerical_discrete'])



Unnamed: 0_level_0,Point-Biserial Correlation,p-value,t_stat,t_stat p_value
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
credit_risk_score,0.070588,0.0,-60.091371,0.0
proposed_credit_limit,0.068907,0.0,-52.350791,0.0
device_distinct_emails_8w,0.036542,1.921787e-292,-20.377266,1.199204e-90
current_address_months_count,0.03348,7.586131999999999e-245,-34.620596,1.082672e-249
bank_months_count,0.020929,4.31288e-73,-17.692433,1.4586840000000001e-68
prev_address_months_count,0.020083,5.2381910000000006e-27,-8.357725,2.417317e-16
zip_count_4w,0.005212,1.868284e-07,-5.210355,1.918071e-07
bank_branch_count_8w,-0.011577,5.39759e-31,12.764196,4.674261e-37
date_of_birth_distinct_emails_4w,-0.043224,0.0,44.852802,0.0
device_fraud_count,,,,


In [336]:
# continuous numerical features: the bounded and unbounded groups are tested together
bounded_features = data_types['numerical_continuous_bounded']
unbounded_features = data_types['numerical_continuous_unbounded']
fs.numerical_continuous(bounded_features + unbounded_features)

Unnamed: 0_level_0,Spearman Correlation,Spearman p-value,T-Statistic,T-Test p-value,K-S Statistic,K-S p-value
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
session_length_in_minutes,0.002058,0.03981677,-7.45025,1.000315e-13,0.027061,2.318452e-07
velocity_24h,-0.010509,7.791899e-26,11.511446,1.709035e-30,0.047764,4.602355e-22
velocity_4w,-0.013524,1.118804e-41,10.882405,1.915534e-27,0.071602,4.519496e-49
days_since_request,-0.014209,7.956264e-46,-0.535197,0.592524,0.069237,6.583571e-46
velocity_6h,-0.016497,3.816254e-61,17.508297,9.682576e-68,0.063211,2.493909e-38
intended_balcon_amount,-0.017954,4.326131e-72,29.776915,1.0706489999999999e-187,0.143678,6.122341e-197
name_email_similarity,-0.037283,1.936639e-304,35.923329,1.167023e-267,0.185558,0.0


In [337]:
# multi-category nominal features: chi-square test of independence and Cramér's V
fs.nominal_multi_category(data_types['nominal_multi_category'])

Unnamed: 0_level_0,Chi Square Statistic,p-value (Chi-Square),cramers v
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
housing_status,13202.787719,0.0,0.114903
device_os,6478.945928,0.0,0.080492
employment_status,1572.499001,0.0,0.039655
payment_type,1528.34201,0.0,0.039094


In [338]:
# binary nominal features: chi-square test of independence and point-biserial correlation
fs.nominal_binary(data_types['nominal_binary'])

Unnamed: 0_level_0,Chi Square Statistic,p-value (Chi-Square),Point-Biserial Correlation,p-value (Point-Biserial)
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
email_is_free,769.952405,1.841219e-169,0.027758,1.216719e-169
foreign_request,284.060841,9.787931999999999e-64,0.016885,5.72273e-64
source_is_internet_not_teleapp,14.937804,0.000111114,-0.003922,8.774983e-05
phone_mobile_valid,173.301368,1.406547e-39,-0.01318,1.1403520000000001e-39
phone_home_valid,1233.281326,3.567963e-270,-0.035128,1.7348989999999999e-270
has_other_cards,1235.161652,1.392459e-270,-0.035156,6.347503e-271
keep_alive_session,2528.754445,0.0,-0.050296,0.0


In [339]:
# ordinal features: Spearman correlation and logistic regression coefficient
# NOTE(review): the income and customer_age coefficients differ greatly in size in the
# output - presumably the features are on different scales; confirm before comparing them
fs.ordinal(data_types['ordinal'])

Unnamed: 0_level_0,Spearman Correlation,Spearman p-value,Log Regression coef
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
income,0.049583,0.0,1.711414
customer_age,0.058146,0.0,0.04505


In [340]:
# temporal features: Spearman correlation and chi-square goodness-of-fit test
fs.temporal(data_types['temporal'])

Unnamed: 0_level_0,Spearman Correlation,p-value (Spearman),Chi-Square Statistic,p-value (Chi-Square)
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
month,0.012949,2.362542e-38,330.160493,2.166747e-67



### Next steps
#### Significance testing
- Run the significance tests on feature sub-categories as well as on whole features.
- Are missing values indicative of the target variable?


#### Visualisations
- Do the category codes (A to G) of the following variables follow a natural order, so that they can be treated as ordinal?
    - housing_status
    - employment_status

#### Automated feature selection
- After testing significance, use methods such as Recursive Feature Elimination (RFE) or SHAP values for machine-learning-based importance ranking.

