In [22]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [3]:
# Read the transactions CSV (path is relative to the notebook's working
# directory) into a DataFrame and preview its first rows.
file = r"datasets/financial_data_with_targets.csv"
df = pd.read_csv(filepath_or_buffer=file)
df.head()

Unnamed: 0,index,year,month,hour,txn_type,txn_status,error_code,remitter_bank,beneficiary_bank,payer_handle,...,payee_app,payee_requested_amount,payee_settlement_amount,difference_amount,payer_state,payee_state,cred_type,cred_subtype,time_of_day,targets
0,0,2023,8,0,Refund,Successful,0,Allahabad Bank,Karur Vysya Bank,SCB,...,BHIM KOTAK Pay,54020,54020,0,Punjab,Maharashtra,Debit Card,Prepaid Debit Card,LateNight,0
1,1,2021,10,0,Payment,Successful,0,Madhya Bihar Gramin Bank,Kotak Mahindra Bank,WASBI,...,JustDial,37670,37670,0,Uttar Pradesh,Rajasthan,Overdraft,Business Overdraft,LateNight,0
2,2,2019,11,0,Withdrawal,Successful,0,Karur Vysya Bank,United Bank of India,KMBL,...,UTKARSHBANK,22984,22984,0,Haryana,Punjab,Auto Loan,Used Car Loan,LateNight,0
3,3,2023,9,0,Transfer,Successful,0,HDFC Bank,Corporation Bank,IDBI,...,WhatsApp Pay,62038,62038,0,Punjab,Goa,Overdraft,Personal Overdraft,LateNight,0
4,4,2021,9,0,Fee,Successful,0,Union Bank of India,Bank of India,UNIONBANK,...,NSDL,72624,72624,0,Odisha,Maharashtra,Personal Loan,Unsecured Personal Loan,LateNight,0


In [4]:
# Columns retained for modelling, grouped by how they are preprocessed.
# 'targets' must stay the LAST entry of the numeric list: other cells in this
# notebook pick the target out by position (`.iloc[:, :-1]` / `.iloc[:, -1]`).
cat_feat_to_keep = ['time_of_day', 'cred_type', 'error_code']
num_feat_to_keep = [
    'payee_requested_amount',
    'payee_settlement_amount',
    'difference_amount',
    'targets',
]
# Categorical names first, then numeric names (target last).
feat_to_keep = [*cat_feat_to_keep, *num_feat_to_keep]
feat_to_keep

['time_of_day',
 'cred_type',
 'error_code',
 'payee_requested_amount',
 'payee_settlement_amount',
 'difference_amount',
 'targets']

In [5]:
# Narrow the frame to the selected columns (in feat_to_keep order) and show it.
# Note: this rebinds `df`, so the full set of loaded columns is no longer
# reachable through that name after this cell runs.
df = df.loc[:, feat_to_keep]
df

Unnamed: 0,time_of_day,cred_type,error_code,payee_requested_amount,payee_settlement_amount,difference_amount,targets
0,LateNight,Debit Card,0,54020,54020,0,0
1,LateNight,Overdraft,0,37670,37670,0,0
2,LateNight,Auto Loan,0,22984,22984,0,0
3,LateNight,Overdraft,0,62038,62038,0,0
4,LateNight,Personal Loan,0,72624,72624,0,0
...,...,...,...,...,...,...,...
55666,Night,Debit Card,U88,61242,61242,0,0
55667,Night,Debit Card,0,94378,94378,0,0
55668,Night,Auto Loan,0,10826,10826,0,0
55669,Night,Home Loan,0,14768,14768,0,0


In [6]:
# Categorical columns only; these are the inputs to the one-hot encoding step.
df_cat = df.loc[:, cat_feat_to_keep]

In [7]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each encoded column; the result is then cast to 32-bit integers.
df_cat_encoded = (
    pd.get_dummies(df_cat, drop_first=True)
    .astype(np.int32)
)
df_cat_encoded

Unnamed: 0,time_of_day_EarlyMorning,time_of_day_Evening,time_of_day_LateAfternoon,time_of_day_LateMorning,time_of_day_LateNight,time_of_day_Morning,time_of_day_Night,cred_type_Credit Card,cred_type_Debit Card,cred_type_Home Loan,...,error_code_U85,error_code_U86,error_code_U88,error_code_U89,error_code_U90,error_code_U91,error_code_U92,error_code_U93,error_code_U94,error_code_U96
0,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55666,0,0,0,0,0,0,1,0,1,0,...,0,0,1,0,0,0,0,0,0,0
55667,0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
55668,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
55669,0,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Numeric inputs: select the numeric columns, then drop the last one by
# position (the last entry of num_feat_to_keep is 'targets').
df_num = df.loc[:, num_feat_to_keep].iloc[:, :-1]

In [9]:
# Standardise the numeric input columns with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fitted on every row, i.e. before the
# train/validation/test split performed at the end of this notebook, so the
# held-out rows contribute to the fitted statistics. Consider fitting on the
# training rows only.
scaler = StandardScaler()
cols = [i for i in num_feat_to_keep if i != 'targets']
df_num_scaled = pd.DataFrame(
    scaler.fit_transform(df_num),
    columns=cols,
    # fit_transform returns a plain array by default, which drops the row
    # labels. Re-attach them so the column-wise pd.concat that consumes this
    # frame aligns rows correctly even if df's index is not 0..n-1.
    index=df_num.index,
)
df_num_scaled

Unnamed: 0,payee_requested_amount,payee_settlement_amount,difference_amount
0,0.130872,0.046728,0.200272
1,-0.432866,-0.459216,0.200272
2,-0.939230,-0.913669,0.200272
3,0.407328,0.294842,0.200272
4,0.772327,0.622422,0.200272
...,...,...,...
55666,0.379882,0.270210,0.200272
55667,1.522391,1.295591,0.200272
55668,-1.358431,-1.289894,0.200272
55669,-1.222513,-1.167910,0.200272


In [10]:
# Assemble the modelling table column-wise: encoded categoricals, scaled
# numerics, and finally the last column of df so the target ends up last.
# Rows are matched on the index of each piece.
data = pd.concat(
    objs=[df_cat_encoded, df_num_scaled, df.iloc[:, -1]],
    axis="columns",
)
data

Unnamed: 0,time_of_day_EarlyMorning,time_of_day_Evening,time_of_day_LateAfternoon,time_of_day_LateMorning,time_of_day_LateNight,time_of_day_Morning,time_of_day_Night,cred_type_Credit Card,cred_type_Debit Card,cred_type_Home Loan,...,error_code_U90,error_code_U91,error_code_U92,error_code_U93,error_code_U94,error_code_U96,payee_requested_amount,payee_settlement_amount,difference_amount,targets
0,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0.130872,0.046728,0.200272,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,-0.432866,-0.459216,0.200272,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,-0.939230,-0.913669,0.200272,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0.407328,0.294842,0.200272,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0.772327,0.622422,0.200272,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55666,0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0.379882,0.270210,0.200272,0
55667,0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,1.522391,1.295591,0.200272,0
55668,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,-1.358431,-1.289894,0.200272,0
55669,0,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,-1.222513,-1.167910,0.200272,0


In [11]:
# Sum of the target column; equals the number of positive rows assuming the
# column only holds 0/1 values.
df["targets"].sum()

3585

Since the dataset is imbalanced (only 3,585 of the 55,671 targets are 1), we use SMOTE to create synthetic samples of the minority class.

In [14]:
# Oversample with SMOTE; the fixed random_state makes the resampling
# reproducible.
# NOTE(review): resampling is applied to the whole table, before the
# train/validation/test split done at the end of this notebook, so held-out
# sets will contain synthetic rows derived from rows that may be in the
# training set. Consider resampling the training split only.
smote = SMOTE(random_state=42)

X_data_smote, y_data_smote = smote.fit_resample(
    data.iloc[:, :-1],  # every column except the last -> features
    data.iloc[:, -1],   # last column -> target
)

In [19]:
# Put the resampled features and target back into one frame (target last).
# This rebinds `data`, replacing the pre-SMOTE table of the same name.
data = pd.concat(objs=[X_data_smote, y_data_smote], axis="columns")

In [20]:
# Display the resampled table (feature columns followed by the target column).
data

Unnamed: 0,time_of_day_EarlyMorning,time_of_day_Evening,time_of_day_LateAfternoon,time_of_day_LateMorning,time_of_day_LateNight,time_of_day_Morning,time_of_day_Night,cred_type_Credit Card,cred_type_Debit Card,cred_type_Home Loan,...,error_code_U90,error_code_U91,error_code_U92,error_code_U93,error_code_U94,error_code_U96,payee_requested_amount,payee_settlement_amount,difference_amount,targets
0,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0.130872,0.046728,0.200272,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,-0.432866,-0.459216,0.200272,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,-0.939230,-0.913669,0.200272,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0.407328,0.294842,0.200272,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0.772327,0.622422,0.200272,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104167,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,-0.420178,-0.447829,0.200272,1
104168,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0.819955,0.665168,0.200272,1
104169,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,-1.649068,-1.547940,0.192358,1
104170,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,1.321239,1.115060,0.200272,1


Split the data into train, validation, and test sets.

In [23]:
# `data` is the preprocessed DataFrame: every column except the last is an
# input feature and the last column is the target.
# NOTE(review): an earlier version of this comment stated 48 columns
# (47 features + target); confirm against data.shape.
# NOTE(review): scaling and SMOTE are applied to the full table earlier in
# this notebook, so the validation/test sets produced here are not independent
# of the training set. Treat metrics on them as optimistic.
inputs = data.iloc[:, :-1].values  # all columns except the last one
targets = data.iloc[:, -1].values  # the last column

# Split the data into training (80%) and temp (20%) sets.
# stratify keeps the class proportions of `targets` the same in both parts.
train_inputs, temp_inputs, train_targets, temp_targets = train_test_split(
    inputs, targets, test_size=0.2, random_state=42, stratify=targets
)

# Split the temp set in half: validation (10% of the data) and test (10% of
# the data), again preserving the class proportions.
validation_inputs, test_inputs, validation_targets, test_targets = train_test_split(
    temp_inputs, temp_targets, test_size=0.5, random_state=42, stratify=temp_targets
)

# Sanity check: the three splits partition the data with no rows lost.
assert len(train_inputs) + len(validation_inputs) + len(test_inputs) == len(inputs)

# Save the splits in .npz format (np.savez appends the ".npz" extension).
np.savez('financial_data_train', inputs=train_inputs, targets=train_targets)
np.savez('financial_data_validation', inputs=validation_inputs, targets=validation_targets)
np.savez('financial_data_test', inputs=test_inputs, targets=test_targets)