In [1]:
import os
import math
import subprocess
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from utils_feature_engineering import *

#### Load data

In [2]:
# Folder (one level above the notebook) that holds the raw CSV files
path = 'DATA RAW'

# application_train: read the CSV and discard its 'Unnamed: 0' column
application_train_path = f'../{path}/dseb63_application_train.csv'
application_train = pd.read_csv(application_train_path).drop(columns='Unnamed: 0')

# application_test: same treatment as the train file
application_test_path = f'../{path}/dseb63_application_test.csv'
application_test = pd.read_csv(application_test_path).drop(columns='Unnamed: 0')


##### Combine train with the tvt split table to get train_filtered for feature evaluation
##### Load train/test for applying mean encoding

In [3]:
# Load the table carrying the tvt_code label for each SK_ID_CURR
pdf_tvt_extend = pd.read_pickle("pdf_tvt_extend.pkl", compression="bz2")

# Keep the rows labelled 'train', restrict them to IDs that exist in
# application_train, then discard the label column itself.
application_train_filtered = (
    pdf_tvt_extend
    .loc[lambda frame: frame["tvt_code"] == "train"]
    .merge(application_train[["SK_ID_CURR"]], on="SK_ID_CURR")
    .drop(columns=["tvt_code"])
)

# Disabled alternative: keep only the SK_ID_CURR column and the target variable
# application_train_filtered = application_train[['SK_ID_CURR', 'TARGET']]
# application_train_filtered

application_train_filtered.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,278621,0
1,139008,0
2,138348,0
3,53466,0
4,27272,0


In [4]:
# credit_card_balance: read the raw CSV.
# The path is built once and passed to read_csv, so the variable and the file
# actually loaded cannot drift apart.
credit_card_balance_path = f'../{path}/dseb63_credit_card_balance.csv'
credit_card_balance = pd.read_csv(credit_card_balance_path)
credit_card_balance.head()

Unnamed: 0,SK_ID_PREV,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,AMT_PAYMENT_CURRENT,...,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF,SK_ID_CURR
0,2582071,-1,63975.555,45000,2250.0,2250.0,0.0,0.0,2250.0,2250.0,...,64875.555,1.0,1,0.0,0.0,69.0,Active,0,0,87788
1,2582071,-82,16809.21,67500,0.0,0.0,0.0,0.0,3375.0,9000.0,...,16809.21,0.0,0,0.0,0.0,18.0,Active,0,0,87788
2,2582071,-84,27577.89,67500,0.0,0.0,0.0,0.0,3375.0,4500.0,...,27577.89,0.0,0,0.0,0.0,16.0,Active,0,0,87788
3,2582071,-7,65159.235,45000,0.0,0.0,0.0,0.0,2250.0,2250.0,...,65609.235,0.0,0,0.0,0.0,63.0,Active,0,0,87788
4,2582071,-59,70475.85,67500,24750.0,24750.0,0.0,0.0,3375.0,4500.0,...,70475.85,4.0,4,0.0,0.0,41.0,Active,0,0,87788


#### Preprocessing data

##### Handling SK_DPD & SK_DPD_DEF columns

In [5]:
# Binary indicators: 1 where the source counter is strictly positive, 0 otherwise
credit_card_balance["is_DPD"] = credit_card_balance["SK_DPD"].gt(0).astype('int64')
credit_card_balance["is_DPD_DEF"] = credit_card_balance["SK_DPD_DEF"].gt(0).astype('int64')

# The raw counters are replaced by the indicators, so drop them
credit_card_balance = credit_card_balance.drop(columns=["SK_DPD", "SK_DPD_DEF"])

In [6]:
# Missing-value count per column; display only columns with at least one
# missing value, largest count first.
null_cols = credit_card_balance.isna().sum()
null_cols.loc[null_cols > 0].sort_values(ascending=False)

AMT_PAYMENT_CURRENT           620093
AMT_DRAWINGS_ATM_CURRENT      605754
AMT_DRAWINGS_OTHER_CURRENT    605754
AMT_DRAWINGS_POS_CURRENT      605754
CNT_DRAWINGS_ATM_CURRENT      605754
CNT_DRAWINGS_OTHER_CURRENT    605754
CNT_DRAWINGS_POS_CURRENT      605754
AMT_INST_MIN_REGULARITY       264384
CNT_INSTALMENT_MATURE_CUM     264384
dtype: int64

In [7]:
# Replace every missing value in the table with 0
credit_card_balance = credit_card_balance.fillna(0)

In [8]:
# Sanity check: every column should now report zero missing values
credit_card_balance.isna().sum()

SK_ID_PREV                    0
MONTHS_BALANCE                0
AMT_BALANCE                   0
AMT_CREDIT_LIMIT_ACTUAL       0
AMT_DRAWINGS_ATM_CURRENT      0
AMT_DRAWINGS_CURRENT          0
AMT_DRAWINGS_OTHER_CURRENT    0
AMT_DRAWINGS_POS_CURRENT      0
AMT_INST_MIN_REGULARITY       0
AMT_PAYMENT_CURRENT           0
AMT_PAYMENT_TOTAL_CURRENT     0
AMT_RECEIVABLE_PRINCIPAL      0
AMT_RECIVABLE                 0
AMT_TOTAL_RECEIVABLE          0
CNT_DRAWINGS_ATM_CURRENT      0
CNT_DRAWINGS_CURRENT          0
CNT_DRAWINGS_OTHER_CURRENT    0
CNT_DRAWINGS_POS_CURRENT      0
CNT_INSTALMENT_MATURE_CUM     0
NAME_CONTRACT_STATUS          0
SK_ID_CURR                    0
is_DPD                        0
is_DPD_DEF                    0
dtype: int64

#### Categorical features

In [9]:
# Categorical columns = columns whose dtype is "object"
series_type = credit_card_balance.dtypes
list_categorical = [col for col, dtype in series_type.items() if dtype == "object"]
list_categorical


['NAME_CONTRACT_STATUS']

In [10]:
# Map each categorical column to its distinct values, ordered as returned by
# value_counts() (most frequent first).
dict_onehot = {
    cate: credit_card_balance[cate].value_counts().index.tolist()
    for cate in list_categorical
}

In [11]:
# Display the column -> list-of-values mapping used for one-hot encoding
dict_onehot

{'NAME_CONTRACT_STATUS': ['Active',
  'Completed',
  'Signed',
  'Demand',
  'Sent proposal',
  'Refused',
  'Approved']}

In [12]:
# Create one-hot encoded features from the credit_card_balance DataFrame.
# gen_one_hot_feat is not defined in this notebook (presumably provided by the
# star import from utils_feature_engineering); it receives the column -> values
# mapping in dict_onehot and SK_ID_CURR as the key.
# NOTE(review): presumably yields one indicator column per category value plus
# the key column — confirm against the helper.
pdf_onehot = gen_one_hot_feat(credit_card_balance, dict_onehot, main_key="SK_ID_CURR")
print(pdf_onehot.shape)

(3227965, 8)


In [13]:
# Aggregate the one-hot columns with max / sum / mean, keyed by SK_ID_CURR.
# agg_common_data prints the aggregation spec it applies and the shape of the result.
pdf_agg01 = agg_common_data(pdf_onehot, ["max", "sum", "mean"], main_key="SK_ID_CURR")

# Evaluate features of the aggregated data and save the results in the eval_agg01 variable.
# The result is a table with name / auc / corr / coverage columns, one row per feature.
eval_agg01 = feature_evaluate(application_train_filtered, pdf_agg01)

{'NAME_CONTRACT_STATUS_Active': ['max', 'sum', 'mean'],
 'NAME_CONTRACT_STATUS_Completed': ['max', 'sum', 'mean'],
 'NAME_CONTRACT_STATUS_Signed': ['max', 'sum', 'mean'],
 'NAME_CONTRACT_STATUS_Demand': ['max', 'sum', 'mean'],
 'NAME_CONTRACT_STATUS_Sent_proposal': ['max', 'sum', 'mean'],
 'NAME_CONTRACT_STATUS_Refused': ['max', 'sum', 'mean'],
 'NAME_CONTRACT_STATUS_Approved': ['max', 'sum', 'mean']}

After agg: (86905, 21)


In [14]:
# Display the per-feature evaluation (name, auc, corr, coverage) of the one-hot aggregates
eval_agg01

Unnamed: 0,name,auc,corr,coverage
1,NAME_CONTRACT_STATUS_Active_sum,0.554349,-0.057989,1.0
2,NAME_CONTRACT_STATUS_Active_mean,0.518745,0.023592,1.0
5,NAME_CONTRACT_STATUS_Completed_mean,0.516364,-0.026312,1.0
4,NAME_CONTRACT_STATUS_Completed_sum,0.516303,-0.023113,1.0
3,NAME_CONTRACT_STATUS_Completed_max,0.516067,-0.027396,1.0
7,NAME_CONTRACT_STATUS_Signed_sum,0.502535,-0.000976,1.0
6,NAME_CONTRACT_STATUS_Signed_max,0.502471,-0.006397,1.0
8,NAME_CONTRACT_STATUS_Signed_mean,0.502335,0.004075,1.0
14,NAME_CONTRACT_STATUS_Sent_proposal_mean,0.501365,-0.011336,1.0
12,NAME_CONTRACT_STATUS_Sent_proposal_max,0.501362,-0.010759,1.0


In [15]:
# Features whose AUC is at most 0.501 (the ones that will be discarded);
# print how many there are.
filtered_features = eval_agg01.loc[eval_agg01["auc"] <= 0.501]
print(filtered_features.shape)

(10, 4)


In [16]:
# Names of the features whose AUC exceeds 0.501
selected_features = eval_agg01.loc[eval_agg01["auc"] > 0.501, "name"].tolist()

# Restrict pdf_agg01 to those columns only
pdf_agg01 = pdf_agg01.loc[:, selected_features]
print(pdf_agg01.shape)

(86905, 11)


#### Numerical features

In [17]:
# Numerical attributes = int64 columns, excluding the two ID columns
series_type = credit_card_balance.dtypes
ls_num = [
    col
    for col in series_type[series_type == "int64"].index
    if col not in ("SK_ID_PREV", "SK_ID_CURR")
]
ls_num

['MONTHS_BALANCE',
 'AMT_CREDIT_LIMIT_ACTUAL',
 'CNT_DRAWINGS_CURRENT',
 'is_DPD',
 'is_DPD_DEF']

In [18]:
# Independent copy of credit_card_balance limited to the ID columns and the
# numerical attributes in ls_num.
pdf_num = credit_card_balance.loc[:, ["SK_ID_PREV", "SK_ID_CURR", *ls_num]].copy()

pdf_num.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,CNT_DRAWINGS_CURRENT,is_DPD,is_DPD_DEF
0,2582071,87788,-1,45000,1,0,0
1,2582071,87788,-82,67500,0,0,0
2,2582071,87788,-84,67500,0,0,0
3,2582071,87788,-7,45000,0,0,0
4,2582071,87788,-59,67500,4,0,0


In [19]:
# Reverse the sign of the "MONTHS_BALANCE" column in the pdf_num DataFrame.
# The value is derived from the untouched credit_card_balance column (pdf_num is
# a copy of it and shares its index), so re-running this cell always yields the
# same result instead of flipping the sign back and forth.
pdf_num["MONTHS_BALANCE"] = -credit_card_balance["MONTHS_BALANCE"]

In [20]:
# Aggregate the numerical attributes (SK_ID_PREV is left out of the input) with
# max / min / sum / mean / std, keyed by SK_ID_CURR.
# agg_common_data prints the aggregation spec it applies and the shape of the result.
pdf_agg02 = agg_common_data(pdf_num[["SK_ID_CURR"] + ls_num], ["max", "min", "sum", "mean", "std"], main_key="SK_ID_CURR")

# Evaluate features of the aggregated data and save the results in the eval_agg02 variable.
# The result is a table with name / auc / corr / coverage columns, one row per feature.
eval_agg02 = feature_evaluate(application_train_filtered, pdf_agg02)

{'MONTHS_BALANCE': ['max', 'min', 'sum', 'mean', 'std'],
 'AMT_CREDIT_LIMIT_ACTUAL': ['max', 'min', 'sum', 'mean', 'std'],
 'CNT_DRAWINGS_CURRENT': ['max', 'min', 'sum', 'mean', 'std'],
 'is_DPD': ['max', 'min', 'sum', 'mean', 'std'],
 'is_DPD_DEF': ['max', 'min', 'sum', 'mean', 'std']}

After agg: (86905, 25)


In [21]:
# Display the per-feature evaluation (name, auc, corr, coverage) of the numerical aggregates
eval_agg02

Unnamed: 0,name,auc,corr,coverage
14,CNT_DRAWINGS_CURRENT_std,0.626664,0.113388,0.992666
13,CNT_DRAWINGS_CURRENT_mean,0.625881,0.086031,1.0
10,CNT_DRAWINGS_CURRENT_max,0.61533,0.10565,1.0
12,CNT_DRAWINGS_CURRENT_sum,0.596246,0.053039,1.0
3,MONTHS_BALANCE_mean,0.562136,-0.0609,1.0
0,MONTHS_BALANCE_max,0.560056,-0.060163,1.0
2,MONTHS_BALANCE_sum,0.559231,-0.0579,1.0
4,MONTHS_BALANCE_std,0.557682,-0.059378,0.992666
7,AMT_CREDIT_LIMIT_ACTUAL_sum,0.549694,-0.04639,1.0
1,MONTHS_BALANCE_min,0.523282,-0.028148,1.0


#### Continuous features

In [22]:
# Continuous attributes = columns whose dtype is float64
series_type = credit_card_balance.dtypes
ls_con = [col for col, dtype in series_type.items() if dtype == "float64"]
ls_con

['AMT_BALANCE',
 'AMT_DRAWINGS_ATM_CURRENT',
 'AMT_DRAWINGS_CURRENT',
 'AMT_DRAWINGS_OTHER_CURRENT',
 'AMT_DRAWINGS_POS_CURRENT',
 'AMT_INST_MIN_REGULARITY',
 'AMT_PAYMENT_CURRENT',
 'AMT_PAYMENT_TOTAL_CURRENT',
 'AMT_RECEIVABLE_PRINCIPAL',
 'AMT_RECIVABLE',
 'AMT_TOTAL_RECEIVABLE',
 'CNT_DRAWINGS_ATM_CURRENT',
 'CNT_DRAWINGS_OTHER_CURRENT',
 'CNT_DRAWINGS_POS_CURRENT',
 'CNT_INSTALMENT_MATURE_CUM']

In [23]:
# Independent copy of credit_card_balance limited to the ID columns and the
# continuous attributes in ls_con.
pdf_con = credit_card_balance.loc[:, ["SK_ID_PREV", "SK_ID_CURR", *ls_con]].copy()

print(pdf_con.shape)

(3227965, 17)


In [24]:
# Aggregate the continuous attributes (SK_ID_PREV is left out of the input) with
# max / min / sum / mean / std, keyed by SK_ID_CURR.
# agg_common_data prints the aggregation spec it applies and the shape of the result.
pdf_agg03 = agg_common_data(pdf_con[["SK_ID_CURR"] + ls_con], ["max", "min", "sum", "mean", "std"], main_key="SK_ID_CURR")

# Evaluate features of the aggregated data and save the results in the eval_agg03 variable.
# The result is a table with name / auc / corr / coverage columns, one row per feature.
eval_agg03 = feature_evaluate(application_train_filtered, pdf_agg03)


{'AMT_BALANCE': ['max', 'min', 'sum', 'mean', 'std'],
 'AMT_DRAWINGS_ATM_CURRENT': ['max', 'min', 'sum', 'mean', 'std'],
 'AMT_DRAWINGS_CURRENT': ['max', 'min', 'sum', 'mean', 'std'],
 'AMT_DRAWINGS_OTHER_CURRENT': ['max', 'min', 'sum', 'mean', 'std'],
 'AMT_DRAWINGS_POS_CURRENT': ['max', 'min', 'sum', 'mean', 'std'],
 'AMT_INST_MIN_REGULARITY': ['max', 'min', 'sum', 'mean', 'std'],
 'AMT_PAYMENT_CURRENT': ['max', 'min', 'sum', 'mean', 'std'],
 'AMT_PAYMENT_TOTAL_CURRENT': ['max', 'min', 'sum', 'mean', 'std'],
 'AMT_RECEIVABLE_PRINCIPAL': ['max', 'min', 'sum', 'mean', 'std'],
 'AMT_RECIVABLE': ['max', 'min', 'sum', 'mean', 'std'],
 'AMT_TOTAL_RECEIVABLE': ['max', 'min', 'sum', 'mean', 'std'],
 'CNT_DRAWINGS_ATM_CURRENT': ['max', 'min', 'sum', 'mean', 'std'],
 'CNT_DRAWINGS_OTHER_CURRENT': ['max', 'min', 'sum', 'mean', 'std'],
 'CNT_DRAWINGS_POS_CURRENT': ['max', 'min', 'sum', 'mean', 'std'],
 'CNT_INSTALMENT_MATURE_CUM': ['max', 'min', 'sum', 'mean', 'std']}

After agg: (86905, 75)


In [25]:
# Display the per-feature evaluation (name, auc, corr, coverage) of the continuous aggregates
eval_agg03

Unnamed: 0,name,auc,corr,coverage
58,CNT_DRAWINGS_ATM_CURRENT_mean,0.620115,0.117511,1.000000
59,CNT_DRAWINGS_ATM_CURRENT_std,0.615169,0.119314,0.992666
3,AMT_BALANCE_mean,0.609751,0.091940,1.000000
13,AMT_DRAWINGS_CURRENT_mean,0.609688,0.057302,1.000000
43,AMT_RECEIVABLE_PRINCIPAL_mean,0.608929,0.090737,1.000000
...,...,...,...,...
62,CNT_DRAWINGS_OTHER_CURRENT_sum,0.501715,-0.003638,1.000000
26,AMT_INST_MIN_REGULARITY_min,0.500203,0.002546,1.000000
31,AMT_PAYMENT_CURRENT_min,0.500091,-0.000185,1.000000
16,AMT_DRAWINGS_OTHER_CURRENT_min,0.500022,-0.001518,1.000000


In [26]:
# Continuous-aggregate features whose AUC is at most 0.501; print how many there are.
# This is only a count: pdf_agg03 itself is not reduced by this cell.
filtered_features_agg03 = eval_agg03.loc[eval_agg03["auc"] <= 0.501]
print(filtered_features_agg03.shape)


(4, 4)


#### Save features

In [27]:
# Combine the DataFrames pdf_agg01, pdf_agg02, and pdf_agg03 into a new DataFrame pdf_feat.
# DataFrame.join aligns on the index and, by default, keeps the rows of the left frame.
# NOTE(review): only pdf_agg01 was reduced to features with auc > 0.501;
# pdf_agg02 and pdf_agg03 are joined with all of their columns — confirm this is intended.
pdf_feat = pdf_agg01.join(pdf_agg02).join(pdf_agg03)

print(pdf_feat.shape)

(86905, 111)


In [28]:
# Persist the combined feature table as a bz2-compressed pickle under features/.
feature_name = "credit_card_balance"
fname = os.path.join("features", "{}.pkl.bz2".format(feature_name))

# to_pickle does not create missing directories, so make sure the target folder
# exists; exist_ok keeps re-runs from failing.
os.makedirs(os.path.dirname(fname), exist_ok=True)

pdf_feat.to_pickle(fname, compression="bz2")
print("Store features completed!")

Store features completed!
