In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.base import TransformerMixin
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.utils import resample
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,roc_auc_score,classification_report,roc_curve,auc, f1_score

import import_ipynb
from function_for_eda import *

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


importing Jupyter notebook from function_for_eda.ipynb


# 5. POS_CASH_balance.csv
Table pos_cash_balance chứa Monthly Balance Snapshots of previous Point of Sales and Cash Loans của các khoản vay trước đây của khách hàng tại Home Credit. Từ các bước EDA trước đó, chúng ta tiến hành các bước Feature Engineering như sau:
- <b>Bước 1</b>: Giống với bảng bureau_balance, dữ liệu ở bảng pos_cash có liên quan đến time series, nên chúng ta có thể tạo thêm feature EWM (exponentially weighted mean) từ hai feature CNT_INSTALMENT và CNT_INSTALMENT_FUTURE, cùng một số feature khác
- <b>Bước 2</b>: Thực hiện các aggregations thông qua SK_ID_PREV bằng các phép aggregation như min, max, sum, count,... Đầu tiên nhóm 12 aggregate trên toàn bộ data, sau đó aggregate những bản ghi trong 2 năm gần nhất và aggregate qua những năm còn lại. Cuối cùng là aggregate data qua column NAME_CONTRACT_STATUS với các giá trị Active, Completed và nhóm các trạng thái còn lại. Việc lựa chọn các phép aggregation dựa trên domain knowledge, quá trình EDA, cũng như một số solutions tham khảo khác
- <b>Bước 3</b>: Cuối cùng là thực hiện aggregation trên toàn bộ data qua SK_ID_CURR


In [2]:
# Read the POS/cash balance table from the project dataset folder (path is
# relative to the notebook's working directory) and show it as the cell output.
pos_cash_csv = './dseb63_final_project_DP_dataset/dseb63_POS_CASH_balance.csv'
pos_cash = pd.read_csv(pos_cash_csv)
pos_cash

FileNotFoundError: [Errno 2] No such file or directory: './dseb63_final_project_DP_dataset/dseb63_POS_CASH_balance.csv'

In [None]:
# Work with the magnitude of MONTHS_BALANCE so every value is non-negative.
pos_cash['MONTHS_BALANCE'] = pos_cash['MONTHS_BALANCE'].abs()

# Descending sort on (SK_ID_PREV, MONTHS_BALANCE): within each SK_ID_PREV the
# largest month offsets come first and the smallest last, which is the row order
# the rolling computation below walks through.
# NOTE(review): presumably a larger offset means an older snapshot, i.e. this is
# oldest -> latest as the original comment stated; confirm against the data dictionary.
pos_cash = pos_cash.sort_values(by=['SK_ID_PREV', 'MONTHS_BALANCE'], ascending=False)

# Exponentially weighted mean (alpha = 0.6) of the instalment counters,
# computed independently inside every SK_ID_PREV group.
columns_for_ema = ['CNT_INSTALMENT', 'CNT_INSTALMENT_FUTURE']
exp_columns = [f'EXP_{name}' for name in columns_for_ema]
pos_cash[exp_columns] = (
    pos_cash
    .groupby('SK_ID_PREV')[columns_for_ema]
    .transform(lambda history: history.ewm(alpha=0.6).mean())
)

# Hand-crafted features.
# The small constant keeps the denominator non-zero when SK_DPD_DEF is 0.
pos_cash['SK_DPD_RATIO'] = pos_cash['SK_DPD'].div(pos_cash['SK_DPD_DEF'] + 0.00001)
# Plain element-wise sums: a missing value in either operand yields NaN.
pos_cash['TOTAL_TERM'] = pos_cash['CNT_INSTALMENT'].add(pos_cash['CNT_INSTALMENT_FUTURE'])
pos_cash['EXP_POS_TOTAL_TERM'] = pos_cash['EXP_CNT_INSTALMENT'].add(pos_cash['EXP_CNT_INSTALMENT_FUTURE'])


In [None]:
def _subset_aggregation_spec():
    """Return a fresh column -> aggregation-list mapping used for row subsets.

    A new dict (with new lists) is built on every call so the specs derived
    from it never share mutable state.
    """
    return {
        'CNT_INSTALMENT': ['mean', 'max', 'min'],
        'CNT_INSTALMENT_FUTURE': ['mean', 'max', 'min'],
        'SK_DPD': ['max', 'sum'],
        'SK_DPD_DEF': ['max', 'sum'],
        'EXP_CNT_INSTALMENT': ['last'],
        'EXP_CNT_INSTALMENT_FUTURE': ['last'],
        'SK_DPD_RATIO': ['mean', 'max'],
        'TOTAL_TERM': ['mean', 'max'],
        'EXP_POS_TOTAL_TERM': ['last'],
    }


# Spec applied to the per-year subsets and to the per-contract-status subsets.
aggregations_for_year = _subset_aggregation_spec()
aggregations_for_categories = _subset_aggregation_spec()

# Spec applied to the full history. It starts with the customer key and the
# month offset, then reuses the subset spec with two overrides. Re-assigning an
# existing key keeps its position, so the key order is
# SK_ID_CURR, MONTHS_BALANCE, followed by the subset spec's order.
overall_aggregations = {
    'SK_ID_CURR': ['first'],
    'MONTHS_BALANCE': ['max'],
    **_subset_aggregation_spec(),
}
overall_aggregations['TOTAL_TERM'] = ['mean', 'max', 'last']
overall_aggregations['EXP_POS_TOTAL_TERM'] = ['mean']

In [None]:
def _aggregate_by_prev(frame, aggregations, suffix=''):
    """Aggregate `frame` per SK_ID_PREV and flatten the resulting column names.

    Parameters
    ----------
    frame : pd.DataFrame
        Rows to aggregate; must contain SK_ID_PREV and every key of `aggregations`.
    aggregations : dict
        Mapping column name -> list of aggregation names, passed to `.agg`.
    suffix : str
        Text appended to every flattened column name.

    Returns
    -------
    pd.DataFrame
        Indexed by SK_ID_PREV, columns named '<COLUMN>_<AGG>' upper-cased + suffix.
    """
    grouped = frame.groupby('SK_ID_PREV').agg(aggregations)
    grouped.columns = ['_'.join(parts).upper() + suffix for parts in grouped.columns]
    return grouped


# --- aggregation over the full history of every previous loan ----------------
pos_cash_aggregated_overall = _aggregate_by_prev(pos_cash, overall_aggregations)
pos_cash_aggregated_overall = pos_cash_aggregated_overall.rename(columns={'SK_ID_CURR_FIRST': 'SK_ID_CURR'})

# --- year-wise aggregations ---------------------------------------------------
# Bucket the (non-negative) month offset into whole years: 0, 1, and "2 or more".
pos_cash['YEAR_BALANCE'] = pos_cash['MONTHS_BALANCE'] // 12
yearly_groups = [
    _aggregate_by_prev(pos_cash[pos_cash['YEAR_BALANCE'] == year],
                       aggregations_for_year,
                       '_YEAR_' + str(year))
    for year in range(2)
]
pos_cash_aggregated_rest_years = _aggregate_by_prev(pos_cash[pos_cash['YEAR_BALANCE'] >= 2],
                                                    aggregations_for_year,
                                                    '_YEAR_REST')
# Outer merges on the SK_ID_PREV index level keep loans that are missing from
# some of the year buckets; the merge order fixes the final column order.
pos_cash_aggregated_year = yearly_groups[0]
for group in yearly_groups[1:] + [pos_cash_aggregated_rest_years]:
    pos_cash_aggregated_year = pos_cash_aggregated_year.merge(group, on='SK_ID_PREV', how='outer')
# YEAR_BALANCE was only a helper for the bucketing above.
pos_cash = pos_cash.drop(['YEAR_BALANCE'], axis=1)

# --- aggregations per NAME_CONTRACT_STATUS category ---------------------------
contract_type_categories = ['Active', 'Completed']
contract_status = pos_cash['NAME_CONTRACT_STATUS']
contract_groups = [
    _aggregate_by_prev(pos_cash[contract_status == contract_type],
                       aggregations_for_categories,
                       '_' + contract_type.upper())
    for contract_type in contract_type_categories
]
# Everything that is neither Active nor Completed (missing statuses included)
# is pooled into one "rest" group.
pos_cash_aggregated_rest_contract = _aggregate_by_prev(
    pos_cash[~contract_status.isin(contract_type_categories)],
    aggregations_for_categories,
    '_REST')
pos_cash_aggregated_contract = contract_groups[0]
for group in contract_groups[1:] + [pos_cash_aggregated_rest_contract]:
    pos_cash_aggregated_contract = pos_cash_aggregated_contract.merge(group, on='SK_ID_PREV', how='outer')

# --- combine overall, year-wise and status-wise aggregations ------------------
pos_cash_aggregated = pos_cash_aggregated_overall.merge(pos_cash_aggregated_year, on='SK_ID_PREV', how='outer')
pos_cash_aggregated = pos_cash_aggregated.merge(pos_cash_aggregated_contract, on='SK_ID_PREV', how='outer')

# --- share of months spent in each NAME_CONTRACT_STATUS -----------------------
# One-hot encode the status; the per-loan mean of each dummy is the fraction of
# that loan's rows carrying the status.
name_contract_dummies = pd.get_dummies(pos_cash['NAME_CONTRACT_STATUS'], prefix='CONTRACT')
contract_names = name_contract_dummies.columns.tolist()
# Drop dummies left over from an earlier run of this cell before attaching the
# fresh ones; otherwise re-running would duplicate the CONTRACT_* columns.
pos_cash = pd.concat([pos_cash.drop(columns=contract_names, errors='ignore'), name_contract_dummies], axis=1)
aggregated_cc_contract = pos_cash[['SK_ID_PREV'] + contract_names].groupby('SK_ID_PREV').mean()

# Final per-loan feature table.
pos_cash_aggregated = pos_cash_aggregated.merge(aggregated_cc_contract, on='SK_ID_PREV', how='outer')


In [None]:
# Roll the per-loan features up to one row per customer (SK_ID_CURR).
# The group key is excluded by name rather than by position, so the selection
# stays correct even if the column order of pos_cash_aggregated changes.
columns_to_aggregate = pos_cash_aggregated.columns.drop('SK_ID_CURR')

# Features that are already per-loan means get a richer summary (mean, sum, max);
# every other feature is simply averaged across the customer's loans.
aggregations_final = {
    col: ['mean', 'sum', 'max'] if 'MEAN' in col else ['mean']
    for col in columns_to_aggregate
}
pos_cash_aggregated_final = pos_cash_aggregated.groupby('SK_ID_CURR').agg(aggregations_final)
# Flatten the (feature, aggregation) column MultiIndex into FEATURE_AGG names.
pos_cash_aggregated_final.columns = ['_'.join(parts).upper() for parts in pos_cash_aggregated_final.columns]


In [None]:
# Move SK_ID_CURR from the index into a regular column.
# Guarded on the index name so that re-running this cell is a no-op instead of
# inserting a spurious 'index' column into the frame.
if pos_cash_aggregated_final.index.name == 'SK_ID_CURR':
    pos_cash_aggregated_final = pos_cash_aggregated_final.reset_index()

In [None]:
# Persist the customer-level features next to the notebook (row index omitted)
# and print a completion marker.
output_csv = 'pos_cash_final.csv'
pos_cash_aggregated_final.to_csv(output_csv, index=False)
print('done')