In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the credit card balance table from CSV (path is relative to the working directory).
credit = pd.read_csv('DATA/credit_card_balance.csv/credit_card_balance.csv')

In [5]:
# One-hot encode every column that one_hot_encoder treats as categorical and keep the
# names of the dummy columns it created.
# NOTE: one_hot_encoder is defined in a cell further down this file, so that cell has to
# be run first. This cell also rebinds `credit`; running it a second time returns an empty
# `categorical_columns`, because the encoded source columns are no longer present.
credit, categorical_columns = one_hot_encoder(credit, nan_as_category= False)

In [8]:
# Show the dummy column names returned by one_hot_encoder.
categorical_columns

['NAME_CONTRACT_STATUS_Active',
 'NAME_CONTRACT_STATUS_Approved',
 'NAME_CONTRACT_STATUS_Completed',
 'NAME_CONTRACT_STATUS_Demand',
 'NAME_CONTRACT_STATUS_Refused',
 'NAME_CONTRACT_STATUS_Sent proposal',
 'NAME_CONTRACT_STATUS_Signed']

Ratio features

In [6]:
# Ratio features. A zero denominator is mapped to NaN before dividing, so the ratio is
# missing (NaN) instead of +/-inf; a single inf would otherwise dominate every max/mean
# aggregate computed from these columns.
credit['BALANCE_TO_LIMIT_RATIO'] = (
    credit['AMT_BALANCE'] / credit['AMT_CREDIT_LIMIT_ACTUAL'].replace(0, np.nan)
)
credit['PAYMENT_CURRENT_TO_MIN_REGULAR_RATIO'] = (
    credit['AMT_PAYMENT_CURRENT'] / credit['AMT_INST_MIN_REGULARITY'].replace(0, np.nan)
)
credit['DRAWING_LIMIT_RATIO'] = (
    credit['AMT_DRAWINGS_ATM_CURRENT'] / credit['AMT_CREDIT_LIMIT_ACTUAL'].replace(0, np.nan)
)

In [7]:
# Flag rows where SK_DPD is positive (1) versus zero/negative/missing (0).
# The vectorized comparison replaces a per-row Python lambda; missing values compare as
# False and therefore still map to 0, exactly as before.
credit['LATE_PAYMENT'] = (credit['SK_DPD'] > 0).astype('int64')

In [14]:
# Every one-hot indicator column is averaged per group; the numeric columns use the
# aggregations listed in CREDIT_CARD_AGG.
# NOTE: CREDIT_CARD_AGG and group() are defined in cells further down this file; those
# cells must have been run before this one.
categorical_agg = {dummy_col: ['mean'] for dummy_col in categorical_columns}
credit_agg = group(
    credit,
    'CREDIT_',
    {**CREDIT_CARD_AGG, **categorical_agg},
)

In [13]:
# Aggregation spec for the full-history features: maps each column of `credit` to the list
# of aggregation functions handed to groupby().agg() by group().
# Key order here determines the order of the resulting CREDIT_* columns.
# NOTE(review): DRAWING_LIMIT_RATIO is computed in an earlier cell but has no entry here,
# so it never reaches the aggregated output — confirm whether that is intended.
CREDIT_CARD_AGG = {
    'SK_ID_PREV': ['nunique'],
    'MONTHS_BALANCE': ['min'],
    'AMT_BALANCE': ['max'],
    'AMT_CREDIT_LIMIT_ACTUAL': ['max'],
    'AMT_DRAWINGS_ATM_CURRENT': ['max', 'sum'],
    'AMT_DRAWINGS_CURRENT': ['max', 'sum'],
    'AMT_DRAWINGS_POS_CURRENT': ['max', 'sum'],
    'AMT_INST_MIN_REGULARITY': ['max', 'mean'],
    'AMT_PAYMENT_TOTAL_CURRENT': ['max', 'mean', 'sum', 'var'],
    'AMT_TOTAL_RECEIVABLE': ['max', 'mean'],
    'CNT_DRAWINGS_ATM_CURRENT': ['max', 'mean', 'sum'],
    'CNT_DRAWINGS_CURRENT': ['max', 'mean', 'sum'],
    'CNT_DRAWINGS_POS_CURRENT': ['mean'],
    'SK_DPD': ['mean', 'max', 'sum'],
    'SK_DPD_DEF': ['max', 'sum'],
    'BALANCE_TO_LIMIT_RATIO': ['max', 'mean'],
    'PAYMENT_CURRENT_TO_MIN_REGULAR_RATIO': ['min', 'mean'],
    'LATE_PAYMENT': ['max', 'sum'],
}

In [22]:
## Time-window aggregates
# For each window, collect the SK_ID_PREV values that have at least one row with
# MONTHS_BALANCE >= -months, aggregate the rows belonging to those ids, and left-join the
# result onto credit_agg under a 'CREDIT_<months>M_' prefix.
# NOTE(review): the second filter keeps every row of the selected ids, including rows
# older than the window — confirm that this, rather than a row-level MONTHS_BALANCE
# filter, is what is wanted.
for months in (12, 24, 36, 48):
    credit_prev_ids = credit.loc[credit['MONTHS_BALANCE'] >= -months, 'SK_ID_PREV'].unique()
    last_x_month_prev = credit.loc[credit['SK_ID_PREV'].isin(credit_prev_ids)]
    prefix = f'CREDIT_{months}M_'
    credit_agg = group_and_merge(
        last_x_month_prev,
        credit_agg,
        prefix,
        CREDIT_CARD_TIME_AGG,
    )

In [21]:
# Reduced aggregation spec used for the per-window features (CREDIT_12M_, CREDIT_24M_, ...).
# NOTE: the window loop in the preceding cell reads this constant, so this cell has to be
# run before that one.
CREDIT_CARD_TIME_AGG = {
    'CNT_DRAWINGS_ATM_CURRENT': ['mean'],
    'SK_DPD': ['max', 'sum'],
    'AMT_BALANCE': ['mean', 'max'],
    'BALANCE_TO_LIMIT_RATIO': ['max', 'mean'],
}

In [3]:
## One hot encoder

def one_hot_encoder(df, nan_as_category = True):
    """One-hot encode the text columns of ``df``.

    A column is treated as categorical when its dtype is ``object`` or the pandas
    string dtype (``pd.StringDtype``). Checking only for ``object`` silently skips
    text columns stored with the dedicated string dtype, which would leave them
    un-encoded without any error.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    nan_as_category : bool, default True
        Passed to ``pd.get_dummies`` as ``dummy_na``: when True, an extra indicator
        column is added for missing values of every encoded column.

    Returns
    -------
    tuple[pandas.DataFrame, list]
        The encoded frame and the names of the columns that ``get_dummies`` added,
        in the order they appear in the encoded frame.
    """
    # A set keeps the "is this column new?" lookup below O(1) per column.
    original_columns = set(df.columns)

    categorical_columns = [
        col for col in df.columns
        if df[col].dtype == 'object' or isinstance(df[col].dtype, pd.StringDtype)
    ]

    # One hot encoder
    df = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)

    # New columns created by get_dummies
    categorical_columns = [col for col in df.columns if col not in original_columns]

    return df, categorical_columns

In [24]:
# Persist the aggregated features as a gzip-compressed parquet file, without the index.
# NOTE(review): the '.gzip' extension hides that the file is parquet — confirm downstream
# readers load it as parquet before renaming anything.
credit_agg.to_parquet('TO_TRAIN/credit.gzip', compression= 'gzip', index= False)

In [4]:
# Aggregate columns per key and flatten the resulting column names
def group(df_to_agg, prefix, aggregations, aggregate_by= 'SK_ID_CURR'):
    """Group ``df_to_agg`` by ``aggregate_by`` and apply ``aggregations``.

    ``aggregations`` maps column names to lists of aggregation functions. Each
    (column, function) pair of the result is renamed to
    ``<prefix><column>_<FUNCTION>`` with the function name upper-cased, and the
    grouping key is returned as a regular column.
    """
    per_key = df_to_agg.groupby(aggregate_by).agg(aggregations)

    flat_names = []
    for column_key in per_key.columns.tolist():
        flat_names.append(f'{prefix}{column_key[0]}_{column_key[1].upper()}')
    per_key.columns = pd.Index(flat_names)

    return per_key.reset_index()

# Aggregate with group(), then left-join the result onto an existing table
def group_and_merge(df_to_agg, df_to_merge, prefix, aggregations, aggregate_by= 'SK_ID_CURR'):
    """Aggregate ``df_to_agg`` via ``group`` and left-merge it into ``df_to_merge``.

    The join key is ``aggregate_by``; every row of ``df_to_merge`` is kept, and keys
    that are absent from the aggregated frame get missing values in the new columns.
    """
    aggregated = group(df_to_agg, prefix, aggregations, aggregate_by=aggregate_by)
    merged = df_to_merge.merge(aggregated, on=aggregate_by, how='left')
    return merged