In [10]:
import pandas as pd
import numpy as np
import gc

In [2]:
# Load the full previous-application table. The cells below read and modify
# this notebook-level `prev` frame.
prev = pd.read_csv('../input/previous_application.csv')

In [3]:
# 365243 is treated as a missing-value placeholder in these day-offset columns.
# The cleaned columns are assigned back instead of calling
# replace(..., inplace=True) on a column selection: that chained form is
# deprecated and leaves `prev` unchanged once pandas Copy-on-Write is active.
days_cols = [
    'DAYS_FIRST_DRAWING',
    'DAYS_FIRST_DUE',
    'DAYS_LAST_DUE_1ST_VERSION',
    'DAYS_LAST_DUE',
    'DAYS_TERMINATION',
]
prev[days_cols] = prev[days_cols].replace(365243, np.nan)
# Add feature: value ask / value received percentage
# A zero AMT_CREDIT is mapped to NaN first so the ratio cannot become +/-inf.
prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT'].replace(0, np.nan)

In [4]:
def cate_cols(df=None):
    """Yield the label of every column whose dtype is ``object``.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``prev`` frame is
        used, so existing ``cate_cols()`` calls keep working.

    Yields
    ------
    Column labels, in the frame's column order.
    """
    # Resolve the frame lazily (on first iteration) so the no-argument form
    # still picks up whatever `prev` is bound to at that moment.
    frame = prev if df is None else df
    for col in frame.columns:
        if frame[col].dtype == 'object':
            yield col

# List each object-dtype column next to its number of distinct values
# (Series.nunique leaves NaN out of the count by default).
for col in cate_cols():
    print(col, '\t', prev[col].nunique())

NAME_CONTRACT_TYPE 	 4
WEEKDAY_APPR_PROCESS_START 	 7
FLAG_LAST_APPL_PER_CONTRACT 	 2
NAME_CASH_LOAN_PURPOSE 	 25
NAME_CONTRACT_STATUS 	 4
NAME_PAYMENT_TYPE 	 4
CODE_REJECT_REASON 	 9
NAME_TYPE_SUITE 	 7
NAME_CLIENT_TYPE 	 4
NAME_GOODS_CATEGORY 	 28
NAME_PORTFOLIO 	 5
NAME_PRODUCT_TYPE 	 3
CHANNEL_TYPE 	 8
NAME_SELLER_INDUSTRY 	 11
NAME_YIELD_GROUP 	 5
PRODUCT_COMBINATION 	 17


In [5]:
def one_hot_encoding(df, nan_as_category=True):
    """One-hot encode every ``object``-dtype column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    nan_as_category : bool, default True
        Passed to ``pd.get_dummies`` as ``dummy_na``.

    Returns
    -------
    (pandas.DataFrame, list)
        The encoded frame and the labels of the columns that were not present
        in ``df``, in the order they appear in the encoded frame.
    """
    columns_before = set(df.columns)

    object_columns = []
    for name in df.columns:
        if df[name].dtype == 'object':
            object_columns.append(name)

    encoded = pd.get_dummies(
        df,
        columns=object_columns,
        dummy_na=nan_as_category,
    )
    added = [name for name in encoded.columns if name not in columns_before]
    return encoded, added

In [80]:
def extract_days_diff(df):
    """Print the ID / first-due columns ordered per client.

    Rows are ordered by ``SK_ID_CURR`` ascending and, within one
    ``SK_ID_CURR``, by ``DAYS_FIRST_DUE`` descending with missing values last.
    Only ``SK_ID_CURR``, ``SK_ID_PREV`` and ``DAYS_FIRST_DUE`` are kept and the
    index is renumbered from 0.

    Note: despite the name, no difference is computed yet; the ordered frame
    is only printed and the function returns ``None``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the three columns named above. It is not modified.
    """
    # One two-key sort replaces groupby().apply(sort_values): it keeps
    # SK_ID_CURR as a regular column regardless of how GroupBy.apply treats
    # grouping columns, breaks ties deterministically (input order), and
    # avoids a Python-level call per client.
    ret = (
        df[['SK_ID_CURR', 'SK_ID_PREV', 'DAYS_FIRST_DUE']]
        .sort_values(['SK_ID_CURR', 'DAYS_FIRST_DUE'], ascending=[True, False])
        .reset_index(drop=True)
    )
    print(ret)
# Try the helper on the first 10,000 rows of `prev` only.
extract_days_diff(prev[:10000])

      SK_ID_CURR  SK_ID_PREV  DAYS_FIRST_DUE
0         100009     1413109          -418.0
1         100035     1339797             NaN
2         100043     1185699         -2380.0
3         100067     2359184             NaN
4         100077     2670402             NaN
5         100121     2117048         -1951.0
6         100124     1986247          -278.0
7         100144     1370544           -70.0
8         100151     1614986         -1011.0
9         100170     2274942             NaN
10        100201     2227969         -2393.0
11        100271     1550165          -920.0
12        100279     2034342             NaN
13        100281     1158695          -939.0
14        100293     1874254             NaN
15        100307     2685304             NaN
16        100373     1891648         -1013.0
17        100387     1189722           -80.0
18        100422     1115278             NaN
19        100425     1539382          -558.0
20        100478     1858047             NaN
21        

In [14]:
def previous_applications(num_rows=None, nan_as_category=True):
    """Build one row of previous-application features per ``SK_ID_CURR``.

    Parameters
    ----------
    num_rows : int, optional
        Passed to ``pd.read_csv`` as ``nrows``; ``None`` reads the whole file.
    nan_as_category : bool, default True
        Forwarded to ``one_hot_encoding`` (and from there to
        ``pd.get_dummies(dummy_na=...)``).

    Returns
    -------
    pandas.DataFrame
        ``SK_ID_CURR`` as a regular column, followed by
        ``PREV_*`` aggregates over all rows (numeric stats plus the mean of
        every one-hot column), the ``NEW_*`` count/diversity columns, and
        ``APPROVED_*`` / ``REFUSED_*`` numeric aggregates restricted to rows
        whose ``NAME_CONTRACT_STATUS`` one-hot flag is set.

    Raises
    ------
    KeyError
        If the loaded rows contain no ``Approved`` or no ``Refused`` status,
        because the corresponding one-hot column then does not exist.
    """
    prev = pd.read_csv('../input/previous_application.csv', nrows=num_rows)
    # Days 365243 values -> nan.
    # Assign back rather than replace(..., inplace=True) on a column
    # selection: the chained form is deprecated and does not update `prev`
    # once pandas Copy-on-Write is active.
    days_cols = [
        'DAYS_FIRST_DRAWING',
        'DAYS_FIRST_DUE',
        'DAYS_LAST_DUE_1ST_VERSION',
        'DAYS_LAST_DUE',
        'DAYS_TERMINATION',
    ]
    prev[days_cols] = prev[days_cols].replace(365243, np.nan)

    # Add several simple features.
    # A zero AMT_CREDIT is mapped to NaN so the ratios cannot become +/-inf
    # (an inf would also turn the per-client max/mean into inf).
    credit = prev['AMT_CREDIT'].replace(0, np.nan)
    prev['NEW_APP_TO_CREDIT_RATIO'] = prev['AMT_APPLICATION'] / credit
    prev['NEW_ANNUITY_TO_CREDIT'] = prev['AMT_ANNUITY'] / credit
    prev['NEW_DOWN_TO_CREDIT'] = prev['AMT_DOWN_PAYMENT'] / credit
    prev['NEW_PRICE_TO_CREDIT'] = prev['AMT_GOODS_PRICE'] / credit
    prev['NEW_LAST_DUE_SUB_FIRST'] = prev['DAYS_LAST_DUE'] - prev['DAYS_FIRST_DUE']

    def app_diversity_on_cate_cols(df, process_info):
        """Per client: application count, and count / distinct values per column.

        For every column name in ``process_info`` two columns are added:
        ``NEW_N_UNIQUE_ON_<col>`` (distinct non-null values) and
        ``NEW_USR_APP_DIVERSITY_ON_<col>`` (application count divided by it).
        """
        by_client = df.groupby('SK_ID_CURR')
        ret = by_client['SK_ID_PREV'].count().\
                    reset_index().\
                    rename(columns={'SK_ID_PREV': 'NEW_USR_APP_CNT'})

        for col_name in process_info:
            new_col_name = 'NEW_N_UNIQUE_ON_' + col_name
            gby = by_client[col_name].nunique().\
                    reset_index().\
                    rename(columns={col_name: new_col_name})
            ret = ret.merge(gby, on='SK_ID_CURR', how='left')
            # nunique() is 0 when all of a client's values are missing;
            # use NaN there instead of dividing by zero.
            ret['NEW_USR_APP_DIVERSITY_ON_' + col_name] = \
                ret['NEW_USR_APP_CNT'] / ret[new_col_name].replace(0, np.nan)

        return ret

    def aggregate_by_client(frame, spec, prefix):
        """Group by SK_ID_CURR, apply ``spec`` and flatten to PREFIX_COL_STAT names."""
        agg = frame.groupby('SK_ID_CURR').agg(spec)
        agg.columns = pd.Index(
            [prefix + e[0] + "_" + e[1].upper() for e in agg.columns.tolist()])
        return agg

    # Diversity features need the raw categorical values, so compute them
    # before the columns are one-hot encoded.
    categorical_columns = [col for col in prev.columns if prev[col].dtype == 'object']
    diversity_df = app_diversity_on_cate_cols(prev, categorical_columns)
    # Honour the caller's nan_as_category (it used to be overridden with True).
    prev, cat_cols = one_hot_encoding(prev, nan_as_category=nan_as_category)
    # Previous applications numeric features
    num_aggregations = {
        'AMT_ANNUITY': ['max', 'mean'],
        'AMT_APPLICATION': ['max', 'mean'],
        'AMT_CREDIT': ['max', 'mean'],
        'AMT_DOWN_PAYMENT': ['max', 'mean'],
        'AMT_GOODS_PRICE': ['max', 'mean'],
        'HOUR_APPR_PROCESS_START': ['max', 'mean'],

        'RATE_DOWN_PAYMENT':  ['max', 'mean'],
        'RATE_INTEREST_PRIMARY': ['max', 'mean'],
        'RATE_INTEREST_PRIVILEGED': ['max', 'mean'],
        'DAYS_DECISION': ['max', 'mean'],
        'CNT_PAYMENT': ['mean', 'sum'],

        'DAYS_FIRST_DUE': ['min'],
        'DAYS_LAST_DUE': ['max'],

        'NEW_APP_TO_CREDIT_RATIO': ['mean'],
        'NEW_ANNUITY_TO_CREDIT': ['mean'],
        'NEW_DOWN_TO_CREDIT': ['max', 'mean'],
        'NEW_PRICE_TO_CREDIT': ['max', 'mean'],
        'NEW_LAST_DUE_SUB_FIRST': ['max', 'mean'],
    }
    # Previous applications categorical features: the mean of a one-hot
    # column is the share of the client's applications in that category.
    cat_aggregations = {cat: ['mean'] for cat in cat_cols}
    prev_agg = aggregate_by_client(
        prev, {**num_aggregations, **cat_aggregations}, 'PREV_')
    # add the diversity features; reset_index() turns SK_ID_CURR into a
    # regular column so the merge and the joins below all key on a column.
    prev_agg = prev_agg.reset_index().merge(diversity_df, on='SK_ID_CURR', how='left')

    # Approved / Refused applications - only numerical features
    for status, prefix in (('Approved', 'APPROVED_'), ('Refused', 'REFUSED_')):
        subset = prev[prev['NAME_CONTRACT_STATUS_' + status] == 1]
        subset_agg = aggregate_by_client(subset, num_aggregations, prefix)
        prev_agg = prev_agg.join(subset_agg, how='left', on='SK_ID_CURR')
    # Release the large intermediates before returning.
    del subset, subset_agg, prev
    gc.collect()
    return prev_agg

# Build the feature table from the first 1,000 CSV rows only
# (num_rows is passed to read_csv as nrows).
prev_agg = previous_applications(num_rows=1000)

In [15]:
# Show the generated feature names, one per line.
for col in prev_agg.columns:
    print(col)

SK_ID_CURR
PREV_AMT_ANNUITY_MAX
PREV_AMT_ANNUITY_MEAN
PREV_AMT_APPLICATION_MAX
PREV_AMT_APPLICATION_MEAN
PREV_AMT_CREDIT_MAX
PREV_AMT_CREDIT_MEAN
PREV_AMT_DOWN_PAYMENT_MAX
PREV_AMT_DOWN_PAYMENT_MEAN
PREV_AMT_GOODS_PRICE_MAX
PREV_AMT_GOODS_PRICE_MEAN
PREV_HOUR_APPR_PROCESS_START_MAX
PREV_HOUR_APPR_PROCESS_START_MEAN
PREV_RATE_DOWN_PAYMENT_MAX
PREV_RATE_DOWN_PAYMENT_MEAN
PREV_RATE_INTEREST_PRIMARY_MAX
PREV_RATE_INTEREST_PRIMARY_MEAN
PREV_RATE_INTEREST_PRIVILEGED_MAX
PREV_RATE_INTEREST_PRIVILEGED_MEAN
PREV_DAYS_DECISION_MAX
PREV_DAYS_DECISION_MEAN
PREV_CNT_PAYMENT_MEAN
PREV_CNT_PAYMENT_SUM
PREV_DAYS_FIRST_DUE_MIN
PREV_DAYS_LAST_DUE_MAX
PREV_NEW_APP_TO_CREDIT_RATIO_MEAN
PREV_NEW_ANNUITY_TO_CREDIT_MEAN
PREV_NEW_DOWN_TO_CREDIT_MAX
PREV_NEW_DOWN_TO_CREDIT_MEAN
PREV_NEW_PRICE_TO_CREDIT_MAX
PREV_NEW_PRICE_TO_CREDIT_MEAN
PREV_NEW_LAST_DUE_SUB_FIRST_MAX
PREV_NEW_LAST_DUE_SUB_FIRST_MEAN
PREV_NAME_CONTRACT_TYPE_Cash loans_MEAN
PREV_NAME_CONTRACT_TYPE_Consumer loans_MEAN
PREV_NAME_CONTRACT_TYPE_Revo