In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Function to aggregate numeric columns
def group(df_to_agg, prefix, aggregations, aggregate_by= 'SK_ID_CURR'):
    """Aggregate `df_to_agg` per `aggregate_by` key and flatten the column names.

    Parameters
    ----------
    df_to_agg : pandas.DataFrame
        Frame holding the rows to aggregate.
    prefix : str
        Text prepended to every generated column name.
    aggregations : dict
        Mapping of column name -> aggregation(s), passed to `GroupBy.agg`.
        A value may be a list of aggregations or a single aggregation
        (string or callable).
    aggregate_by : label or list of labels, default 'SK_ID_CURR'
        Grouping key(s).

    Returns
    -------
    pandas.DataFrame
        One row per group with the key(s) restored as regular columns and the
        aggregated columns named '<prefix><column>_<AGGREGATION>'.
    """
    if isinstance(aggregations, dict):
        # A bare spec such as {'col': 'mean'} makes pandas return flat column
        # labels, which the renaming below would index character by character.
        # Wrapping it in a list guarantees (column, aggregation) tuples.
        aggregations = {
            column: [spec] if isinstance(spec, str) or callable(spec) else spec
            for column, spec in aggregations.items()
        }
    agg_df = df_to_agg.groupby(aggregate_by).agg(aggregations)
    # Collapse the two-level (column, aggregation) labels into single strings.
    agg_df.columns = pd.Index(['{}{}_{}'.format(prefix, label[0], label[1].upper())
                               for label in agg_df.columns.tolist()])
    return agg_df.reset_index()

# Function to merge numeric and categorical after aggregate
def group_and_merge(df_to_agg, df_to_merge, prefix, aggregations, aggregate_by= 'SK_ID_CURR'):
    """Aggregate `df_to_agg` with `group` and left-join the result onto `df_to_merge`.

    The join key is `aggregate_by`; every row of `df_to_merge` is kept.
    """
    aggregated = group(
        df_to_agg,
        prefix,
        aggregations,
        aggregate_by=aggregate_by,
    )
    merged = df_to_merge.merge(aggregated, on=aggregate_by, how='left')
    return merged

In [3]:
## One hot encoder

def one_hot_encoder(df, categorical_columns = None, nan_as_category = True):
    """One-hot encode categorical columns of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    categorical_columns : list, optional
        Columns to encode. When empty or None, every column whose dtype is
        'object' is encoded.
    nan_as_category : bool, default True
        Forwarded to `pd.get_dummies` as `dummy_na`.

    Returns
    -------
    (pandas.DataFrame, list)
        The encoded frame and the names of the columns that were not present
        in the input.
    """
    columns_before = list(df.columns)

    # Fall back to all object-dtype columns when the caller names none
    if not categorical_columns:
        categorical_columns = []
        for name in df.columns:
            if df[name].dtype == 'object':
                categorical_columns.append(name)

    encoded = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)

    # Anything that did not exist before the call was created by get_dummies
    dummy_columns = []
    for name in encoded.columns:
        if name not in columns_before:
            dummy_columns.append(name)

    return encoded, dummy_columns

In [3]:
# Load the raw POS_CASH_balance table; the path is relative to the working directory.
pos = pd.read_csv('DATA/POS_CASH_balance.csv/POS_CASH_balance.csv')

In [46]:
# One-hot encode the object-dtype columns of `pos` (no extra NaN indicator column).
# NOTE(review): `pos` is rebound to the encoded frame, so re-running this cell
# without reloading `pos` finds no object columns left and presumably returns an
# empty `categorical_columns` list.
pos, categorical_columns = one_hot_encoder(pos, nan_as_category= False)

In [10]:
# Show the names of the dummy columns created by the encoder
categorical_columns

['NAME_CONTRACT_STATUS_Active',
 'NAME_CONTRACT_STATUS_Amortized debt',
 'NAME_CONTRACT_STATUS_Approved',
 'NAME_CONTRACT_STATUS_Canceled',
 'NAME_CONTRACT_STATUS_Completed',
 'NAME_CONTRACT_STATUS_Demand',
 'NAME_CONTRACT_STATUS_Returned to the store',
 'NAME_CONTRACT_STATUS_Signed',
 'NAME_CONTRACT_STATUS_XNA']

In [47]:
# Flag months with late payment (SK_DPD > 0 -> 1, otherwise 0).
# Vectorised comparison instead of a per-row Python lambda: same 0/1 int64
# result, and missing values still map to 0 because NaN > 0 is False.
pos['LATE_PAYMENT'] = (pos['SK_DPD'] > 0).astype('int64')

In [49]:
# Aggregations applied to the numeric POS_CASH columns. The spec is defined
# before the call that uses it so the notebook survives Restart & Run All
# (it was previously defined in a later cell, giving NameError on a fresh kernel).
POS_CASH_AGG = {
    'SK_ID_PREV': ['nunique'],
    'MONTHS_BALANCE': ['mean', 'max', 'min', 'size'],
    'SK_DPD': ['mean', 'max', 'sum', 'var'],
    'SK_DPD_DEF': ['mean', 'max', 'sum'],
    'LATE_PAYMENT': ['mean', 'sum'],
}

# aggregate by SK_ID_CURR
# Each status dummy is averaged, i.e. the share of rows with that status.
categorical_agg = {col: ['mean'] for col in categorical_columns}
pos_agg = group(pos, 'POS_', {**POS_CASH_AGG, **categorical_agg})

In [16]:
# Inspect the per-SK_ID_CURR aggregate table
pos_agg

Unnamed: 0,SK_ID_CURR,POS_SK_ID_PREV_NUNIQUE,POS_MONTHS_BALANCE_MEAN,POS_MONTHS_BALANCE_MAX,POS_MONTHS_BALANCE_MIN,POS_MONTHS_BALANCE_SIZE,POS_SK_DPD_MEAN,POS_SK_DPD_MAX,POS_SK_DPD_SUM,POS_SK_DPD_VAR,...,POS_LATE_PAYMENT_MEAN,POS_NAME_CONTRACT_STATUS_Active_MEAN,POS_NAME_CONTRACT_STATUS_Amortized debt_MEAN,POS_NAME_CONTRACT_STATUS_Approved_MEAN,POS_NAME_CONTRACT_STATUS_Canceled_MEAN,POS_NAME_CONTRACT_STATUS_Completed_MEAN,POS_NAME_CONTRACT_STATUS_Demand_MEAN,POS_NAME_CONTRACT_STATUS_Returned to the store_MEAN,POS_NAME_CONTRACT_STATUS_Signed_MEAN,POS_NAME_CONTRACT_STATUS_XNA_MEAN
0,100001,2,-72.555556,-53,-96,9,0.777778,7,7,5.444444,...,0.111111,0.777778,0.0,0.0,0.0,0.222222,0.0,0.0,0.000000,0.0
1,100002,1,-10.000000,-1,-19,19,0.000000,0,0,0.000000,...,0.000000,1.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
2,100003,3,-43.785714,-18,-77,28,0.000000,0,0,0.000000,...,0.000000,0.928571,0.0,0.0,0.0,0.071429,0.0,0.0,0.000000,0.0
3,100004,1,-25.500000,-24,-27,4,0.000000,0,0,0.000000,...,0.000000,0.750000,0.0,0.0,0.0,0.250000,0.0,0.0,0.000000,0.0
4,100005,1,-20.000000,-15,-25,11,0.000000,0,0,0.000000,...,0.000000,0.818182,0.0,0.0,0.0,0.090909,0.0,0.0,0.090909,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
337247,456251,1,-5.000000,-1,-9,9,0.000000,0,0,0.000000,...,0.000000,0.777778,0.0,0.0,0.0,0.111111,0.0,0.0,0.111111,0.0
337248,456252,1,-79.000000,-76,-82,7,0.000000,0,0,0.000000,...,0.000000,0.857143,0.0,0.0,0.0,0.142857,0.0,0.0,0.000000,0.0
337249,456253,3,-79.235294,-57,-96,17,0.294118,5,5,1.470588,...,0.058824,0.882353,0.0,0.0,0.0,0.117647,0.0,0.0,0.000000,0.0
337250,456254,2,-5.550000,-1,-11,20,0.000000,0,0,0.000000,...,0.000000,1.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0


In [18]:
# Order the rows by SK_ID_PREV and then by ascending MONTHS_BALANCE, so the
# groupby first()/last() calls in later cells read each SK_ID_PREV's rows in
# that order (presumably oldest month first — confirm the sign convention).
sort_pos = pos.sort_values(by= ['SK_ID_PREV', 'MONTHS_BALANCE'])
sort_pos

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,SK_DPD,SK_DPD_DEF,NAME_CONTRACT_STATUS_Active,NAME_CONTRACT_STATUS_Amortized debt,NAME_CONTRACT_STATUS_Approved,NAME_CONTRACT_STATUS_Canceled,NAME_CONTRACT_STATUS_Completed,NAME_CONTRACT_STATUS_Demand,NAME_CONTRACT_STATUS_Returned to the store,NAME_CONTRACT_STATUS_Signed,NAME_CONTRACT_STATUS_XNA,LATE_PAYMENT
6030662,1000001,158271,-10,12.0,12.0,0,0,1,0,0,0,0,0,0,0,0,0
8470736,1000001,158271,-9,12.0,11.0,0,0,1,0,0,0,0,0,0,0,0,0
45995,1000001,158271,-8,2.0,0.0,0,0,0,0,0,0,1,0,0,0,0,0
4467804,1000002,101962,-54,4.0,4.0,0,0,1,0,0,0,0,0,0,0,0,0
3346102,1000002,101962,-53,6.0,3.0,0,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5675189,2843499,314148,-34,60.0,54.0,0,0,1,0,0,0,0,0,0,0,0,0
1722726,2843499,314148,-33,60.0,52.0,0,0,1,0,0,0,0,0,0,0,0,0
804539,2843499,314148,-32,60.0,51.0,0,0,1,0,0,0,0,0,0,0,0,0
5580339,2843499,314148,-31,10.0,0.0,0,0,1,0,0,0,0,0,0,0,0,0


In [19]:
# Group the sorted rows by SK_ID_PREV; reused by the feature cells that follow
gb = sort_pos.groupby('SK_ID_PREV')

In [28]:
# Empty frame that collects the per-SK_ID_PREV features; it takes its index
# from the first Series assigned to it.
df = pd.DataFrame()

In [29]:
# Share of each SK_ID_PREV's rows whose contract status dummy is 'Completed'
df['POS_LOAN_COMPLETED_MEAN'] = gb['NAME_CONTRACT_STATUS_Completed'].mean()

count    936325.000000
mean          0.098344
std           0.103144
min           0.000000
25%           0.000000
50%           0.083333
75%           0.142857
max           1.000000
Name: POS_LOAN_COMPLETED_MEAN, dtype: float64

In [36]:
# 1 when the first recorded CNT_INSTALMENT of an SK_ID_PREV is larger than its
# last one, otherwise 0. GroupBy first()/last() take the first/last non-null
# value. Vectorised comparison instead of a per-row lambda; a NaN difference
# still maps to 0 because NaN > 0 is False.
df['POS_LOAN_COMPLETED_BEFORE_MEAN'] = (
    (gb['CNT_INSTALMENT'].first() - gb['CNT_INSTALMENT'].last()) > 0
).astype('int64')

In [38]:
# Number of instalments left: last non-null CNT_INSTALMENT_FUTURE per SK_ID_PREV
df['POS_REMAINING_INSTALMENTS'] = gb['CNT_INSTALMENT_FUTURE'].last()
# Ratio between number of remaining instalments and total instalments.
# NOTE(review): a last CNT_INSTALMENT of 0 would produce inf (or NaN for 0/0)
# here — confirm whether the data can contain zeros.
df['POS_FUTURE_TO_INSTALMENTS'] = gb['CNT_INSTALMENT_FUTURE'].last() / gb['CNT_INSTALMENT'].last()

In [40]:
# Attach the SK_ID_CURR of each SK_ID_PREV (first value in the group) so the
# features can be rolled up by SK_ID_CURR afterwards.
df['SK_ID_CURR'] = gb['SK_ID_CURR'].first()
df

Unnamed: 0_level_0,POS_LOAN_COMPLETED_MEAN,POS_LOAN_COMPLETED_BEFORE_MEAN,POS_REMAINING_INSTALMENTS,POS_FUTURE_TO_INSTALMENTS,SK_ID_CURR
SK_ID_PREV,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1000001,0.333333,1,0.0,0.000000,158271
1000002,0.200000,0,0.0,0.000000,101962
1000003,0.000000,0,9.0,0.750000,252457
1000004,0.125000,1,0.0,0.000000,260094
1000005,0.090909,0,0.0,0.000000,176456
...,...,...,...,...,...
2843494,0.333333,1,0.0,0.000000,292375
2843495,0.125000,1,0.0,0.000000,260963
2843497,0.000000,0,4.0,0.166667,451578
2843498,0.142857,1,0.0,0.000000,393881


In [51]:
# Roll the per-SK_ID_PREV features up to SK_ID_CURR by summing them, then
# left-join the result onto the aggregate table.
# NOTE(review): sum() is applied to every feature, including the *_MEAN and
# ratio columns — confirm that a sum rather than a mean is intended.
# NOTE(review): `pos_agg` is rebound here, so re-running this cell merges the
# same columns again and pandas adds _x/_y suffixes to the duplicates.
df_gb = df.groupby('SK_ID_CURR').sum().reset_index()
pos_agg = pos_agg.merge(df_gb, on= 'SK_ID_CURR', how= 'left')

In [53]:
# Drop the XNA status share. Rebinding instead of inplace=True, together with
# errors='ignore', keeps the cell re-runnable: a second execution no longer
# raises KeyError because the column is already gone.
pos_agg = pos_agg.drop(columns= ['POS_NAME_CONTRACT_STATUS_XNA_MEAN'], errors= 'ignore')

In [54]:
# Persist the feature table as a gzip-compressed Parquet file (despite the
# '.gzip' suffix) without writing the index.
# NOTE(review): the TO_TRAIN directory presumably has to exist beforehand.
pos_agg.to_parquet('TO_TRAIN/pos.gzip', compression= 'gzip', index= False)