# 1. SETTINGS

In [103]:
# libraries
import pandas as pd
import numpy as np
import scipy.stats
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

In [104]:
# pandas options
pd.set_option("display.max_columns", None)

In [105]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [106]:
# garbage collection
import gc
gc.enable()

# 2. FUNCTIONS

In [107]:
##### FUNCTION FOR COUNTING MISSINGS
def missing_data(data):
    total = data.isnull().sum().sort_values(ascending = False)
    percent = (data.isnull().sum() / data.isnull().count() * 100).sort_values(ascending = False)
    table = pd.concat([total, percent], axis = 1, keys = ["Total", "Percent"])
    table = table[table["Total"] > 0]
    return table

In [108]:
##### FUNCTION FOR AGGREGATING DATA
def aggregate_data(data, id_var, label = None):
    
    
    ### SEPARATE FEATURES
  
    # display info
    print("- Preparing the dataset...")

    # find factors
    data_factors = [f for f in data.columns if data[f].dtype == "object"]
    
    # partition subsets
    num_data = data[list(set(data.columns) - set(data_factors))]
    fac_data = data[[id_var] + data_factors]
    
    # display info
    num_facs = fac_data.shape[1] - 1
    num_nums = num_data.shape[1] - 1
    print("- Extracted %.0f factors and %.0f numerics..." % (num_facs, num_nums))


    ##### AGGREGATION
 
    # aggregate numerics
    if (num_nums > 0):
        print("- Aggregating numeric features...")
        num_data = num_data.groupby(id_var).agg(["mean", "std", "min", "max"])
        num_data.columns = ["_".join(col).strip() for col in num_data.columns.values]
        num_data = num_data.sort_index()

    # aggregate factors
    if (num_facs > 0):
        print("- Aggregating factor features...")
        fac_data = fac_data.groupby(id_var).agg([("mode",   lambda x: scipy.stats.mode(x)[0][0]),
                                                 ("unique", lambda x: x.nunique())])
        fac_data.columns = ["_".join(col).strip() for col in fac_data.columns.values]
        fac_data = fac_data.sort_index()


    ##### MERGER

    # merge numerics and factors
    if ((num_facs > 0) & (num_nums > 0)):
        agg_data = pd.concat([num_data, fac_data], axis = 1)
    
    # use factors only
    if ((num_facs > 0) & (num_nums == 0)):
        agg_data = fac_data
        
    # use numerics only
    if ((num_facs == 0) & (num_nums > 0)):
        agg_data = num_data
        

    ##### LAST STEPS

    # update labels
    if label != None:
        agg_data.columns = [label + "_" + str(col) for col in agg_data.columns]
    
    # impute zeros for SD
    #stdevs = agg_data.filter(like = "_std").columns
    #for var in stdevs:
    #    agg_data[var].fillna(0, inplace = True)

    # display info
    print("- Final dimensions:", agg_data.shape)
    
    # return dataset
    return agg_data

# 3. DATA IMPORT

In [109]:
# import data
train = pd.read_csv("../data/raw/application_train.csv")
test  = pd.read_csv("../data/raw/application_test.csv")
buro  = pd.read_csv("../data/raw/bureau.csv")
bbal  = pd.read_csv("../data/raw/bureau_balance.csv")
prev  = pd.read_csv("../data/raw/previous_application.csv")
card  = pd.read_csv("../data/raw/credit_card_balance.csv")
poca  = pd.read_csv("../data/raw/POS_CASH_balance.csv")
inst  = pd.read_csv("../data/raw/installments_payments.csv")

In [110]:
# check dimensions
print("Application:", train.shape, test.shape)
print("Buro:", buro.shape)
print("Bbal:", bbal.shape)
print("Prev:", prev.shape)
print("Card:", card.shape)
print("Poca:", poca.shape)
print("Inst:", inst.shape)

Application: (307511, 122) (48744, 121)
Buro: (1716428, 17)
Bbal: (27299925, 3)
Prev: (1670214, 37)
Card: (3840312, 23)
Poca: (10001358, 8)
Inst: (13605401, 8)


In [111]:
# extract target
y = train[["SK_ID_CURR", "TARGET"]]
del train["TARGET"]

In [112]:
### CHECK LOAN ID DISTRIBUTION

# check unique IDs
print("IDs in TRAIN:", train.SK_ID_CURR.nunique())
print("IDs in TEST:",  test.SK_ID_CURR.nunique())
print("IDs in BURO:",  buro.SK_ID_CURR.nunique())
print("IDs in PREV:",  prev.SK_ID_CURR.nunique())
print("IDs in CARD:",  card.SK_ID_CURR.nunique())
print("IDs in POCA:",  poca.SK_ID_CURR.nunique())
print("IDs in INST:",  inst.SK_ID_CURR.nunique())
print("")

# check current loan differences
print("IDs in TRAIN but not in BURO:", 
      len(list(set(train.SK_ID_CURR.unique()) - set(buro.SK_ID_CURR.unique()))))
print("IDs in TRAIN but not in PREV:", 
      len(list(set(train.SK_ID_CURR.unique()) - set(prev.SK_ID_CURR.unique()))))
print("IDs in TRAIN but not in CARD:", 
      len(list(set(train.SK_ID_CURR.unique()) - set(card.SK_ID_CURR.unique()))))
print("IDs in TRAIN but not in POCA:", 
      len(list(set(train.SK_ID_CURR.unique()) - set(poca.SK_ID_CURR.unique()))))
print("IDs in TRAIN but not in INST:", 
      len(list(set(train.SK_ID_CURR.unique()) - set(poca.SK_ID_CURR.unique()))))
print("")

# check current loan differences
print("IDs in TEST but not in BURO:", 
      len(list(set(test.SK_ID_CURR.unique()) - set(buro.SK_ID_CURR.unique()))))
print("IDs in TEST but not in PREV:", 
      len(list(set(test.SK_ID_CURR.unique()) - set(prev.SK_ID_CURR.unique()))))
print("IDs in TEST but not in CARD:", 
      len(list(set(test.SK_ID_CURR.unique()) - set(card.SK_ID_CURR.unique()))))
print("IDs in TEST but not in POCA:", 
      len(list(set(test.SK_ID_CURR.unique()) - set(poca.SK_ID_CURR.unique()))))
print("IDs in TEST but not in INST:", 
      len(list(set(test.SK_ID_CURR.unique()) - set(poca.SK_ID_CURR.unique()))))
print("")

print("IDs in TRAIN and BURO:", 
       len(set.intersection(set(train.SK_ID_CURR.unique()), set(buro.SK_ID_CURR.unique()))))
print("IDs in TRAIN and PREV:", 
       len(set.intersection(set(train.SK_ID_CURR.unique()), set(prev.SK_ID_CURR.unique()))))
print("IDs in TRAIN and CARD:", 
       len(set.intersection(set(train.SK_ID_CURR.unique()), set(card.SK_ID_CURR.unique())))) 
print("IDs in TRAIN and POCA:", 
       len(set.intersection(set(train.SK_ID_CURR.unique()), set(poca.SK_ID_CURR.unique())))) 
print("IDs in TRAIN and INST:", 
       len(set.intersection(set(train.SK_ID_CURR.unique()), set(inst.SK_ID_CURR.unique())))) 
print("IDs in TRAIN and BURO and PREV:", 
       len(set.intersection(set(train.SK_ID_CURR.unique()), set(buro.SK_ID_CURR.unique()), 
                            set(prev.SK_ID_CURR.unique()))))
print("IDs in TRAIN and PREV and POCA and INST:", 
       len(set.intersection(set(train.SK_ID_CURR.unique()), set(prev.SK_ID_CURR.unique()), 
                            set(poca.SK_ID_CURR.unique()), set(inst.SK_ID_CURR.unique()))))
print("")

print("IDs in TEST and BURO:", 
       len(set.intersection(set(test.SK_ID_CURR.unique()), set(buro.SK_ID_CURR.unique()))))
print("IDs in TEST and PREV:", 
       len(set.intersection(set(test.SK_ID_CURR.unique()), set(prev.SK_ID_CURR.unique()))))
print("IDs in TEST and CARD:", 
       len(set.intersection(set(test.SK_ID_CURR.unique()), set(card.SK_ID_CURR.unique())))) 
print("IDs in TEST and POCA:", 
       len(set.intersection(set(test.SK_ID_CURR.unique()), set(poca.SK_ID_CURR.unique())))) 
print("IDs in TEST and INST:", 
       len(set.intersection(set(test.SK_ID_CURR.unique()), set(inst.SK_ID_CURR.unique())))) 
print("IDs in TEST and BURO and PREV:", 
       len(set.intersection(set(test.SK_ID_CURR.unique()), set(buro.SK_ID_CURR.unique()), 
                            set(prev.SK_ID_CURR.unique()))))
print("IDs in TEST and PREV and POCA and INST:", 
       len(set.intersection(set(test.SK_ID_CURR.unique()), set(prev.SK_ID_CURR.unique()), 
                            set(poca.SK_ID_CURR.unique()), set(inst.SK_ID_CURR.unique()))))

IDs in TRAIN: 307511
IDs in TEST: 48744
IDs in BURO: 305811
IDs in PREV: 338857
IDs in CARD: 103558
IDs in POCA: 337252
IDs in INST: 339587

IDs in TRAIN but not in BURO: 44020
IDs in TRAIN but not in PREV: 16454
IDs in TRAIN but not in CARD: 220606
IDs in TRAIN but not in POCA: 18067
IDs in TRAIN but not in INST: 18067

IDs in TEST but not in BURO: 6424
IDs in TEST but not in PREV: 944
IDs in TEST but not in CARD: 32091
IDs in TEST but not in POCA: 936
IDs in TEST but not in INST: 936

IDs in TRAIN and BURO: 263491
IDs in TRAIN and PREV: 291057
IDs in TRAIN and CARD: 86905
IDs in TRAIN and POCA: 289444
IDs in TRAIN and INST: 291643
IDs in TRAIN and BURO and PREV: 249507
IDs in TRAIN and PREV and POCA and INST: 288028

IDs in TEST and BURO: 42320
IDs in TEST and PREV: 47800
IDs in TEST and CARD: 16653
IDs in TEST and POCA: 47808
IDs in TEST and INST: 47944
IDs in TEST and BURO and PREV: 41584
IDs in TEST and PREV and POCA and INST: 47537


# 4. PREPROCESSING

## 4.1. APPLICATION DATA

In [113]:
# concatenate application data
appl = pd.concat([train, test])
del train, test

In [114]:
### FEATURE ENGINEERING

# credit compared to income
appl["CREDIT_BY_INCOME"] = appl["AMT_CREDIT"] / appl["AMT_INCOME_TOTAL"]

# logarithms
appl["AMT_CREDIT"]      = np.log(1 + appl["AMT_CREDIT"])
appl["AMT_INCOME"]      = np.log(1 + appl["AMT_INCOME_TOTAL"])
appl["AMT_GOODS_PRICE"] = np.log(1 + appl["AMT_GOODS_PRICE"])
appl["AMT_ANNUITY"]     = np.log(1 + appl["AMT_ANNUITY"])

# number of external sources
appl["NUM_EXT_SOURCES"] = 3 - (appl["EXT_SOURCE_1"].isnull().astype(int) +
                               appl["EXT_SOURCE_2"].isnull().astype(int) +
                               appl["EXT_SOURCE_3"].isnull().astype(int))

In [115]:
# rename features
appl.columns = ["SK_ID_CURR"] + ["app_" + str(col) for col in appl.columns if col not in "SK_ID_CURR"]

In [116]:
# check data
appl.head()

Unnamed: 0,SK_ID_CURR,app_NAME_CONTRACT_TYPE,app_CODE_GENDER,app_FLAG_OWN_CAR,app_FLAG_OWN_REALTY,app_CNT_CHILDREN,app_AMT_INCOME_TOTAL,app_AMT_CREDIT,app_AMT_ANNUITY,app_AMT_GOODS_PRICE,app_NAME_TYPE_SUITE,app_NAME_INCOME_TYPE,app_NAME_EDUCATION_TYPE,app_NAME_FAMILY_STATUS,app_NAME_HOUSING_TYPE,app_REGION_POPULATION_RELATIVE,app_DAYS_BIRTH,app_DAYS_EMPLOYED,app_DAYS_REGISTRATION,app_DAYS_ID_PUBLISH,app_OWN_CAR_AGE,app_FLAG_MOBIL,app_FLAG_EMP_PHONE,app_FLAG_WORK_PHONE,app_FLAG_CONT_MOBILE,app_FLAG_PHONE,app_FLAG_EMAIL,app_OCCUPATION_TYPE,app_CNT_FAM_MEMBERS,app_REGION_RATING_CLIENT,app_REGION_RATING_CLIENT_W_CITY,app_WEEKDAY_APPR_PROCESS_START,app_HOUR_APPR_PROCESS_START,app_REG_REGION_NOT_LIVE_REGION,app_REG_REGION_NOT_WORK_REGION,app_LIVE_REGION_NOT_WORK_REGION,app_REG_CITY_NOT_LIVE_CITY,app_REG_CITY_NOT_WORK_CITY,app_LIVE_CITY_NOT_WORK_CITY,app_ORGANIZATION_TYPE,app_EXT_SOURCE_1,app_EXT_SOURCE_2,app_EXT_SOURCE_3,app_APARTMENTS_AVG,app_BASEMENTAREA_AVG,app_YEARS_BEGINEXPLUATATION_AVG,app_YEARS_BUILD_AVG,app_COMMONAREA_AVG,app_ELEVATORS_AVG,app_ENTRANCES_AVG,app_FLOORSMAX_AVG,app_FLOORSMIN_AVG,app_LANDAREA_AVG,app_LIVINGAPARTMENTS_AVG,app_LIVINGAREA_AVG,app_NONLIVINGAPARTMENTS_AVG,app_NONLIVINGAREA_AVG,app_APARTMENTS_MODE,app_BASEMENTAREA_MODE,app_YEARS_BEGINEXPLUATATION_MODE,app_YEARS_BUILD_MODE,app_COMMONAREA_MODE,app_ELEVATORS_MODE,app_ENTRANCES_MODE,app_FLOORSMAX_MODE,app_FLOORSMIN_MODE,app_LANDAREA_MODE,app_LIVINGAPARTMENTS_MODE,app_LIVINGAREA_MODE,app_NONLIVINGAPARTMENTS_MODE,app_NONLIVINGAREA_MODE,app_APARTMENTS_MEDI,app_BASEMENTAREA_MEDI,app_YEARS_BEGINEXPLUATATION_MEDI,app_YEARS_BUILD_MEDI,app_COMMONAREA_MEDI,app_ELEVATORS_MEDI,app_ENTRANCES_MEDI,app_FLOORSMAX_MEDI,app_FLOORSMIN_MEDI,app_LANDAREA_MEDI,app_LIVINGAPARTMENTS_MEDI,app_LIVINGAREA_MEDI,app_NONLIVINGAPARTMENTS_MEDI,app_NONLIVINGAREA_MEDI,app_FONDKAPREMONT_MODE,app_HOUSETYPE_MODE,app_TOTALAREA_MODE,app_WALLSMATERIAL_MODE,app_EMERGENCYSTATE_MODE,app_OBS_30_CNT_SOCIAL_CIRCLE,app_DEF_30_CNT_SOCIAL_CIRCLE,app_OBS_60_CNT_SOCIAL_CIRCLE,app_DEF_60_CNT_SOCIAL_CIRCLE,app_DAYS_LAST_PHONE_CHANGE,app_FLAG_DOCUMENT_2,app_FLAG_DOCUMENT_3,app_FLAG_DOCUMENT_4,app_FLAG_DOCUMENT_5,app_FLAG_DOCUMENT_6,app_FLAG_DOCUMENT_7,app_FLAG_DOCUMENT_8,app_FLAG_DOCUMENT_9,app_FLAG_DOCUMENT_10,app_FLAG_DOCUMENT_11,app_FLAG_DOCUMENT_12,app_FLAG_DOCUMENT_13,app_FLAG_DOCUMENT_14,app_FLAG_DOCUMENT_15,app_FLAG_DOCUMENT_16,app_FLAG_DOCUMENT_17,app_FLAG_DOCUMENT_18,app_FLAG_DOCUMENT_19,app_FLAG_DOCUMENT_20,app_FLAG_DOCUMENT_21,app_AMT_REQ_CREDIT_BUREAU_HOUR,app_AMT_REQ_CREDIT_BUREAU_DAY,app_AMT_REQ_CREDIT_BUREAU_WEEK,app_AMT_REQ_CREDIT_BUREAU_MON,app_AMT_REQ_CREDIT_BUREAU_QRT,app_AMT_REQ_CREDIT_BUREAU_YEAR,app_CREDIT_BY_INCOME,app_AMT_INCOME,app_NUM_EXT_SOURCES
0,100002,Cash loans,M,N,Y,0,202500.0,12.915581,10.114619,12.768544,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.018801,-9461,-637,-3648.0,-2120,,1,1,0,1,1,0,Laborers,1.0,2,2,WEDNESDAY,10,0,0,0,0,0,0,Business Entity Type 3,0.083037,0.262949,0.139376,0.0247,0.0369,0.9722,0.6192,0.0143,0.0,0.069,0.0833,0.125,0.0369,0.0202,0.019,0.0,0.0,0.0252,0.0383,0.9722,0.6341,0.0144,0.0,0.069,0.0833,0.125,0.0377,0.022,0.0198,0.0,0.0,0.025,0.0369,0.9722,0.6243,0.0144,0.0,0.069,0.0833,0.125,0.0375,0.0205,0.0193,0.0,0.0,reg oper account,block of flats,0.0149,"Stone, brick",No,2.0,2.0,2.0,2.0,-1134.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,2.007889,12.2185,3
1,100003,Cash loans,F,N,N,0,270000.0,14.072865,10.482892,13.937287,Family,State servant,Higher education,Married,House / apartment,0.003541,-16765,-1188,-1186.0,-291,,1,1,0,1,1,0,Core staff,2.0,1,1,MONDAY,11,0,0,0,0,0,0,School,0.311267,0.622246,,0.0959,0.0529,0.9851,0.796,0.0605,0.08,0.0345,0.2917,0.3333,0.013,0.0773,0.0549,0.0039,0.0098,0.0924,0.0538,0.9851,0.804,0.0497,0.0806,0.0345,0.2917,0.3333,0.0128,0.079,0.0554,0.0,0.0,0.0968,0.0529,0.9851,0.7987,0.0608,0.08,0.0345,0.2917,0.3333,0.0132,0.0787,0.0558,0.0039,0.01,reg oper account,block of flats,0.0714,Block,No,1.0,0.0,1.0,0.0,-828.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,4.79075,12.506181,2
2,100004,Revolving loans,M,Y,Y,0,67500.0,11.813037,8.817446,11.813037,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.010032,-19046,-225,-4260.0,-2531,26.0,1,1,1,1,1,0,Laborers,1.0,2,2,MONDAY,9,0,0,0,0,0,0,Government,,0.555912,0.729567,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-815.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,11.119898,2
3,100006,Cash loans,F,N,Y,0,135000.0,12.652947,10.298481,12.601491,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,0.008019,-19005,-3039,-9833.0,-2437,,1,1,0,1,0,0,Laborers,2.0,2,2,WEDNESDAY,17,0,0,0,0,0,0,Business Entity Type 3,,0.650442,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,0.0,2.0,0.0,-617.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,2.316167,11.813037,1
4,100007,Cash loans,M,N,Y,0,121500.0,13.148033,9.992711,13.148033,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.028663,-19932,-3038,-4311.0,-3458,,1,1,0,1,0,0,Core staff,1.0,2,2,THURSDAY,11,0,0,0,0,1,1,Religion,,0.322738,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-1106.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,4.222222,11.707678,1


In [117]:
# count missings
nas = missing_data(appl)
nas

Unnamed: 0,Total,Percent
app_COMMONAREA_MODE,248360,69.714109
app_COMMONAREA_AVG,248360,69.714109
app_COMMONAREA_MEDI,248360,69.714109
app_NONLIVINGAPARTMENTS_AVG,246861,69.293343
app_NONLIVINGAPARTMENTS_MODE,246861,69.293343
app_NONLIVINGAPARTMENTS_MEDI,246861,69.293343
app_FONDKAPREMONT_MODE,243092,68.235393
app_LIVINGAPARTMENTS_MODE,242979,68.203674
app_LIVINGAPARTMENTS_AVG,242979,68.203674
app_LIVINGAPARTMENTS_MEDI,242979,68.203674


In [118]:
##### IMPUTE MISSINGS

### APARTMENT DATA

# find variables
living_nums = ["app_COMMONAREA_MODE", "app_COMMONAREA_AVG", "app_COMMONAREA_MEDI",
               "app_NONLIVINGAPARTMENTS_AVG", "app_NONLIVINGAPARTMENTS_MODE",
               "app_NONLIVINGAPARTMENTS_MEDI",
               "app_LIVINGAPARTMENTS_MODE", "app_LIVINGAPARTMENTS_AVG",
               "app_LIVINGAPARTMENTS_MEDI", "app_FLOORSMIN_MODE", "app_FLOORSMIN_AVG",
               "app_FLOORSMIN_MEDI", "app_YEARS_BUILD_AVG", "app_YEARS_BUILD_MEDI",
               "app_YEARS_BUILD_MODE", "app_LANDAREA_AVG",
               "app_LANDAREA_MEDI", "app_LANDAREA_MODE", "app_BASEMENTAREA_MEDI",
               "app_BASEMENTAREA_MODE", "app_BASEMENTAREA_AVG",
               "app_NONLIVINGAREA_MODE", "app_NONLIVINGAREA_AVG",
               "app_NONLIVINGAREA_MEDI","app_ELEVATORS_MODE",
               "app_ELEVATORS_AVG", "app_ELEVATORS_MEDI",
               "app_APARTMENTS_MEDI", "app_APARTMENTS_MODE", "app_APARTMENTS_AVG",
               "app_ENTRANCES_AVG", "app_ENTRANCES_MODE", "app_ENTRANCES_MEDI",
               "app_LIVINGAREA_MEDI", "app_LIVINGAREA_AVG",
               "app_LIVINGAREA_MODE", "app_FLOORSMAX_AVG", "app_FLOORSMAX_MODE",
               "app_FLOORSMAX_MEDI", "app_YEARS_BEGINEXPLUATATION_MEDI",
               "app_YEARS_BEGINEXPLUATATION_AVG", "app_YEARS_BEGINEXPLUATATION_MODE",
               "app_TOTALAREA_MODE"]
living_facs = ["app_FONDKAPREMONT_MODE", "app_WALLSMATERIAL_MODE", "app_HOUSETYPE_MODE", "app_EMERGENCYSTATE_MODE"]
living_vars = living_nums + living_facs

# dummy indicator
appl["app_isnull_HOUSE"] = (appl[living_vars].isnull().sum(axis = 1) == 0).astype(int)

# impute numerics
for var in living_nums:
    appl[var].fillna(-99, inplace = True)
    
# impute factors
for var in living_facs:
    appl[var].fillna("Unknown level", inplace = True)
    
    
### OWN CAR
appl["app_OWN_CAR_AGE"][appl["app_FLAG_OWN_CAR"] == "Y"].fillna(appl["app_OWN_CAR_AGE"].median(), inplace = True)
appl["app_OWN_CAR_AGE"].fillna(-9, inplace = True)


### EXTERNAL SCORES
for var in ["app_EXT_SOURCE_1", "app_EXT_SOURCE_2", "app_EXT_SOURCE_3"]:
    appl[var].fillna(-9, inplace = True)
    
    
### OCCUPATION
appl["app_OCCUPATION_TYPE"][appl["app_NAME_INCOME_TYPE"] == "Pensioner"].fillna("Retired", inplace = True)
appl["app_OCCUPATION_TYPE"][appl["app_NAME_INCOME_TYPE"] == "Student"].fillna("Student",   inplace = True)
appl["app_OCCUPATION_TYPE"].fillna("Unknown level", inplace = True)


### BURO ENQUIRIES

# find variables
buro_vars = ["app_AMT_REQ_CREDIT_BUREAU_YEAR", "app_AMT_REQ_CREDIT_BUREAU_QRT", 
             "app_AMT_REQ_CREDIT_BUREAU_MON",  "app_AMT_REQ_CREDIT_BUREAU_WEEK", 
             "app_AMT_REQ_CREDIT_BUREAU_DAY",  "app_AMT_REQ_CREDIT_BUREAU_HOUR"]

# dummy indicator
appl["app_isnull_BURO_ENQUIRIES"] = (appl[buro_vars].isnull().sum(axis = 1) == 0).astype(int)

# impute numerics
for var in buro_vars:
    appl[var].fillna(-99, inplace = True)
    
    
### COMPANY
appl["app_NAME_TYPE_SUITE"].fillna("Unknown level", inplace = True)


### SOCIAL CIRCLE

# find variables
social_vars = ["app_OBS_30_CNT_SOCIAL_CIRCLE", "app_DEF_30_CNT_SOCIAL_CIRCLE", "app_OBS_60_CNT_SOCIAL_CIRCLE", 
               "app_DEF_60_CNT_SOCIAL_CIRCLE", "app_AMT_GOODS_PRICE"]

# dummy indicator
appl["app_isnull_SOCIAL_CIRCLE"] = (appl[social_vars].isnull().sum(axis = 1) == 0).astype(int)

# impute numerics
for var in social_vars:
    appl[var].fillna(-99, inplace = True)
             
             
### OTHER FEATURES
             
# find variables
other_vars = ["app_AMT_GOODS_PRICE", "app_AMT_ANNUITY", "app_CNT_FAM_MEMBERS", "app_DAYS_LAST_PHONE_CHANGE"]
for var in other_vars:
    appl[var].fillna(-99, inplace = True)           

In [119]:
# count missings
nas = missing_data(appl)
nas

Unnamed: 0,Total,Percent


## 4.2. CREDIT BUREAU DATA

### 4.2.1. CHECKS

In [120]:
# check buro data
buro.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,


In [121]:
# check bbal data
bbal.head()

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,0,C
1,5715448,-1,C
2,5715448,-2,C
3,5715448,-3,C
4,5715448,-4,C


### 4.2.2. BBAL DATA

In [122]:
### FEATURE ENGINEERING

# dummy encoding for STATUS
bbal = pd.get_dummies(bbal, columns = ["STATUS"], prefix = "STATUS")

In [123]:
# count missings
nas = missing_data(bbal)
nas

Unnamed: 0,Total,Percent


In [124]:
### AGGREGATIONS

# total month count
cnt_mon = bbal[["SK_ID_BUREAU", "MONTHS_BALANCE"]].groupby("SK_ID_BUREAU").count()
del bbal["MONTHS_BALANCE"]

# aggregate data
agg_bbal = bbal.groupby("SK_ID_BUREAU").mean()

# add total month count
agg_bbal["MONTH_COUNT"] = cnt_mon

In [125]:
# count missings
nas = missing_data(agg_bbal)
nas

Unnamed: 0,Total,Percent


In [126]:
# check data
agg_bbal.head()

Unnamed: 0_level_0,STATUS_0,STATUS_1,STATUS_2,STATUS_3,STATUS_4,STATUS_5,STATUS_C,STATUS_X,MONTH_COUNT
SK_ID_BUREAU,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
5001709,0.0,0.0,0.0,0.0,0.0,0.0,0.886598,0.113402,97
5001710,0.060241,0.0,0.0,0.0,0.0,0.0,0.578313,0.361446,83
5001711,0.75,0.0,0.0,0.0,0.0,0.0,0.0,0.25,4
5001712,0.526316,0.0,0.0,0.0,0.0,0.0,0.473684,0.0,19
5001713,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,22


In [127]:
# clear memory
del bbal

### 4.2.3. BURO DATA

In [128]:
### MERGE
buro = buro.merge(right = agg_bbal.reset_index(), how = "left", on = "SK_ID_BUREAU")

In [129]:
### FEATURE ENGINEERING

# logarithms
buro["AMT_CREDIT_SUM"]         = np.log(1 + buro["AMT_CREDIT_SUM"])
buro["AMT_CREDIT_SUM_DEBT"]    = np.log(1 + buro["AMT_CREDIT_SUM_DEBT"])
buro["AMT_CREDIT_SUM_LIMIT"]   = np.log(1 + buro["AMT_CREDIT_SUM_LIMIT"])
buro["AMT_CREDIT_SUM_OVERDUE"] = np.log(1 + buro["AMT_CREDIT_SUM_OVERDUE"])
buro["AMT_CREDIT_MAX_OVERDUE"] = np.log(1 + buro["AMT_CREDIT_MAX_OVERDUE"])
buro["AMT_ANNUITY"]            = np.log(1 + buro["AMT_ANNUITY"])

In [130]:
# count missings
nas = missing_data(buro)
nas

Unnamed: 0,Total,Percent
AMT_ANNUITY,1226791,71.47349
AMT_CREDIT_MAX_OVERDUE,1124488,65.513264
MONTH_COUNT,942074,54.885728
STATUS_X,942074,54.885728
STATUS_C,942074,54.885728
STATUS_5,942074,54.885728
STATUS_4,942074,54.885728
STATUS_3,942074,54.885728
STATUS_2,942074,54.885728
STATUS_1,942074,54.885728


In [131]:
##### IMPUTE MISSINGS

### STATUS

# find variables
stats_vars = ["STATUS_0", "STATUS_1", "STATUS_2", "STATUS_3", 
              "STATUS_4", "STATUS_5", "STATUS_C", "STATUS_X", "MONTH_COUNT"]

# dummy indicator
buro["isnull_STATUS"] = (buro[stats_vars].isnull().sum(axis = 1) == 0).astype(int)

# impute stats_vars
for var in stats_vars:
    buro[var].fillna((buro[var].median()), inplace = True)
         
        
### AMOUNTS
amnts = ["AMT_ANNUITY", "AMT_CREDIT_MAX_OVERDUE"] 
for var in amnts:
    buro["isnull_" + var] = buro[var].isnull() + 0

             
### OTHER FEATURES
             
# find variables
other_vars = ["DAYS_ENDDATE_FACT", "AMT_CREDIT_SUM_LIMIT", "AMT_CREDIT_SUM_DEBT", 
              "DAYS_CREDIT_ENDDATE", "AMT_CREDIT_SUM"]
for var in other_vars:
    buro[var].fillna((buro[var].median()), inplace = True)

In [132]:
### AGGREGATIONS

# count previous buro loans
cnt_buro = buro[["SK_ID_CURR", "SK_ID_BUREAU"]].groupby("SK_ID_CURR").count()
del buro["SK_ID_BUREAU"]

# aggregate data
agg_buro = aggregate_data(buro, id_var = "SK_ID_CURR", label = "buro")

# add buro loan count
agg_buro["buro_BURO_COUNT"] = cnt_buro

- Preparing the dataset...
- Extracted 3 factors and 24 numerics...
- Aggregating numeric features...
- Aggregating factor features...
- Final dimensions: (305811, 102)


In [133]:
# count missings
nas = missing_data(agg_buro)
nas

Unnamed: 0,Total,Percent
buro_AMT_ANNUITY_std,213412,69.785587
buro_AMT_ANNUITY_max,187587,61.340828
buro_AMT_ANNUITY_min,187587,61.340828
buro_AMT_ANNUITY_mean,187587,61.340828
buro_AMT_CREDIT_MAX_OVERDUE_std,169242,55.342025
buro_AMT_CREDIT_MAX_OVERDUE_max,92840,30.35862
buro_AMT_CREDIT_MAX_OVERDUE_min,92840,30.35862
buro_AMT_CREDIT_MAX_OVERDUE_mean,92840,30.35862
buro_DAYS_CREDIT_ENDDATE_std,41520,13.577013
buro_DAYS_CREDIT_UPDATE_std,41520,13.577013


In [134]:
# check data
agg_buro.head()

Unnamed: 0_level_0,buro_AMT_CREDIT_SUM_DEBT_mean,buro_AMT_CREDIT_SUM_DEBT_std,buro_AMT_CREDIT_SUM_DEBT_min,buro_AMT_CREDIT_SUM_DEBT_max,buro_isnull_AMT_ANNUITY_mean,buro_isnull_AMT_ANNUITY_std,buro_isnull_AMT_ANNUITY_min,buro_isnull_AMT_ANNUITY_max,buro_CNT_CREDIT_PROLONG_mean,buro_CNT_CREDIT_PROLONG_std,buro_CNT_CREDIT_PROLONG_min,buro_CNT_CREDIT_PROLONG_max,buro_AMT_CREDIT_MAX_OVERDUE_mean,buro_AMT_CREDIT_MAX_OVERDUE_std,buro_AMT_CREDIT_MAX_OVERDUE_min,buro_AMT_CREDIT_MAX_OVERDUE_max,buro_STATUS_3_mean,buro_STATUS_3_std,buro_STATUS_3_min,buro_STATUS_3_max,buro_STATUS_C_mean,buro_STATUS_C_std,buro_STATUS_C_min,buro_STATUS_C_max,buro_DAYS_ENDDATE_FACT_mean,buro_DAYS_ENDDATE_FACT_std,buro_DAYS_ENDDATE_FACT_min,buro_DAYS_ENDDATE_FACT_max,buro_STATUS_4_mean,buro_STATUS_4_std,buro_STATUS_4_min,buro_STATUS_4_max,buro_STATUS_1_mean,buro_STATUS_1_std,buro_STATUS_1_min,buro_STATUS_1_max,buro_AMT_ANNUITY_mean,buro_AMT_ANNUITY_std,buro_AMT_ANNUITY_min,buro_AMT_ANNUITY_max,buro_STATUS_0_mean,buro_STATUS_0_std,buro_STATUS_0_min,buro_STATUS_0_max,buro_isnull_STATUS_mean,buro_isnull_STATUS_std,buro_isnull_STATUS_min,buro_isnull_STATUS_max,buro_STATUS_5_mean,buro_STATUS_5_std,buro_STATUS_5_min,buro_STATUS_5_max,buro_CREDIT_DAY_OVERDUE_mean,buro_CREDIT_DAY_OVERDUE_std,buro_CREDIT_DAY_OVERDUE_min,buro_CREDIT_DAY_OVERDUE_max,buro_AMT_CREDIT_SUM_mean,buro_AMT_CREDIT_SUM_std,buro_AMT_CREDIT_SUM_min,buro_AMT_CREDIT_SUM_max,buro_AMT_CREDIT_SUM_LIMIT_mean,buro_AMT_CREDIT_SUM_LIMIT_std,buro_AMT_CREDIT_SUM_LIMIT_min,buro_AMT_CREDIT_SUM_LIMIT_max,buro_DAYS_CREDIT_UPDATE_mean,buro_DAYS_CREDIT_UPDATE_std,buro_DAYS_CREDIT_UPDATE_min,buro_DAYS_CREDIT_UPDATE_max,buro_isnull_AMT_CREDIT_MAX_OVERDUE_mean,buro_isnull_AMT_CREDIT_MAX_OVERDUE_std,buro_isnull_AMT_CREDIT_MAX_OVERDUE_min,buro_isnull_AMT_CREDIT_MAX_OVERDUE_max,buro_DAYS_CREDIT_ENDDATE_mean,buro_DAYS_CREDIT_ENDDATE_std,buro_DAYS_CREDIT_ENDDATE_min,buro_DAYS_CREDIT_ENDDATE_max,buro_STATUS_2_mean,buro_STATUS_2_std,buro_STATUS_2_min,buro_STATUS_2_max,buro_DAYS_CREDIT_mean,buro_DAYS_CREDIT_std,buro_DAYS_CREDIT_min,buro_DAYS_CREDIT_max,buro_MONTH_COUNT_mean,buro_MONTH_COUNT_std,buro_MONTH_COUNT_min,buro_MONTH_COUNT_max,buro_AMT_CREDIT_SUM_OVERDUE_mean,buro_AMT_CREDIT_SUM_OVERDUE_std,buro_AMT_CREDIT_SUM_OVERDUE_min,buro_AMT_CREDIT_SUM_OVERDUE_max,buro_STATUS_X_mean,buro_STATUS_X_std,buro_STATUS_X_min,buro_STATUS_X_max,buro_CREDIT_ACTIVE_mode,buro_CREDIT_ACTIVE_unique,buro_CREDIT_CURRENCY_mode,buro_CREDIT_CURRENCY_unique,buro_CREDIT_TYPE_mode,buro_CREDIT_TYPE_unique,buro_BURO_COUNT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1
100001,5.153914,6.440628,0.0,12.829977,0.0,0.0,0,0,0.0,0.0,0,0,,,,,0.0,0.0,0.0,0.0,0.44124,0.428578,0.0,0.966667,-856.142857,263.761526,-1328.0,-544.0,0.0,0.0,0.0,0.0,0.007519,0.019893,0.0,0.052632,3.839271,4.795585,0.0,9.289475,0.336651,0.381334,0.019231,1.0,1.0,0.0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0,0,12.080036,0.628917,11.356283,12.842652,0.0,0.0,0.0,0.0,-93.142857,77.20412,-155,-6,1.0,0.0,1,1,82.428571,1032.859277,-1329.0,1778.0,0.0,0.0,0.0,0.0,-735.0,489.942514,-1572,-49,24.571429,16.050515,2.0,52.0,0.0,0.0,0.0,0.0,0.21459,0.182611,0.0,0.5,Closed,2,currency 1,1,Consumer credit,1,7
100002,1.551525,4.388375,0.0,12.4122,0.125,0.353553,0,1,0.0,0.0,0,0,4.07202,4.165725,0.0,8.526083,0.0,0.0,0.0,0.0,0.175426,0.263147,0.0,0.8125,-747.375,445.764492,-1185.0,-36.0,0.0,0.0,0.0,0.0,0.255682,0.204094,0.0,0.5,0.0,0.0,0.0,0.0,0.40696,0.196494,0.1875,0.818182,1.0,0.0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0,0,9.811994,4.084715,0.0,13.017005,1.296646,3.667468,0.0,10.373165,-499.875,518.522472,-1185,-7,0.375,0.517549,0,1,-344.25,648.707892,-1072.0,780.0,0.0,0.0,0.0,0.0,-874.0,431.45104,-1437,-103,13.75,6.363961,4.0,22.0,0.0,0.0,0.0,0.0,0.161932,0.16165,0.0,0.5,Closed,2,currency 1,1,Consumer credit,2,8
100003,0.0,0.0,0.0,0.0,1.0,0.0,1,1,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.240741,0.0,0.240741,0.240741,-1047.25,738.485105,-2131.0,-540.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.333333,0.0,0.333333,0.333333,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,11.609754,1.495879,10.010052,13.604791,3.401198,6.802395,0.0,13.604791,-816.0,908.053963,-2131,-43,0.0,0.0,0,0,-544.5,1492.770467,-2434.0,1216.0,0.0,0.0,0.0,0.0,-1400.75,909.826128,-2586,-606,25.0,0.0,25.0,25.0,0.0,0.0,0.0,0.0,0.025641,0.0,0.025641,0.025641,Closed,2,currency 1,1,Consumer credit,2,4
100004,0.0,0.0,0.0,0.0,1.0,0.0,1,1,0.0,0.0,0,0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.240741,0.0,0.240741,0.240741,-532.5,212.839141,-683.0,-382.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.333333,0.0,0.333333,0.333333,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,11.456566,0.000283,11.456366,11.456766,0.0,0.0,0.0,0.0,-532.0,212.132034,-682,-382,0.5,0.707107,0,1,-488.5,150.613744,-595.0,-382.0,0.0,0.0,0.0,0.0,-867.0,649.124025,-1326,-408,25.0,0.0,25.0,25.0,0.0,0.0,0.0,0.0,0.025641,0.0,0.025641,0.025641,Closed,1,currency 1,1,Consumer credit,1,2
100005,7.781492,6.911089,0.0,13.205027,0.0,0.0,0,0,0.0,0.0,0,0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.128205,0.222058,0.0,0.384615,-639.0,446.869108,-897.0,-123.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.78587,4.825269,0.0,8.357611,0.735043,0.238245,0.538462,1.0,1.0,0.0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0,0,11.510418,1.544801,10.303169,13.251286,0.0,0.0,0.0,0.0,-54.333333,58.594653,-121,-11,0.666667,0.57735,0,1,439.333333,776.274007,-128.0,1324.0,0.0,0.0,0.0,0.0,-190.666667,162.297053,-373,-62,7.0,5.291503,3.0,13.0,0.0,0.0,0.0,0.0,0.136752,0.174535,0.0,0.333333,Active,2,currency 1,1,Consumer credit,2,3


In [135]:
# clear memory
del buro

## 4.3. PREVIOUS LOAN DATA

### 4.3.1. CHECKS

In [136]:
# check prev data
prev.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,FLAG_LAST_APPL_PER_CONTRACT,NFLAG_LAST_APPL_IN_DAY,RATE_DOWN_PAYMENT,RATE_INTEREST_PRIMARY,RATE_INTEREST_PRIVILEGED,NAME_CASH_LOAN_PURPOSE,NAME_CONTRACT_STATUS,DAYS_DECISION,NAME_PAYMENT_TYPE,CODE_REJECT_REASON,NAME_TYPE_SUITE,NAME_CLIENT_TYPE,NAME_GOODS_CATEGORY,NAME_PORTFOLIO,NAME_PRODUCT_TYPE,CHANNEL_TYPE,SELLERPLACE_AREA,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.43,17145.0,17145.0,0.0,17145.0,SATURDAY,15,Y,1,0.0,0.182832,0.867336,XAP,Approved,-73,Cash through the bank,XAP,,Repeater,Mobile,POS,XNA,Country-wide,35,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,Y,1,,,,XNA,Approved,-164,XNA,XAP,Unaccompanied,Repeater,XNA,Cash,x-sell,Contact center,-1,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2523466,122040,Cash loans,15060.735,112500.0,136444.5,,112500.0,TUESDAY,11,Y,1,,,,XNA,Approved,-301,Cash through the bank,XAP,"Spouse, partner",Repeater,XNA,Cash,x-sell,Credit and cash offices,-1,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,2819243,176158,Cash loans,47041.335,450000.0,470790.0,,450000.0,MONDAY,7,Y,1,,,,XNA,Approved,-512,Cash through the bank,XAP,,Repeater,XNA,Cash,x-sell,Credit and cash offices,-1,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,1784265,202054,Cash loans,31924.395,337500.0,404055.0,,337500.0,THURSDAY,9,Y,1,,,,Repairs,Refused,-781,Cash through the bank,HC,,Repeater,XNA,Cash,walk-in,Credit and cash offices,-1,XNA,24.0,high,Cash Street: high,,,,,,


In [137]:
# check inst data
inst.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525
2,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.13,24350.13
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.04,2160.585


In [138]:
# check poca data
poca.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,1803195,182943,-31,48.0,45.0,Active,0,0
1,1715348,367990,-33,36.0,35.0,Active,0,0
2,1784872,397406,-32,12.0,9.0,Active,0,0
3,1903291,269225,-35,48.0,42.0,Active,0,0
4,2341044,334279,-35,36.0,35.0,Active,0,0


In [139]:
# check card data
card.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,AMT_PAYMENT_CURRENT,AMT_PAYMENT_TOTAL_CURRENT,AMT_RECEIVABLE_PRINCIPAL,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,2562384,378907,-6,56.97,135000,0.0,877.5,0.0,877.5,1700.325,1800.0,1800.0,0.0,0.0,0.0,0.0,1,0.0,1.0,35.0,Active,0,0
1,2582071,363914,-1,63975.555,45000,2250.0,2250.0,0.0,0.0,2250.0,2250.0,2250.0,60175.08,64875.555,64875.555,1.0,1,0.0,0.0,69.0,Active,0,0
2,1740877,371185,-7,31815.225,450000,0.0,0.0,0.0,0.0,2250.0,2250.0,2250.0,26926.425,31460.085,31460.085,0.0,0,0.0,0.0,30.0,Active,0,0
3,1389973,337855,-4,236572.11,225000,2250.0,2250.0,0.0,0.0,11795.76,11925.0,11925.0,224949.285,233048.97,233048.97,1.0,1,0.0,0.0,10.0,Active,0,0
4,1891521,126868,-1,453919.455,450000,0.0,11547.0,0.0,11547.0,22924.89,27000.0,27000.0,443044.395,453919.455,453919.455,0.0,1,0.0,1.0,101.0,Active,0,0


### 4.3.2. INST DATA

In [140]:
### FEATURE ENGINEERING

# day difference
inst["DAYS_INST_DIF"] = inst["DAYS_INSTALMENT"] - inst["DAYS_ENTRY_PAYMENT"]

# percentage paid
inst["AMT_PERCENT_PAID"] = inst["AMT_PAYMENT"] / inst["AMT_INSTALMENT"]

# logarithms
inst["AMT_INSTALMENT"] = np.log(1 + inst["AMT_INSTALMENT"])
inst["AMT_PAYMENT"]    = np.log(1 + inst["AMT_PAYMENT"])

In [141]:
# count missings
nas = missing_data(inst)
nas

Unnamed: 0,Total,Percent
AMT_PERCENT_PAID,2907,0.021367
DAYS_INST_DIF,2905,0.021352
AMT_PAYMENT,2905,0.021352
DAYS_ENTRY_PAYMENT,2905,0.021352


In [142]:
##### IMPUTE MISSINGS

### PAYMENT INFO

# find variables
payment_vars = ["DAYS_INST_DIF", "AMT_PAYMENT", "DAYS_ENTRY_PAYMENT"]

# dummy indicator
inst["isnull_PAYMENTS"] = (inst[payment_vars].isnull().sum(axis = 1) == 0).astype(int)

# impute stats_vars
for var in payment_vars:
    inst[var].fillna((inst[var].median()), inplace = True)
         
             
### OTHER FEATURES
             
# find variables
other_vars = ["AMT_PERCENT_PAID"]
for var in other_vars:
    inst[var].fillna((inst[var].median()), inplace = True)

In [143]:
### AGGREGATIONS

# count instalments
cnt_inst = inst[["SK_ID_PREV", "NUM_INSTALMENT_NUMBER"]].groupby("SK_ID_PREV").count()
del inst["NUM_INSTALMENT_NUMBER"]

# delete ID_CURR
inst_id = inst[["SK_ID_CURR", "SK_ID_PREV"]]
del inst["SK_ID_CURR"]

# aggregate data
agg_inst = aggregate_data(inst, id_var = "SK_ID_PREV")

# add instalment count
agg_inst["inst_INST_COUNT"] = cnt_inst

# put back ID_CURR
inst_id = inst_id.drop_duplicates()
agg_inst = inst_id.merge(right = agg_inst.reset_index(), how = "right", on = "SK_ID_PREV")
del agg_inst["SK_ID_PREV"]

# aggregate data (round 2)
agg_inst = aggregate_data(agg_inst, id_var = "SK_ID_CURR", label = "inst")

- Preparing the dataset...
- Extracted 0 factors and 8 numerics...
- Aggregating numeric features...
- Final dimensions: (997752, 32)
- Preparing the dataset...
- Extracted 0 factors and 33 numerics...
- Aggregating numeric features...
- Final dimensions: (339587, 132)


In [144]:
# count missings
nas = missing_data(agg_inst)
nas

Unnamed: 0,Total,Percent
inst_AMT_PERCENT_PAID_std_std,99692,29.356836
inst_isnull_PAYMENTS_std_std,99686,29.35507
inst_DAYS_INSTALMENT_std_std,99686,29.35507
inst_AMT_INSTALMENT_std_std,99686,29.35507
inst_DAYS_ENTRY_PAYMENT_std_std,99686,29.35507
inst_DAYS_INST_DIF_std_std,99686,29.35507
inst_AMT_PAYMENT_std_std,99686,29.35507
inst_NUM_INSTALMENT_VERSION_std_std,99686,29.35507
inst_AMT_PERCENT_PAID_max_std,94657,27.874153
inst_AMT_PERCENT_PAID_mean_std,94657,27.874153


In [145]:
# check data
agg_inst.head()

Unnamed: 0_level_0,inst_NUM_INSTALMENT_VERSION_mean_mean,inst_NUM_INSTALMENT_VERSION_mean_std,inst_NUM_INSTALMENT_VERSION_mean_min,inst_NUM_INSTALMENT_VERSION_mean_max,inst_DAYS_INST_DIF_mean_mean,inst_DAYS_INST_DIF_mean_std,inst_DAYS_INST_DIF_mean_min,inst_DAYS_INST_DIF_mean_max,inst_AMT_PAYMENT_min_mean,inst_AMT_PAYMENT_min_std,inst_AMT_PAYMENT_min_min,inst_AMT_PAYMENT_min_max,inst_AMT_INSTALMENT_mean_mean,inst_AMT_INSTALMENT_mean_std,inst_AMT_INSTALMENT_mean_min,inst_AMT_INSTALMENT_mean_max,inst_AMT_INSTALMENT_max_mean,inst_AMT_INSTALMENT_max_std,inst_AMT_INSTALMENT_max_min,inst_AMT_INSTALMENT_max_max,inst_DAYS_ENTRY_PAYMENT_min_mean,inst_DAYS_ENTRY_PAYMENT_min_std,inst_DAYS_ENTRY_PAYMENT_min_min,inst_DAYS_ENTRY_PAYMENT_min_max,inst_DAYS_INST_DIF_std_mean,inst_DAYS_INST_DIF_std_std,inst_DAYS_INST_DIF_std_min,inst_DAYS_INST_DIF_std_max,inst_NUM_INSTALMENT_VERSION_max_mean,inst_NUM_INSTALMENT_VERSION_max_std,inst_NUM_INSTALMENT_VERSION_max_min,inst_NUM_INSTALMENT_VERSION_max_max,inst_AMT_PERCENT_PAID_std_mean,inst_AMT_PERCENT_PAID_std_std,inst_AMT_PERCENT_PAID_std_min,inst_AMT_PERCENT_PAID_std_max,inst_isnull_PAYMENTS_max_mean,inst_isnull_PAYMENTS_max_std,inst_isnull_PAYMENTS_max_min,inst_isnull_PAYMENTS_max_max,inst_DAYS_ENTRY_PAYMENT_max_mean,inst_DAYS_ENTRY_PAYMENT_max_std,inst_DAYS_ENTRY_PAYMENT_max_min,inst_DAYS_ENTRY_PAYMENT_max_max,inst_NUM_INSTALMENT_VERSION_min_mean,inst_NUM_INSTALMENT_VERSION_min_std,inst_NUM_INSTALMENT_VERSION_min_min,inst_NUM_INSTALMENT_VERSION_min_max,inst_AMT_INSTALMENT_std_mean,inst_AMT_INSTALMENT_std_std,inst_AMT_INSTALMENT_std_min,inst_AMT_INSTALMENT_std_max,inst_AMT_PAYMENT_max_mean,inst_AMT_PAYMENT_max_std,inst_AMT_PAYMENT_max_min,inst_AMT_PAYMENT_max_max,inst_AMT_PERCENT_PAID_min_mean,inst_AMT_PERCENT_PAID_min_std,inst_AMT_PERCENT_PAID_min_min,inst_AMT_PERCENT_PAID_min_max,inst_isnull_PAYMENTS_min_mean,inst_isnull_PAYMENTS_min_std,inst_isnull_PAYMENTS_min_min,inst_isnull_PAYMENTS_min_max,inst_inst_INST_COUNT_mean,inst_inst_INST_COUNT_std,inst_inst_INST_COUNT_min,inst_inst_INST_COUNT_max,inst_AMT_PERCENT_PAID_max_mean,inst_AMT_PERCENT_PAID_max_std,inst_AMT_PERCENT_PAID_max_min,inst_AMT_PERCENT_PAID_max_max,inst_AMT_PAYMENT_mean_mean,inst_AMT_PAYMENT_mean_std,inst_AMT_PAYMENT_mean_min,inst_AMT_PAYMENT_mean_max,inst_DAYS_INST_DIF_min_mean,inst_DAYS_INST_DIF_min_std,inst_DAYS_INST_DIF_min_min,inst_DAYS_INST_DIF_min_max,inst_DAYS_ENTRY_PAYMENT_std_mean,inst_DAYS_ENTRY_PAYMENT_std_std,inst_DAYS_ENTRY_PAYMENT_std_min,inst_DAYS_ENTRY_PAYMENT_std_max,inst_DAYS_INSTALMENT_std_mean,inst_DAYS_INSTALMENT_std_std,inst_DAYS_INSTALMENT_std_min,inst_DAYS_INSTALMENT_std_max,inst_DAYS_INSTALMENT_min_mean,inst_DAYS_INSTALMENT_min_std,inst_DAYS_INSTALMENT_min_min,inst_DAYS_INSTALMENT_min_max,inst_isnull_PAYMENTS_mean_mean,inst_isnull_PAYMENTS_mean_std,inst_isnull_PAYMENTS_mean_min,inst_isnull_PAYMENTS_mean_max,inst_DAYS_INSTALMENT_mean_mean,inst_DAYS_INSTALMENT_mean_std,inst_DAYS_INSTALMENT_mean_min,inst_DAYS_INSTALMENT_mean_max,inst_isnull_PAYMENTS_std_mean,inst_isnull_PAYMENTS_std_std,inst_isnull_PAYMENTS_std_min,inst_isnull_PAYMENTS_std_max,inst_NUM_INSTALMENT_VERSION_std_mean,inst_NUM_INSTALMENT_VERSION_std_std,inst_NUM_INSTALMENT_VERSION_std_min,inst_NUM_INSTALMENT_VERSION_std_max,inst_DAYS_ENTRY_PAYMENT_mean_mean,inst_DAYS_ENTRY_PAYMENT_mean_std,inst_DAYS_ENTRY_PAYMENT_mean_min,inst_DAYS_ENTRY_PAYMENT_mean_max,inst_DAYS_INST_DIF_max_mean,inst_DAYS_INST_DIF_max_std,inst_DAYS_INST_DIF_max_min,inst_DAYS_INST_DIF_max_max,inst_DAYS_INSTALMENT_max_mean,inst_DAYS_INSTALMENT_max_std,inst_DAYS_INSTALMENT_max_min,inst_DAYS_INSTALMENT_max_max,inst_AMT_INSTALMENT_min_mean,inst_AMT_INSTALMENT_min_std,inst_AMT_INSTALMENT_min_min,inst_AMT_INSTALMENT_min_max,inst_AMT_PAYMENT_std_mean,inst_AMT_PAYMENT_std_std,inst_AMT_PAYMENT_std_min,inst_AMT_PAYMENT_std_max,inst_AMT_PERCENT_PAID_mean_mean,inst_AMT_PERCENT_PAID_mean_std,inst_AMT_PERCENT_PAID_mean_min,inst_AMT_PERCENT_PAID_mean_max
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1
100001,1.125,0.176777,1.0,1.25,5.916667,13.55288,-3.666667,15.5,8.285749,0.005334,8.281977,8.289521,8.471116,0.256549,8.289709,8.652523,9.026983,1.042529,8.289803,9.764162,-2315.5,849.235244,-2916.0,-1715.0,10.085564,5.281679,6.350853,13.820275,1.5,0.707107,1.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,1,1,-2242.0,868.327127,-2856.0,-1628.0,1.0,0.0,1.0,1.0,0.370628,0.523916,0.000163,0.741093,9.026983,1.042529,8.289803,9.764162,1.0,0.0,1.0,1.0,1.0,0.0,1,1,3.5,0.707107,3,4,1.0,0.0,1.0,1.0,8.471116,0.256549,8.289709,8.652523,-2.5,12.020815,-11.0,6.0,36.844052,8.738705,30.664855,43.02325,34.364917,6.172924,30.0,38.729833,-2312.5,853.477885,-2916.0,-1709.0,1.0,0.0,1.0,1.0,-2275.0,864.084487,-2886.0,-1664.0,0.0,0.0,0.0,0.0,0.25,0.353553,0.0,0.5,-2280.916667,850.531607,-2882.333333,-1679.5,18.0,25.455844,0.0,36.0,-2237.5,874.691088,-2856.0,-1619.0,8.285749,0.005334,8.281977,8.289521,0.370628,0.523916,0.000163,0.741093,1.0,0.0,1.0,1.0
100002,1.052632,,1.052632,1.052632,20.421053,,20.421053,20.421053,9.132679,,9.132679,9.132679,9.224634,,9.224634,9.224634,10.879833,,10.879833,10.879833,-587.0,,-587.0,-587.0,4.925171,,4.925171,4.925171,2.0,,2.0,2.0,0.0,,0.0,0.0,1.0,,1,1,-49.0,,-49.0,-49.0,1.0,,1.0,1.0,0.400825,,0.400825,0.400825,10.879833,,10.879833,10.879833,1.0,,1.0,1.0,1.0,,1,1,19.0,,19,19,1.0,,1.0,1.0,9.224634,,9.224634,9.224634,12.0,,12.0,12.0,172.058877,,172.058877,172.058877,168.81943,,168.81943,168.81943,-565.0,,-565.0,-565.0,1.0,,1.0,1.0,-295.0,,-295.0,-295.0,0.0,,0.0,0.0,0.229416,,0.229416,0.229416,-315.421053,,-315.421053,-315.421053,31.0,,31.0,31.0,-25.0,,-25.0,-25.0,9.132679,,9.132679,9.132679,0.400825,,0.400825,0.400825,1.0,,1.0,1.0
100003,1.047619,0.082479,1.0,1.142857,7.448413,3.422911,4.428571,11.166667,10.457264,1.447079,8.804471,11.496369,10.544809,1.535398,8.81464,11.745057,11.042744,2.210992,8.815564,13.237184,-1283.0,902.581298,-2324.0,-719.0,2.48377,0.985781,1.718249,3.596084,1.333333,0.57735,1.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,1,1,-1063.333333,800.327641,-1985.0,-544.0,1.0,0.0,1.0,1.0,0.221007,0.378419,0.001851,0.657966,11.042744,2.210992,8.815564,13.237184,1.0,0.0,1.0,1.0,1.0,0.0,1,1,8.333333,3.21455,6,12,1.0,0.0,1.0,1.0,10.544809,1.535398,8.81464,11.745057,4.333333,4.163332,1.0,9.0,76.380137,30.572883,54.153178,111.24594,76.366269,27.879912,56.124861,108.166538,-1274.333333,897.827563,-2310.0,-716.0,1.0,0.0,1.0,1.0,-1164.333333,850.637604,-2145.0,-626.0,0.0,0.0,0.0,0.0,0.125988,0.218218,0.0,0.377964,-1171.781746,850.230624,-2151.75,-630.428571,12.0,3.464102,8.0,14.0,-1054.333333,803.569744,-1980.0,-536.0,10.457264,1.447079,8.804471,11.496369,0.221007,0.378419,0.001851,0.657966,1.0,0.0,1.0,1.0
100004,1.333333,,1.333333,1.333333,7.666667,,7.666667,7.666667,8.586393,,8.586393,8.586393,8.81301,,8.81301,8.81301,9.266245,,9.266245,9.266245,-795.0,,-795.0,-795.0,4.163332,,4.163332,4.163332,2.0,,2.0,2.0,0.0,,0.0,0.0,1.0,,1,1,-727.0,,-727.0,-727.0,1.0,,1.0,1.0,0.392513,,0.392513,0.392513,9.266245,,9.266245,9.266245,1.0,,1.0,1.0,1.0,,1,1,3.0,,3,3,1.0,,1.0,1.0,8.81301,,8.81301,8.81301,3.0,,3.0,3.0,34.019602,,34.019602,34.019602,30.0,,30.0,30.0,-784.0,,-784.0,-784.0,1.0,,1.0,1.0,-754.0,,-754.0,-754.0,0.0,,0.0,0.0,0.57735,,0.57735,0.57735,-761.666667,,-761.666667,-761.666667,11.0,,11.0,11.0,-724.0,,-724.0,-724.0,8.586393,,8.586393,8.586393,0.392513,,0.392513,0.392513,1.0,,1.0,1.0
100005,1.111111,,1.111111,1.111111,23.555556,,23.555556,23.555556,8.479325,,8.479325,8.479325,8.623723,,8.623723,8.623723,9.778901,,9.778901,9.778901,-736.0,,-736.0,-736.0,13.510284,,13.510284,13.510284,2.0,,2.0,2.0,0.0,,0.0,0.0,1.0,,1,1,-470.0,,-470.0,-470.0,1.0,,1.0,1.0,0.433192,,0.433192,0.433192,9.778901,,9.778901,9.778901,1.0,,1.0,1.0,1.0,,1,1,9.0,,9,9,1.0,,1.0,1.0,8.623723,,8.623723,8.623723,-1.0,,-1.0,-1.0,90.554005,,90.554005,90.554005,82.158384,,82.158384,82.158384,-706.0,,-706.0,-706.0,1.0,,1.0,1.0,-586.0,,-586.0,-586.0,0.0,,0.0,0.0,0.333333,,0.333333,0.333333,-609.555556,,-609.555556,-609.555556,37.0,,37.0,37.0,-466.0,,-466.0,-466.0,8.479325,,8.479325,8.479325,0.433192,,0.433192,0.433192,1.0,,1.0,1.0


In [146]:
# clear memory
del inst

### 4.3.3. POCA DATA

In [147]:
### FEATURE ENGINEERING

# installments percentage
poca["INSTALLMENTS_PERCENT"] = poca["CNT_INSTALMENT_FUTURE"] / poca["CNT_INSTALMENT"]

In [148]:
# count missings
nas = missing_data(poca)
nas

Unnamed: 0,Total,Percent
INSTALLMENTS_PERCENT,26184,0.261804
CNT_INSTALMENT_FUTURE,26087,0.260835
CNT_INSTALMENT,26071,0.260675


In [149]:
##### IMPUTE MISSINGS
             
### OTHER FEATURES
             
# find variables
other_vars = ["INSTALLMENTS_PERCENT", "CNT_INSTALMENT_FUTURE", "CNT_INSTALMENT"]
for var in other_vars:
    poca[var].fillna((poca[var].median()), inplace = True)

In [None]:
### AGGREGATIONS

# count months
cnt_mon = poca[["SK_ID_PREV", "MONTHS_BALANCE"]].groupby("SK_ID_PREV").count()
del poca["MONTHS_BALANCE"]

# delete ID_CURR
poca_id = poca[["SK_ID_CURR", "SK_ID_PREV"]]
del poca["SK_ID_CURR"]

# aggregate data
agg_poca = aggregate_data(poca, id_var = "SK_ID_PREV")

# add month count
agg_poca["poca_MON_COUNT"] = cnt_mon

# put back ID_CURR
poca_id = poca_id.drop_duplicates()
agg_poca = poca_id.merge(right = agg_poca.reset_index(), how = "right", on = "SK_ID_PREV")
del agg_poca["SK_ID_PREV"]

# aggregate data (round 2)
agg_poca = aggregate_data(agg_poca, id_var = "SK_ID_CURR", label = "poca")

- Preparing the dataset...
- Extracted 1 factors and 5 numerics...
- Aggregating numeric features...
- Aggregating factor features...


In [None]:
# count missings
nas = missing_data(agg_poca)
nas

In [None]:
# check data
agg_poca.head()

In [None]:
# clear memory
del poca

### 4.3.4. CARD DATA

In [None]:
### FEATURE ENGINEERING

# logarithms
card["AMT_BALANCE"]                = np.log(1 + card["AMT_BALANCE"])
card["AMT_CREDIT_LIMIT_ACTUAL"]    = np.log(1 + card["AMT_CREDIT_LIMIT_ACTUAL"])
card["AMT_DRAWINGS_ATM_CURRENT"]   = np.log(1 + card["AMT_DRAWINGS_ATM_CURRENT"])
card["AMT_DRAWINGS_CURRENT"]       = np.log(1 + card["AMT_DRAWINGS_CURRENT"])
card["AMT_DRAWINGS_OTHER_CURRENT"] = np.log(1 + card["AMT_DRAWINGS_OTHER_CURRENT"])
card["AMT_DRAWINGS_POS_CURRENT"]   = np.log(1 + card["AMT_DRAWINGS_POS_CURRENT"])
card["AMT_INST_MIN_REGULARITY"]    = np.log(1 + card["AMT_INST_MIN_REGULARITY"])
card["AMT_PAYMENT_CURRENT"]        = np.log(1 + card["AMT_PAYMENT_CURRENT"])
card["AMT_PAYMENT_TOTAL_CURRENT"]  = np.log(1 + card["AMT_PAYMENT_TOTAL_CURRENT"])
card["AMT_RECEIVABLE_PRINCIPAL"]   = np.log(1 + card["AMT_RECEIVABLE_PRINCIPAL"])
card["AMT_RECIVABLE"]              = np.log(1 + card["AMT_RECIVABLE"])
card["AMT_TOTAL_RECEIVABLE"]       = np.log(1 + card["AMT_TOTAL_RECEIVABLE"])

In [None]:
# count missings
nas = missing_data(card)
nas

In [None]:
##### IMPUTE MISSINGS
             
### AMOUNTS
amnts = ["AMT_PAYMENT_CURRENT", "AMT_DRAWINGS_ATM_CURRENT", "AMT_DRAWINGS_OTHER_CURRENT", "CNT_INSTALMENT_MATURE_CUM"] 
for var in amnts:
    card["isnull_" + var] = card[var].isnull() + 0


### OTHER FEATURES    
    
# find variables
other_vars = ["AMT_PAYMENT_CURRENT", "AMT_DRAWINGS_ATM_CURRENT",
              "AMT_DRAWINGS_OTHER_CURRENT", "CNT_DRAWINGS_POS_CURRENT",
              "CNT_DRAWINGS_OTHER_CURRENT", "AMT_DRAWINGS_POS_CURRENT",
              "CNT_DRAWINGS_ATM_CURRENT", "CNT_INSTALMENT_MATURE_CUM",
              "AMT_INST_MIN_REGULARITY", "AMT_RECIVABLE", "AMT_TOTAL_RECEIVABLE",
              "AMT_RECEIVABLE_PRINCIPAL", "AMT_BALANCE", "AMT_DRAWINGS_CURRENT"]
for var in other_vars:
    card[var].fillna((card[var].median()), inplace = True)

In [None]:
### AGGREGATIONS

# count months
cnt_mon = card[["SK_ID_PREV", "MONTHS_BALANCE"]].groupby("SK_ID_PREV").count()
del card["MONTHS_BALANCE"]

# delete ID_CURR
card_id = card[["SK_ID_CURR", "SK_ID_PREV"]]
del card["SK_ID_CURR"]

# aggregate data
agg_card = aggregate_data(card, id_var = "SK_ID_PREV")

# add month count
agg_card["card_MON_COUNT"] = cnt_mon

# put back ID_CURR
card_id = card_id.drop_duplicates()
agg_card = card_id.merge(right = agg_card.reset_index(), how = "right", on = "SK_ID_PREV")
del agg_card["SK_ID_PREV"]

# aggregate data (round 2)
agg_card = aggregate_data(agg_card, id_var = "SK_ID_CURR", label = "card")

In [None]:
# count missings
nas = missing_data(agg_card)
nas

In [None]:
# check data
agg_card.head()

In [None]:
# clear memory
del card

### 4.3.5. PREV DATA

In [None]:
### FEATURE ENGINEERING

# logarithms
prev["AMT_ANNUITY"]      = np.log(1 + prev["AMT_ANNUITY"])
prev["AMT_APPLICATION"]  = np.log(1 + prev["AMT_APPLICATION"])
prev["AMT_CREDIT"]       = np.log(1 + prev["AMT_CREDIT"])
prev["AMT_DOWN_PAYMENT"] = np.log(1 + prev["AMT_DOWN_PAYMENT"])
prev["AMT_GOODS_PRICE"]  = np.log(1 + prev["AMT_GOODS_PRICE"])

In [None]:
# count missings
nas = missing_data(prev)
nas

In [None]:
##### IMPUTE MISSINGS

### FACTORS
prev_facs = [f for f in prev.columns if prev[f].dtype == "object"]
for var in prev_facs:
    prev[var].fillna("Unknown level", inplace = True)       
    

### MERGED FEATURES

# impute -99 for CARD
cards = prev.filter(like = "card_").columns
for var in cards:
    prev[var].fillna(-99, inplace = True)
    
# impute -90 for INST
insts = prev.filter(like = "inst_").columns
for var in insts:
    prev[var].fillna(-99, inplace = True)
    
# impute -90 for POCA
pocas = prev.filter(like = "poca_").columns
for var in pocas:
    prev[var].fillna(-99, inplace = True)
    
    
### INTERST RATES
rate_vars = ["RATE_INTEREST_PRIVILEGED", "RATE_INTEREST_PRIMARY"]
for var in rate_vars:
    prev[var].fillna(-99, inplace = True)
    
    
### OTHER FEATURES
             
# find variables
other_vars = ["AMT_DOWN_PAYMENT", "RATE_DOWN_PAYMENT", "DAYS_FIRST_DUE",
              "DAYS_LAST_DUE", "DAYS_FIRST_DRAWING", "DAYS_LAST_DUE_1ST_VERSION",
              "DAYS_TERMINATION", "NFLAG_INSURED_ON_APPROVAL", "AMT_GOODS_PRICE",
              "AMT_ANNUITY", "CNT_PAYMENT", "AMT_CREDIT"]
for var in other_vars:
    prev[var].fillna(-99, inplace = True)

In [None]:
### AGGREGATIONS

# count previous loans
cnt_loan = prev[["SK_ID_CURR", "SK_ID_PREV"]].groupby("SK_ID_CURR").count()
del prev["SK_ID_PREV"]

# aggregate data
agg_prev = aggregate_data(prev, id_var = "SK_ID_CURR", label = "prev")

# add previous loan count
agg_prev["prev_LOAN_COUNT"] = cnt_loan

In [None]:
# count missings
nas = missing_data(agg_prev)
nas

In [None]:
# check data
agg_prev.head()

In [None]:
# clear memory
del prev

## 4.4. BUILD DATASET

In [None]:
# merge data
print(appl.shape)
appl = appl.merge(right = agg_buro.reset_index(), how = "left", on = "SK_ID_CURR")
print(appl.shape)
appl = appl.merge(right = agg_prev.reset_index(), how = "left", on = "SK_ID_CURR")
print(appl.shape)
appl = appl.merge(right = agg_inst.reset_index(), how = "left", on = "SK_ID_CURR")
print(appl.shape)
appl = appl.merge(right = agg_poca.reset_index(), how = "left", on = "SK_ID_CURR")
print(appl.shape)
#appl = appl.merge(right = agg_card.reset_index(), how = "left", on = "SK_ID_CURR")
#print(appl.shape)

In [None]:
# count missings
nas = missing_data(appl)
nas

In [None]:
##### IMPUTE MISSINGS

### FACTORS
appl_factors = [f for f in appl.columns if appl[f].dtype == "object"]
for var in appl_factors:
    appl[var].fillna("Unknown level", inplace = True)       


### MERGED FEATURES
dummies = ["buro_MONTH_COUNT_sum", "prev_DAYS_LAST_DUE_sum"]
for var in dummies:
    appl["isnull_" + var] = appl[var].isnull() + 0

# impute -99 for PREV
appls = appl.filter(like = "prev_").columns
for var in appls:
    appl[var].fillna(-99, inplace = True)
    
# impute -99 for BURO
buros = appl.filter(like = "buro_").columns
for var in buros:
    appl[var].fillna(-99, inplace = True)

In [None]:
# label encoder for factors
data_factors = [f for f in appl.columns if appl[f].dtype == "object"]
le = LabelEncoder()
for var in data_factors:
    appl[var] = le.fit_transform(appl[var])

In [None]:
# count missings
nas = missing_data(appl)
nas

In [None]:
# check data
appl.head()

In [None]:
# partitioning
train = appl[appl["SK_ID_CURR"].isin(y["SK_ID_CURR"]) == True]
test  = appl[appl["SK_ID_CURR"].isin(y["SK_ID_CURR"]) == False]
del appl

In [None]:
# check dimensions
print(train.shape)
print(test.shape)

# 5. DATA EXPORT

In [None]:
# export CSV
train.to_csv("../data/prepared/train_redd.csv", index = False, float_format = "%.8f")
test.to_csv("../data/prepared/test_redd.csv",   index = False, float_format = "%.8f")
y.to_csv("../data/prepared/y_redd.csv",         index = False, float_format = "%.8f")