# 1. SETTINGS

In [None]:
# libraries
import pandas as pd
import numpy as np
import scipy.stats
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

In [None]:
# pandas options: never truncate the column list when a DataFrame is displayed
pd.set_option("display.max_columns", None)

In [None]:
# ignore warnings
# NOTE(review): this filter silences every warning category for the rest of
# the session (deprecations and pandas assignment warnings included);
# consider narrowing it to specific categories.
import warnings
warnings.filterwarnings("ignore")

In [None]:
# garbage collection: make sure automatic collection is switched on so the
# `del` statements used throughout this file can actually release memory
import gc
gc.enable()

# 2. FUNCTIONS

In [None]:
##### FUNCTION FOR COUNTING MISSINGS
def count_missings(data):
    """Summarise the missing values of every column in ``data``.

    Returns a DataFrame indexed by column name with two columns:
    "Total" (number of null cells) and "Percent" (null cells as a
    percentage of all rows). Rows are ordered by descending count and
    only columns with at least one null are kept.
    """
    null_mask = data.isnull()
    null_counts = null_mask.sum()
    row_counts = null_mask.count()

    total = null_counts.sort_values(ascending = False)
    percent = (null_counts / row_counts * 100).sort_values(ascending = False)

    summary = pd.concat({"Total": total, "Percent": percent}, axis = 1)
    return summary[summary["Total"] > 0]

In [None]:
##### FUNCTION FOR CONVERTING DAY COUNTS
def convert_days(data, features, t = 12, rounding = True, replace = False):
    """Flip the sign of day-count columns and rescale them by ``t``.

    For every column in ``features`` the value ``-data[var] / t`` is
    computed, optionally rounded, and every result below zero is replaced
    by a missing value.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns; it is modified in place and returned.
    features : iterable
        Labels of the columns to convert.
    t : number, default 12
        Divisor applied after the sign flip.
    rounding : bool, default True
        Round the converted values to whole numbers.
    replace : bool, default False
        If true the source column is overwritten, otherwise the result is
        stored in a new column named ``"CONVERTED_" + <label>``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.
    """
    for var in features:
        converted = -data[var] / t
        if rounding:
            converted = converted.round()

        # Blank out negative results on the Series itself; the previous
        # chained form `data[col][mask] = None` writes to a temporary object
        # and is not guaranteed to reach `data` (pandas Copy-on-Write).
        converted = converted.mask(converted < 0)

        target = var if replace else "CONVERTED_" + str(var)
        data[target] = converted
    return data

In [None]:
##### FUNCTION FOR CREATING LOGARITHMS
def create_logs(data, features, replace = False):
    """Apply ``log(|x| + 1)`` to the listed columns of ``data``.

    With ``replace = True`` each source column is overwritten; otherwise
    the transformed values go into a new ``"LOG_" + <label>`` column.
    ``data`` is modified in place and returned.
    """
    for var in features:
        target = var if replace == True else "LOG_" + str(var)
        data[target] = np.log(data[var].abs() + 1)
    return data

In [None]:
##### FUNCTION FOR CREATING FLAGS FOR MISSINGS
def create_null_flags(data, features = None):
    """Add 0/1 indicator columns for columns that contain missing values.

    Parameters
    ----------
    data : pandas.DataFrame
        Modified in place and returned.
    features : iterable of column labels, optional
        Columns to inspect; every column of ``data`` when omitted.

    Returns
    -------
    pandas.DataFrame
        ``data`` with an ``"ISNULL_" + <label>`` column for each inspected
        column that has at least one null (1 where the value is null).
    """
    # `is None` rather than `== None`: the comparison is elementwise for an
    # Index/array argument and its truth value is then ambiguous.
    if features is None:
        features = data.columns

    # iterate over a snapshot because new columns are added inside the loop
    for var in list(features):
        is_null = data[var].isnull()
        if is_null.any():
            data["ISNULL_" + str(var)] = is_null.astype("int64")
    return data

In [None]:
##### FUNCTION FOR TREATING FACTORS
def treat_factors(data, method = "label"):
    """Encode the object-typed columns of ``data``.

    ``method = "label"`` replaces every object column by the integer codes
    from ``pd.factorize`` (in place). ``method = "dummy"`` returns the
    result of ``pd.get_dummies(data, drop_first = True)``. Any other value
    returns ``data`` untouched.
    """
    if method == "label":
        # integer codes per object column
        object_columns = [name for name in data.columns if data[name].dtype == "object"]
        for name in object_columns:
            codes, _ = pd.factorize(data[name])
            data[name] = codes
    elif method == "dummy":
        # one indicator column per level, first level dropped
        data = pd.get_dummies(data, drop_first = True)

    return data

In [None]:
##### FUNCTION FOR COMPUTING ACCEPT/REJECT RATIOS
def compute_accept_reject_ratio(data, lags = (1, 3, 5)):
    """Add per-client approval and refusal shares over the first ``t`` rows.

    Rows are ordered per ``SK_ID_CURR`` by ascending ``-DAYS_DECISION``.
    For every ``t`` in ``lags`` the first ``t`` rows of each client are
    taken and the share of rows whose ``NAME_CONTRACT_STATUS`` equals
    "Approved" / "Refused" is merged back as ``APPROVE_RATIO_<t>`` and
    ``REJECT_RATIO_<t>``.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``SK_ID_CURR``, ``DAYS_DECISION`` and
        ``NAME_CONTRACT_STATUS``. It is not modified.
    lags : iterable of int, default (1, 3, 5)
        Window sizes (number of rows per client).

    Returns
    -------
    pandas.DataFrame
        A new frame: ``data`` left-joined with the ratio columns.
    """

    # preparations (work on a copy so the caller's frame is left alone)
    dec_prev = data[["SK_ID_CURR", "DAYS_DECISION", "NAME_CONTRACT_STATUS"]].copy()
    dec_prev["DAYS_DECISION"] = -dec_prev["DAYS_DECISION"]
    dec_prev = dec_prev.sort_values(by = ["SK_ID_CURR", "DAYS_DECISION"])

    # explicit indicators instead of get_dummies: they exist even when a
    # status level does not occur in the data
    dec_prev["IS_APPROVED"] = (dec_prev["NAME_CONTRACT_STATUS"] == "Approved").astype(float)
    dec_prev["IS_REFUSED"]  = (dec_prev["NAME_CONTRACT_STATUS"] == "Refused").astype(float)

    # computation
    for t in lags:

        # first t rows per client (this used to be head(1) for every lag,
        # which made all lag columns identical)
        window = dec_prev.groupby("SK_ID_CURR").head(t)

        # acceptance and rejection ratios inside the window
        ratios = window.groupby("SK_ID_CURR", as_index = False)[["IS_APPROVED", "IS_REFUSED"]].mean()
        ratios.columns = ["SK_ID_CURR", "APPROVE_RATIO_" + str(t), "REJECT_RATIO_" + str(t)]
        data = data.merge(ratios, how = "left", on = "SK_ID_CURR")

    # dataset
    return data

In [None]:
##### FUNCTION FOR AGGREGATING DATA
def aggregate_data(data, id_var, label = None):
    """Collapse ``data`` to one row per value of ``id_var``.

    Object/string columns ("factors") are summarised by their most frequent
    value (``<col>_mode``) and number of distinct values (``<col>_unique``);
    all remaining columns are summarised by ``mean``, ``std``, ``min`` and
    ``max`` (``<col>_<stat>``). Progress messages are printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; not modified.
    id_var : column label
        Grouping key; becomes the index of the result.
    label : str, optional
        When given, every output column is prefixed with ``label + "_"``.

    Returns
    -------
    pandas.DataFrame
        Aggregates indexed by ``id_var`` (sorted). Numeric aggregates come
        first, then factor aggregates, each in the column order of
        ``data``. If ``data`` has no column besides ``id_var`` the result
        has one row per id and no columns.
    """

    def is_factor(column):
        # object columns plus pandas' dedicated string dtypes
        dtype = data[column].dtype
        return dtype == "object" or pd.api.types.is_string_dtype(dtype)

    def most_frequent(values):
        # Series.mode() ignores missing values and returns the modes sorted,
        # so ties resolve to the smallest value; empty -> missing
        modes = values.mode()
        return modes.iloc[0] if len(modes) > 0 else np.nan


    ### SEPARATE FEATURES

    # display info
    print("- Preparing the dataset...")

    # find factors; the id column is never treated as a feature, and list
    # comprehensions (not set arithmetic) keep the column order reproducible
    feature_cols  = [f for f in data.columns if f != id_var]
    data_factors  = [f for f in feature_cols if is_factor(f)]
    factor_set    = set(data_factors)
    data_numerics = [f for f in feature_cols if f not in factor_set]

    # partition subsets
    num_data = data[[id_var] + data_numerics]
    fac_data = data[[id_var] + data_factors]

    # display info
    num_facs = len(data_factors)
    num_nums = len(data_numerics)
    print("- Extracted %.0f factors and %.0f numerics..." % (num_facs, num_nums))


    ##### AGGREGATION

    parts = []

    # aggregate numerics
    if num_nums > 0:
        print("- Aggregating numeric features...")
        num_data = num_data.groupby(id_var).agg(["mean", "std", "min", "max"])
        num_data.columns = ["_".join(col).strip() for col in num_data.columns.values]
        parts.append(num_data.sort_index())

    # aggregate factors
    if num_facs > 0:
        print("- Aggregating factor features...")
        fac_data = fac_data.groupby(id_var).agg([("mode",   most_frequent),
                                                 ("unique", "nunique")])
        fac_data.columns = ["_".join(col).strip() for col in fac_data.columns.values]
        parts.append(fac_data.sort_index())


    ##### MERGER

    if len(parts) == 2:
        # merge numerics and factors
        agg_data = pd.concat(parts, axis = 1)
    elif len(parts) == 1:
        # numerics only or factors only
        agg_data = parts[0]
    else:
        # nothing to aggregate: keep one (empty) row per id
        agg_data = pd.DataFrame(index = data.groupby(id_var).size().index)


    ##### LAST STEPS

    # update labels
    if label is not None:
        agg_data.columns = [label + "_" + str(col) for col in agg_data.columns]

    # standard deviations of single-row groups stay missing (no imputation)

    # display info
    print("- Final dimensions:", agg_data.shape)

    # return dataset
    return agg_data

# 3. DATA IMPORT

In [None]:
# import data: application tables plus the six auxiliary tables that are
# aggregated and merged onto them below (paths are relative to the
# working directory)
train = pd.read_csv("../data/raw/application_train.csv")
test  = pd.read_csv("../data/raw/application_test.csv")
buro  = pd.read_csv("../data/raw/bureau.csv")
bbal  = pd.read_csv("../data/raw/bureau_balance.csv")
prev  = pd.read_csv("../data/raw/previous_application.csv")
card  = pd.read_csv("../data/raw/credit_card_balance.csv")
poca  = pd.read_csv("../data/raw/POS_CASH_balance.csv")
inst  = pd.read_csv("../data/raw/installments_payments.csv")

In [None]:
# check dimensions: (rows, columns) of every table that was just loaded
print("Application:", train.shape, test.shape)
print("Buro:", buro.shape)
print("Bbal:", bbal.shape)
print("Prev:", prev.shape)
print("Card:", card.shape)
print("Poca:", poca.shape)
print("Inst:", inst.shape)

In [None]:
# extract target: keep the client id next to the label, then remove the
# label from the training features
y = train.loc[:, ["SK_ID_CURR", "TARGET"]]
train = train.drop(columns = "TARGET")

# 4. PREPROCESSING

## 4.1. APPLICATION DATA

In [None]:
# concatenate application data
# ignore_index gives the stacked frame a fresh 0..n-1 index; without it the
# train and test row labels overlap, and duplicated labels make label-based
# assignment ambiguous
appl = pd.concat([train, test], ignore_index = True)
del train, test

In [None]:
### FEATURE ENGINEERING

# income ratios (computed on the raw amounts, before the log transform below)
appl["CREDIT_BY_INCOME"]      = appl["AMT_CREDIT"]      / appl["AMT_INCOME_TOTAL"]
appl["ANNUITY_BY_INCOME"]     = appl["AMT_ANNUITY"]     / appl["AMT_INCOME_TOTAL"]
appl["GOODS_PRICE_BY_INCOME"] = appl["AMT_GOODS_PRICE"] / appl["AMT_INCOME_TOTAL"]
appl["INCOME_PER_PERSON"]     = appl["AMT_INCOME_TOTAL"] / appl["CNT_FAM_MEMBERS"]

# career ratio; negative ratios are treated as missing
# (Series.mask replaces the chained `appl[col][mask] = None`, which assigns
# through a temporary object and may not update `appl`)
appl["PERCENT_WORKED"] = appl["DAYS_EMPLOYED"] / appl["DAYS_BIRTH"]
appl["PERCENT_WORKED"] = appl["PERCENT_WORKED"].mask(appl["PERCENT_WORKED"] < 0)

# number of adults
appl["CNT_ADULTS"] = appl["CNT_FAM_MEMBERS"] - appl["CNT_CHILDREN"]
appl['CHILDREN_RATIO'] = appl['CNT_CHILDREN'] / appl['CNT_FAM_MEMBERS']

# number of overall payments
# NOTE(review): this is the only feature name containing a space; kept as is
# because downstream consumers may rely on the exported column name
appl['ANNUITY LENGTH'] = appl['AMT_CREDIT'] / appl['AMT_ANNUITY']

# external sources
#appl["EXT_SOURCE_MIN"]  = appl[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].min(axis = 1)
#appl["EXT_SOURCE_MAX"]  = appl[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].max(axis = 1)
appl["EXT_SOURCE_MEAN"] = appl[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].mean(axis = 1)
#appl["EXT_SOURCE_SD"]   = appl[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].std(axis = 1)
appl["NUM_EXT_SOURCES"] = 3 - (appl["EXT_SOURCE_1"].isnull().astype(int) +
                               appl["EXT_SOURCE_2"].isnull().astype(int) +
                               appl["EXT_SOURCE_3"].isnull().astype(int))

# number of documents
doc_vars = ["FLAG_DOCUMENT_2",  "FLAG_DOCUMENT_3",  "FLAG_DOCUMENT_4",  "FLAG_DOCUMENT_5",  "FLAG_DOCUMENT_6",
            "FLAG_DOCUMENT_7",  "FLAG_DOCUMENT_8",  "FLAG_DOCUMENT_9",  "FLAG_DOCUMENT_10", "FLAG_DOCUMENT_11",
            "FLAG_DOCUMENT_12", "FLAG_DOCUMENT_13", "FLAG_DOCUMENT_14", "FLAG_DOCUMENT_15", "FLAG_DOCUMENT_16",
            "FLAG_DOCUMENT_17", "FLAG_DOCUMENT_18", "FLAG_DOCUMENT_19", "FLAG_DOCUMENT_20", "FLAG_DOCUMENT_21"]
appl["NUM_DOCUMENTS"] = appl[doc_vars].sum(axis = 1)

# application date: weekend vs. working day, built in one vectorised step
# instead of a default value followed by a chained assignment
appl["DAY_APPR_PROCESS_START"] = np.where(
    appl["WEEKDAY_APPR_PROCESS_START"].isin(["SATURDAY", "SUNDAY"]), "Weekend", "Working day")

# logarithms
log_vars = ["AMT_CREDIT", "AMT_INCOME_TOTAL", "AMT_GOODS_PRICE", "AMT_ANNUITY"]
appl = create_logs(appl, log_vars, replace = True)

# convert days (sign flipped, expressed in units of 30 days, rounded)
day_vars = ["DAYS_BIRTH", "DAYS_REGISTRATION", "DAYS_ID_PUBLISH", "DAYS_EMPLOYED", "DAYS_LAST_PHONE_CHANGE"]
appl = convert_days(appl, day_vars, t = 30, rounding = True, replace = True)

# age ratios (the DAYS_* columns are already in converted units here)
appl["OWN_CAR_AGE_RATIO"] = appl["OWN_CAR_AGE"] / appl["DAYS_BIRTH"]
appl["DAYS_ID_PUBLISHED_RATIO"] = appl["DAYS_ID_PUBLISH"] / appl["DAYS_BIRTH"]
appl["DAYS_REGISTRATION_RATIO"] = appl["DAYS_REGISTRATION"] / appl["DAYS_BIRTH"]
appl["DAYS_LAST_PHONE_CHANGE_RATIO"] = appl["DAYS_LAST_PHONE_CHANGE"] / appl["DAYS_BIRTH"]


##### FEATURE REMOVAL
drops = ['APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI', 
         'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI', 'LIVINGAREA_MEDI',
         'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI','YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI',
         'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'COMMONAREA_MODE','ELEVATORS_MODE', 'ENTRANCES_MODE', 
         'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE', 'LIVINGAREA_MODE', 
         'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE', 'TOTALAREA_MODE',  'YEARS_BEGINEXPLUATATION_MODE']
appl = appl.drop(columns = drops)

In [None]:
# rename features: prefix every column except the key with "app_"
# (a label-based rename; the previous positional list assumed SK_ID_CURR was
# the first column and used a substring test, `col not in "SK_ID_CURR"`)
appl = appl.rename(columns = lambda col: col if col == "SK_ID_CURR" else "app_" + str(col))

In [None]:
# check data
appl.head()

In [None]:
# count missings
nas = count_missings(appl)
nas.head()

## 4.2. CREDIT BUREAU DATA

### 4.2.1. BBAL DATA

In [None]:
# check bbal data
bbal.head()

In [None]:
### FEATURE ENGINEERING

# loan default score
# STATUS "1".."5" map to 1..5, "X" is treated as missing, anything else is 0.
# The score is built as a standalone Series, so no temporary columns are
# added to bbal and no chained assignments are needed.
status_scores = {"1": 1, "2": 2, "3": 3, "4": 4, "5": 5}
num_status = bbal["STATUS"].map(status_scores).fillna(0)
num_status = num_status.mask(bbal["STATUS"] == "X")

# weight each month's status by its recency: 1 / (|MONTHS_BALANCE| + 1)
loan_score = (num_status / (bbal["MONTHS_BALANCE"].abs() + 1)).rename("LOAN_SCORE")

# total per bureau loan (missing months are skipped by the sum);
# result has the columns SK_ID_BUREAU and LOAN_SCORE
loan_score = loan_score.groupby(bbal["SK_ID_BUREAU"]).sum().reset_index()
del num_status

# dummy encoding for STATUS
bbal = pd.get_dummies(bbal, columns = ["STATUS"], prefix = "STATUS")

In [None]:
# count missings
nas = count_missings(bbal)
nas.head()

In [None]:
### AGGREGATIONS

# total month count
cnt_mon = bbal.groupby("SK_ID_BUREAU")["MONTHS_BALANCE"].count()
del bbal["MONTHS_BALANCE"]

# aggregate data
agg_bbal = bbal.groupby("SK_ID_BUREAU").mean()

# add total month count
agg_bbal["MONTH_COUNT"] = cnt_mon

# add loan score
# Joined on the index so that agg_bbal stays indexed by SK_ID_BUREAU. A
# merge(on = "SK_ID_BUREAU") between this index level and loan_score's column
# returns the key as an ordinary column with a fresh integer index, and the
# later `agg_bbal.reset_index()` would then add a meaningless "index" column.
agg_bbal = agg_bbal.join(loan_score.set_index("SK_ID_BUREAU"), how = "left")

In [None]:
# count missings
nas = count_missings(agg_bbal)
nas.head()

In [None]:
# check data
agg_bbal.head()

In [None]:
# clear memory
del bbal

### 4.2.2. BURO DATA

In [None]:
# check buro data
buro.head()

In [None]:
### MERGE
# attach the per-loan bureau-balance aggregates to every bureau record;
# the left join keeps bureau loans that have no balance history
buro = buro.merge(right = agg_bbal.reset_index(), how = "left", on = "SK_ID_BUREAU")

In [None]:
##### FEATURE ENGINEERING

# number of buro loans per client, broadcast to every row of that client
buro["CNT_BURO_LOANS"] = buro.groupby("SK_ID_CURR")["SK_ID_BUREAU"].transform("count")

# amount ratios (computed on the raw amounts, before the log transform below)
buro["AMT_SUM_OVERDUE_RATIO_1"] = buro["AMT_CREDIT_SUM_OVERDUE"] / buro["AMT_ANNUITY"]
buro["AMT_SUM_OVERDUE_RATIO_2"] = buro["AMT_CREDIT_SUM_OVERDUE"] / buro["AMT_CREDIT_SUM"]
buro["AMT_MAX_OVERDUE_RATIO_1"] = buro["AMT_CREDIT_MAX_OVERDUE"] / buro["AMT_ANNUITY"]
buro["AMT_MAX_OVERDUE_RATIO_2"] = buro["AMT_CREDIT_MAX_OVERDUE"] / buro["AMT_CREDIT_SUM"]
buro["AMT_SUM_DEBT_RATIO_1"]    = buro["AMT_CREDIT_SUM_DEBT"] / buro["AMT_CREDIT_SUM"]
buro["AMT_SUM_DEBT_RATIO_2"]    = buro["AMT_CREDIT_SUM_DEBT"] / buro["AMT_CREDIT_SUM_LIMIT"]

# logarithms
log_vars = ["AMT_CREDIT_SUM", "AMT_CREDIT_SUM_DEBT", "AMT_CREDIT_SUM_LIMIT", "AMT_CREDIT_SUM_OVERDUE", "AMT_ANNUITY"]
buro = create_logs(buro, log_vars, replace = True)

# convert days
# NOTE(review): convert_days flips the sign and blanks negative results, so
# every positive raw value in these columns becomes missing; confirm that is
# intended for CREDIT_DAY_OVERDUE and DAYS_CREDIT_ENDDATE.
day_vars = ["DAYS_CREDIT", "CREDIT_DAY_OVERDUE", "DAYS_CREDIT_ENDDATE", "DAYS_ENDDATE_FACT", "DAYS_CREDIT_UPDATE"]
buro = convert_days(buro, day_vars, t = 1, rounding = False, replace = True)

# recency-weighted loan score
buro["WEIGHTED_LOAN_SCORE"] = buro["LOAN_SCORE"] / (buro["DAYS_CREDIT"] / 12)

# day differences
buro["DAYS_END_DIFF_1"] = buro["DAYS_ENDDATE_FACT"]   - buro["DAYS_CREDIT_ENDDATE"]
buro["DAYS_END_DIFF_2"] = buro["DAYS_CREDIT_UPDATE"]  - buro["DAYS_CREDIT_ENDDATE"]
buro["DAYS_DURATION_1"] = buro["DAYS_CREDIT_ENDDATE"] - buro["DAYS_CREDIT"]
buro["DAYS_DURATION_2"] = buro["DAYS_ENDDATE_FACT"]   - buro["DAYS_CREDIT"]

# number of active / closed / defaulted buro loans per client
# One grouped sum per status replaces three copies of the
# select-rename-filter-count-merge sequence; clients without a loan in the
# given status get 0. The result is assigned back explicitly instead of
# using `fillna(..., inplace = True)` on a column selection.
for status, cnt_name in (("Active",   "CNT_BURO_ACTIVE"),
                         ("Closed",   "CNT_BURO_CLOSED"),
                         ("Bad debt", "CNT_BURO_BAD")):
    buro[cnt_name] = ((buro["CREDIT_ACTIVE"] == status).astype(float)
                      .groupby(buro["SK_ID_CURR"]).transform("sum")
                      .fillna(0))

In [None]:
# dummy encoding for factors (drop_first = True omits each factor's first level)
buro = pd.get_dummies(buro, drop_first = True)

In [None]:
# count missings
nas = count_missings(buro)
nas.head()

In [None]:
### AGGREGATIONS

# bureau loans per client, counted before the loan id is removed
cnt_buro = buro.groupby("SK_ID_CURR")[["SK_ID_BUREAU"]].count()
del buro["SK_ID_BUREAU"]

# one row per client
agg_buro = aggregate_data(buro, id_var = "SK_ID_CURR", label = "buro")

# add buro loan count
agg_buro["buro_BURO_COUNT"] = cnt_buro

# clean up: for these features only the mean is kept
omits = ["WEIGHTED_LOAN_SCORE"]
agg_buro = agg_buro.drop(columns = ["buro_" + str(var) + stat
                                    for var in omits
                                    for stat in ("_std", "_min", "_max")])

In [None]:
# count missings
nas = count_missings(agg_buro)
nas.head()

In [None]:
# check data
agg_buro.head()

In [None]:
# clear memory
del buro

## 4.3. PREVIOUS LOAN DATA

### 4.3.1. INST DATA

In [None]:
# check inst data
inst.head()

In [None]:
### FEATURE ENGINEERING

# days past due and days before due (no negative values)
# Vectorised `where` replaces the row-wise `apply(lambda ...)`: values that
# are not strictly positive (including missing ones) become 0, as before.
days_late = inst['DAYS_ENTRY_PAYMENT'] - inst['DAYS_INSTALMENT']
days_early = inst['DAYS_INSTALMENT'] - inst['DAYS_ENTRY_PAYMENT']
inst['DPD'] = days_late.where(days_late > 0, 0)
inst['DBD'] = days_early.where(days_early > 0, 0)
del days_late, days_early

# percentage and difference paid in each installment 
inst['PAYMENT_PERC'] = inst['AMT_PAYMENT'] / inst['AMT_INSTALMENT']
inst['PAYMENT_DIFF'] = inst['AMT_INSTALMENT'] - inst['AMT_PAYMENT']

# logarithms
log_vars = ["AMT_INSTALMENT", "AMT_PAYMENT"]
inst = create_logs(inst, log_vars, replace = True)

In [None]:
# dummy encoding for factors (drop_first = True omits each factor's first level)
inst = pd.get_dummies(inst, drop_first = True)

In [None]:
# count missings
nas = count_missings(inst)
nas.head()

In [None]:
### AGGREGATIONS

# instalments recorded per previous loan
cnt_inst = inst.groupby("SK_ID_PREV")[["NUM_INSTALMENT_NUMBER"]].count()
del inst["NUM_INSTALMENT_NUMBER"]

# remember the (client, previous loan) pairs, then drop the client id so it
# is not aggregated as a feature
inst_id = inst[["SK_ID_CURR", "SK_ID_PREV"]].drop_duplicates()
del inst["SK_ID_CURR"]

# round 1: one row per previous loan
agg_inst = aggregate_data(inst, id_var = "SK_ID_PREV")

# add instalment count
agg_inst["inst_INST_COUNT"] = cnt_inst

# put the client id back and discard the loan id
agg_inst = inst_id.merge(right = agg_inst.reset_index(), how = "right", on = "SK_ID_PREV")
agg_inst = agg_inst.drop(columns = "SK_ID_PREV")

# round 2: one row per client
agg_inst = aggregate_data(agg_inst, id_var = "SK_ID_CURR", label = "inst")

In [None]:
# count missings
nas = count_missings(agg_inst)
nas.head()

In [None]:
# check data
agg_inst.head()

In [None]:
# clear memory
del inst

### 4.3.2. POCA DATA

In [None]:
# check poca data
poca.head()

In [None]:
### FEATURE ENGINEERING

# share of instalments still to be paid
poca["INSTALLMENTS_PERCENT"] = poca["CNT_INSTALMENT_FUTURE"].div(poca["CNT_INSTALMENT"])

In [None]:
# dummy encoding for factors (drop_first = True omits each factor's first level)
poca = pd.get_dummies(poca, drop_first = True)

In [None]:
# count missings
nas = count_missings(poca)
nas.head()

In [None]:
### AGGREGATIONS

# months recorded per previous loan
cnt_mon = poca.groupby("SK_ID_PREV")[["MONTHS_BALANCE"]].count()
del poca["MONTHS_BALANCE"]

# remember the (client, previous loan) pairs, then drop the client id so it
# is not aggregated as a feature
poca_id = poca[["SK_ID_CURR", "SK_ID_PREV"]].drop_duplicates()
del poca["SK_ID_CURR"]

# round 1: one row per previous loan
agg_poca = aggregate_data(poca, id_var = "SK_ID_PREV")

# add month count
agg_poca["poca_MON_COUNT"] = cnt_mon

# put the client id back and discard the loan id
agg_poca = poca_id.merge(right = agg_poca.reset_index(), how = "right", on = "SK_ID_PREV")
agg_poca = agg_poca.drop(columns = "SK_ID_PREV")

# round 2: one row per client
agg_poca = aggregate_data(agg_poca, id_var = "SK_ID_CURR", label = "poca")

In [None]:
# count missings
nas = count_missings(agg_poca)
nas.head()

In [None]:
# check data
agg_poca.head()

In [None]:
# clear memory
del poca

### 4.3.3. CARD DATA

In [None]:
# check card data
card.head()

In [None]:
### FEATURE ENGINEERING

# amount columns that are replaced by log(|x| + 1)
log_vars = [
    "AMT_BALANCE",
    "AMT_CREDIT_LIMIT_ACTUAL",
    "AMT_DRAWINGS_ATM_CURRENT",
    "AMT_DRAWINGS_CURRENT",
    "AMT_DRAWINGS_OTHER_CURRENT",
    "AMT_DRAWINGS_POS_CURRENT",
    "AMT_INST_MIN_REGULARITY",
    "AMT_PAYMENT_CURRENT",
    "AMT_PAYMENT_TOTAL_CURRENT",
    "AMT_RECEIVABLE_PRINCIPAL",
    "AMT_RECIVABLE",
    "AMT_TOTAL_RECEIVABLE",
]
card = create_logs(card, features = log_vars, replace = True)

In [None]:
# dummy encoding for factors (drop_first = True omits each factor's first level)
card = pd.get_dummies(card, drop_first = True)

In [None]:
# count missings
nas = count_missings(card)
nas.head()

In [None]:
### AGGREGATIONS

# months recorded per previous loan
cnt_mon = card.groupby("SK_ID_PREV")[["MONTHS_BALANCE"]].count()
del card["MONTHS_BALANCE"]

# remember the (client, previous loan) pairs, then drop the client id so it
# is not aggregated as a feature
card_id = card[["SK_ID_CURR", "SK_ID_PREV"]].drop_duplicates()
del card["SK_ID_CURR"]

# round 1: one row per previous loan
agg_card = aggregate_data(card, id_var = "SK_ID_PREV")

# add month count
agg_card["card_MON_COUNT"] = cnt_mon

# put the client id back and discard the loan id
agg_card = card_id.merge(right = agg_card.reset_index(), how = "right", on = "SK_ID_PREV")
agg_card = agg_card.drop(columns = "SK_ID_PREV")

# round 2: one row per client
agg_card = aggregate_data(agg_card, id_var = "SK_ID_CURR", label = "card")

In [None]:
# count missings
nas = count_missings(agg_card)
nas.head()

In [None]:
# check data
agg_card.head()

In [None]:
# clear memory
del card

### 4.3.4. PREV DATA

In [None]:
# check prev data
prev.head()

In [None]:
### FEATURE ENGINEERING

# amount ratios (computed on the raw amounts, before the log transform below)
prev["AMT_GIVEN_RATIO_1"]  = prev["AMT_CREDIT"] / prev["AMT_APPLICATION"]
prev["AMT_GIVEN_RATIO_2"]  = prev["AMT_GOODS_PRICE"] / prev["AMT_APPLICATION"]
prev["DOWN_PAYMENT_RATIO"] = prev["AMT_DOWN_PAYMENT"] / prev["AMT_APPLICATION"]

# logarithms
log_vars = ["AMT_CREDIT", "AMT_ANNUITY", "AMT_APPLICATION", "AMT_DOWN_PAYMENT", "AMT_GOODS_PRICE"]
prev = create_logs(prev, log_vars, replace = True)

# loan decision ratios
# Computed BEFORE the day conversion: compute_accept_reject_ratio negates
# DAYS_DECISION itself before sorting, and convert_days below flips the sign
# as well; calling it afterwards reversed the order in which each client's
# applications were ranked.
# NOTE(review): assumes raw DAYS_DECISION is <= 0 (days before the current
# application), which is what convert_days' handling of negatives implies.
prev = compute_accept_reject_ratio(prev, lags = [1, 3, 5])

# convert days
day_vars = ["DAYS_FIRST_DRAWING", "DAYS_FIRST_DUE", "DAYS_LAST_DUE_1ST_VERSION", 
            "DAYS_LAST_DUE", "DAYS_TERMINATION", "DAYS_DECISION"]
prev = convert_days(prev, day_vars, t = 1, rounding = False, replace = True)

# number of applications 
cnt_prev = prev[["SK_ID_CURR", "SK_ID_PREV"]].groupby(["SK_ID_CURR"], as_index = False).count()
cnt_prev.columns = ["SK_ID_CURR", "CNT_PREV_APPLICATIONS"]
prev = prev.merge(cnt_prev, how = "left", on = "SK_ID_CURR")

# number of contracts (rows flagged "Y"); clients without such a row keep a
# missing count, so the ratio below is missing for them as well
cnt_prev = prev[["SK_ID_CURR", "FLAG_LAST_APPL_PER_CONTRACT"]]
cnt_prev.columns = ["SK_ID_CURR", "CNT_PREV_CONTRACTS"]
cnt_prev = cnt_prev[cnt_prev["CNT_PREV_CONTRACTS"] == "Y"]
cnt_prev = cnt_prev[["SK_ID_CURR", "CNT_PREV_CONTRACTS"]].groupby(["SK_ID_CURR"], as_index = False).count()
prev = prev.merge(cnt_prev, how = "left", on = "SK_ID_CURR")

# number ratio
prev["APPL_PER_CONTRACT_RATIO"] = prev["CNT_PREV_APPLICATIONS"] / prev["CNT_PREV_CONTRACTS"]

# day differences
prev["DAYS_DUE_DIFF_1"] = prev["DAYS_LAST_DUE_1ST_VERSION"] - prev["DAYS_FIRST_DUE"]
prev["DAYS_DUE_DIFF_2"] = prev["DAYS_LAST_DUE"] - prev["DAYS_FIRST_DUE"]
prev["DAYS_TERMINATION_DIFF_1"] = prev["DAYS_TERMINATION"] - prev["DAYS_FIRST_DRAWING"]
prev["DAYS_TERMINATION_DIFF_2"] = prev["DAYS_TERMINATION"] - prev["DAYS_FIRST_DUE"]
prev["DAYS_TERMINATION_DIFF_3"] = prev["DAYS_TERMINATION"] - prev["DAYS_LAST_DUE"]

# application dates: weekend vs. working day, built in one vectorised step
# instead of a default value followed by a chained assignment
prev["DAY_APPR_PROCESS_START"] = np.where(
    prev["WEEKDAY_APPR_PROCESS_START"].isin(["SATURDAY", "SUNDAY"]), "Weekend", "Working day")


##### FEATURE REMOVAL
drops = ["NAME_CLIENT_TYPE", "SK_ID_PREV"]
prev = prev.drop(columns = drops)

In [None]:
# dummy encoding for factors (drop_first = True omits each factor's first level)
prev = pd.get_dummies(prev, drop_first = True)

In [None]:
# count missings
nas = count_missings(prev)
nas.head()

In [None]:
### AGGREGATIONS

# one row per client
agg_prev = aggregate_data(prev, id_var = "SK_ID_CURR", label = "prev")

# clean up: for these features only the mean is kept
omits = ["APPROVE_RATIO_1", "APPROVE_RATIO_3", "APPROVE_RATIO_5",  
         "REJECT_RATIO_1", "REJECT_RATIO_3",  "REJECT_RATIO_5", 
         "FLAG_LAST_APPL_PER_CONTRACT_Y", "CNT_PREV_CONTRACTS", "CNT_PREV_APPLICATIONS", 
         "APPL_PER_CONTRACT_RATIO"]
agg_prev = agg_prev.drop(columns = ["prev_" + str(var) + stat
                                    for var in omits
                                    for stat in ("_std", "_min", "_max")])

In [None]:
# count missings
nas = count_missings(agg_prev)
nas.head()

In [None]:
# check data
agg_prev.head()

In [None]:
# clear memory
del prev

# 5. DATA EXPORT

In [None]:
# merge data
# Each aggregate table is indexed by SK_ID_CURR; reset_index() turns the key
# back into a column for the left join. The shape is printed after every
# step so the row count can be checked, and each aggregate is deleted as soon
# as it has been merged to release memory before the next join.
print(appl.shape)
appl = appl.merge(right = agg_buro.reset_index(), how = "left", on = "SK_ID_CURR")
print(appl.shape)
del agg_buro
appl = appl.merge(right = agg_prev.reset_index(), how = "left", on = "SK_ID_CURR")
print(appl.shape)
del agg_prev
appl = appl.merge(right = agg_inst.reset_index(), how = "left", on = "SK_ID_CURR")
print(appl.shape)
del agg_inst
appl = appl.merge(right = agg_poca.reset_index(), how = "left", on = "SK_ID_CURR")
print(appl.shape)
del agg_poca
appl = appl.merge(right = agg_card.reset_index(), how = "left", on = "SK_ID_CURR")
print(appl.shape)
del agg_card

In [None]:
##### CROSS-TABLE FEATURE ENGINEERING

# credit ratios: current application amount divided by the client's mean
# amount in the previous-application / bureau tables
ratio_specs = [
    ("mix_AMT_PREV_ANNUITY_RATIO",     "app_AMT_ANNUITY",     "prev_AMT_ANNUITY_mean"),
    ("mix_AMT_PREV_CREDIT_RATIO",      "app_AMT_CREDIT",      "prev_AMT_CREDIT_mean"),
    ("mix_AMT_PREV_GOODS_PRICE_RATIO", "app_AMT_GOODS_PRICE", "prev_AMT_GOODS_PRICE_mean"),
    ("mix_AMT_BURO_ANNUITY_RATIO",     "app_AMT_ANNUITY",     "buro_AMT_ANNUITY_mean"),
    ("mix_AMT_BURO_CREDIT_RATIO",      "app_AMT_CREDIT",      "buro_AMT_CREDIT_SUM_mean"),
]
for ratio_name, numerator, denominator in ratio_specs:
    appl[ratio_name] = appl[numerator] / appl[denominator]

In [None]:
# dummy encoding for factors (drop_first = True omits each factor's first level)
appl = pd.get_dummies(appl, drop_first = True)

In [None]:
# label encoder for factors
#data_factors = [f for f in appl.columns if appl[f].dtype == "object"]
#for var in data_factors:
#    appl[var], _ = pd.factorize(appl[var])

In [None]:
# count missings
nas = count_missings(appl)
nas.head()

In [None]:
# partitioning: rows whose id has a target belong to train, the rest to test
has_target = appl["SK_ID_CURR"].isin(y["SK_ID_CURR"])
train = appl[has_target]
test  = appl[~has_target]
del appl

In [None]:
# check dimensions
print(train.shape)
print(test.shape)

In [None]:
# export CSV: the row index is not written and floats are formatted with
# eight decimal places
train.to_csv("../data/prepared/train_full_cor.csv", index = False, float_format = "%.8f")
test.to_csv("../data/prepared/test_full_cor.csv",   index = False, float_format = "%.8f")
y.to_csv("../data/prepared/y_full_cor.csv",         index = False, float_format = "%.8f")