# 1. SETTINGS

In [1]:
# libraries
import pandas as pd
import numpy as np
import scipy.stats
import gc

In [2]:
# garbage collection
gc.enable()

# 2. DATA PREPARATION

## 2.1. IMPORT

In [3]:
# import data
train = pd.read_csv("../data/raw/application_train.csv")
test  = pd.read_csv("../data/raw/application_test.csv")
prev  = pd.read_csv("../data/raw/previous_application.csv")
buro  = pd.read_csv("../data/raw/bureau.csv")
bbal  = pd.read_csv("../data/raw/bureau_balance.csv")
card  = pd.read_csv("../data/raw/credit_card_balance.csv")
poca  = pd.read_csv("../data/raw/POS_CASH_balance.csv")
inst  = pd.read_csv("../data/raw/installments_payments.csv")

In [4]:
# check dimensions
print(train.shape)
print(test.shape)
print(prev.shape)
print(buro.shape)
print(bbal.shape)
print(card.shape)
print(poca.shape)
print(inst.shape)

(307511, 122)
(48744, 121)
(1670214, 37)
(1716428, 17)
(27299925, 3)
(3840312, 23)
(10001358, 8)
(13605401, 8)


## 2.2. PREPROCESSING

In [5]:
# extract target
y = train["TARGET"]
del train["TARGET"]

In [6]:
# rename basic features
train.columns = ["SK_ID_CURR"] + ["base_" + str(col) for col in train.columns if col not in "SK_ID_CURR"]
test.columns  = ["SK_ID_CURR"] + ["base_" + str(col) for col in test.columns  if col not in "SK_ID_CURR"]

In [7]:
##### CONVERT FACTOR FEATURES

### train and test
factors = [f for f in train.columns if train[f].dtype == "object"]
for f in factors:
    train[f], indexer = pd.factorize(train[f])
    test[f] = indexer.get_indexer(test[f])
    
### prev
prev_factors = [f for f in prev.columns if prev[f].dtype == "object"]
for f in prev_factors:
    prev[f], _ = pd.factorize(prev[f])
    
### buro
buro_factors = [f for f in buro.columns if buro[f].dtype == "object"]
for f in buro_factors:
    buro[f], _ = pd.factorize(buro[f])
    
### bbal
bbal_factors = [f for f in bbal.columns if bbal[f].dtype == "object"]
for f in bbal_factors:
    bbal[f], _ = pd.factorize(bbal[f])
    
### card
card_factors = [f for f in card.columns if card[f].dtype == "object"]
for f in card_factors:
    card[f], _ = pd.factorize(card[f])
    
### poca
poca_factors = [f for f in poca.columns if poca[f].dtype == "object"]
for f in poca_factors:
    poca[f], _ = pd.factorize(poca[f])
    
### inst
inst_factors = [f for f in inst.columns if inst[f].dtype == "object"]
for f in inst_factors:
    inst[f], _ = pd.factorize(inst[f])

## 2.3. AGGREGATIONS

In [8]:
##### AGGREGATE FEATURES: MEANS

### prev
avg_prev = prev[list(set(prev.columns) - set(prev_factors))]
avg_prev = avg_prev.groupby("SK_ID_CURR").mean()
cnt_prev = prev[["SK_ID_CURR", "SK_ID_PREV"]].groupby("SK_ID_CURR").count()
avg_prev["app_count"] = cnt_prev["SK_ID_PREV"]
del avg_prev["SK_ID_PREV"]
avg_prev.columns = ["prev_" + str(col) + "_mean" for col in avg_prev.columns]
print(avg_prev.shape)

### card
avg_card = card[list(set(card.columns) - set(card_factors))]
avg_card = avg_card.groupby("SK_ID_CURR").mean()
del avg_card["SK_ID_PREV"]
avg_card.columns = ["card_" + str(col) + "_mean" for col in avg_card.columns]
print(avg_card.shape)

### poca
avg_poca = poca[list(set(poca.columns) - set(poca_factors))]
avg_poca = avg_poca.groupby("SK_ID_CURR").mean()
del avg_poca["SK_ID_PREV"]
avg_poca.columns = ["poca_" + str(col) + "_mean" for col in avg_poca.columns]
print(avg_poca.shape)

### inst
avg_inst = inst[list(set(inst.columns) - set(inst_factors))]
avg_inst = avg_inst.groupby("SK_ID_CURR").mean()
del avg_inst["SK_ID_PREV"]
avg_inst.columns = ["inst_" + str(col) + "_mean" for col in avg_inst.columns]
print(avg_inst.shape)

### buro + bbal
avg_buro = buro[list(set(buro.columns) - set(buro_factors))]
avg_buro = avg_buro.groupby("SK_ID_CURR").mean()
avg_buro["buro_count"] = buro[["SK_ID_BUREAU","SK_ID_CURR"]].groupby("SK_ID_CURR").count()["SK_ID_BUREAU"]
avg_buro["SK_ID_CURR"] = avg_buro.index
avg_bbal = bbal[list(set(bbal.columns) - set(bbal_factors))]
avg_bbal = bbal.groupby("SK_ID_BUREAU").mean()
avg_buro = avg_buro.merge(right = avg_bbal.reset_index(), how = "left", on = "SK_ID_BUREAU")
del avg_buro["SK_ID_BUREAU"]
avg_buro = avg_buro[["SK_ID_CURR"] + [col for col in avg_buro.columns if col not in "SK_ID_CURR"]]
avg_buro.columns = ["SK_ID_CURR"] + ["buro_" + str(col) + "_mean" for col in avg_buro.columns if col not in "SK_ID_CURR"]
print(avg_buro.shape)

(338857, 20)
(103558, 20)
(337252, 5)
(339587, 6)
(305811, 16)


In [9]:
##### AGGREGATE FEATURES: SDS

### prev
std_prev = prev[list(set(prev.columns) - set(prev_factors))]
std_prev = std_prev.groupby("SK_ID_CURR").std()
missings = std_prev.isnull().sum()
nan_vars = list(missings[missings > 0].index)
for var in nan_vars:
    std_prev[var] = std_prev[var].fillna(0)
del std_prev["SK_ID_PREV"]
std_prev.columns = ["prev_" + str(col) + "_sd" for col in std_prev.columns]
print(std_prev.shape)

### card
std_card = card[list(set(card.columns) - set(card_factors))]
std_card = std_card.groupby("SK_ID_CURR").std()
missings = std_card.isnull().sum()
nan_vars = list(missings[missings > 0].index)
for var in nan_vars:
    std_card[var] = std_card[var].fillna(0)
del std_card["SK_ID_PREV"]
std_card.columns = ["card_" + str(col) + "_sd" for col in std_card.columns]
print(std_card.shape)

### poca
std_poca = poca[list(set(poca.columns) - set(poca_factors))]
std_poca = std_poca.groupby("SK_ID_CURR").std()
missings = std_poca.isnull().sum()
nan_vars = list(missings[missings > 0].index)
for var in nan_vars:
    std_poca[var] = std_poca[var].fillna(0)
del std_poca["SK_ID_PREV"]
std_poca.columns = ["poca_" + str(col) + "_sd" for col in std_poca.columns]
print(std_poca.shape)

### inst
std_inst = inst[list(set(inst.columns) - set(inst_factors))]
std_inst = std_inst.groupby("SK_ID_CURR").std()
missings = std_inst.isnull().sum()
nan_vars = list(missings[missings > 0].index)
for var in nan_vars:
    std_inst[var] = std_inst[var].fillna(0)
del std_inst["SK_ID_PREV"]
std_inst.columns = ["inst_" + str(col) + "_sd" for col in std_inst.columns]
print(std_inst.shape)

### buro + bbal
std_buro = buro[list(set(buro.columns) - set(buro_factors))]
std_buro = std_buro.groupby("SK_ID_CURR").std()
std_buro["SK_ID_CURR"] = std_buro.index
std_bbal = bbal[list(set(bbal.columns) - set(bbal_factors))]
std_bbal = std_bbal.groupby("SK_ID_BUREAU").std()
std_buro = std_buro.merge(right = std_bbal.reset_index(), how = "left", on = "SK_ID_BUREAU")
del std_buro["SK_ID_BUREAU"]
std_buro = std_buro[["SK_ID_CURR"] + [col for col in std_buro.columns if col not in "SK_ID_CURR"]]
std_buro.columns = ["SK_ID_CURR"] + ["buro_" + str(col) + "_sd" for col in std_buro.columns if col not in "SK_ID_CURR"]
print(std_buro.shape)

(338857, 19)
(103558, 20)
(337252, 5)
(339587, 6)
(305811, 14)


In [10]:
##### AGGREGATE FEATURES: MINS

### prev
min_prev = prev[list(set(prev.columns) - set(prev_factors))]
min_prev = min_prev.groupby("SK_ID_CURR").min()
missings = min_prev.isnull().sum()
nan_vars = list(missings[missings > 0].index)
for var in nan_vars:
    min_prev[var] = min_prev[var].fillna(0)
del min_prev["SK_ID_PREV"]
min_prev.columns = ["prev_" + str(col) + "_min" for col in min_prev.columns]
print(min_prev.shape)

### card
min_card = card[list(set(card.columns) - set(card_factors))]
min_card = min_card.groupby("SK_ID_CURR").min()
missings = min_card.isnull().sum()
nan_vars = list(missings[missings > 0].index)
for var in nan_vars:
    min_card[var] = min_card[var].fillna(0)
del min_card["SK_ID_PREV"]
min_card.columns = ["card_" + str(col) + "_min" for col in min_card.columns]
print(min_card.shape)

### poca
min_poca = poca[list(set(poca.columns) - set(poca_factors))]
min_poca = min_poca.groupby("SK_ID_CURR").min()
missings = min_poca.isnull().sum()
nan_vars = list(missings[missings > 0].index)
for var in nan_vars:
    min_poca[var] = min_poca[var].fillna(0)
del min_poca["SK_ID_PREV"]
min_poca.columns = ["poca_" + str(col) + "_min" for col in min_poca.columns]
print(min_poca.shape)

### inst
min_inst = inst[list(set(inst.columns) - set(inst_factors))]
min_inst = min_inst.groupby("SK_ID_CURR").min()
missings = min_inst.isnull().sum()
nan_vars = list(missings[missings > 0].index)
for var in nan_vars:
    min_inst[var] = min_inst[var].fillna(0)
del min_inst["SK_ID_PREV"]
min_inst.columns = ["inst_" + str(col) + "_min" for col in min_inst.columns]
print(min_inst.shape)

### buro + bbal
min_buro = buro[list(set(buro.columns) - set(buro_factors))]
min_buro = min_buro.groupby("SK_ID_CURR").min()
min_buro["SK_ID_CURR"] = min_buro.index
min_bbal = bbal[list(set(bbal.columns) - set(bbal_factors))]
min_bbal = min_bbal.groupby("SK_ID_BUREAU").min()
min_buro = min_buro.merge(right = min_bbal.reset_index(), how = "left", on = "SK_ID_BUREAU")
del min_buro["SK_ID_BUREAU"]
min_buro = min_buro[["SK_ID_CURR"] + [col for col in min_buro.columns if col not in "SK_ID_CURR"]]
min_buro.columns = ["SK_ID_CURR"] + ["buro_" + str(col) + "_min" for col in min_buro.columns if col not in "SK_ID_CURR"]
print(min_buro.shape)

(338857, 19)
(103558, 20)
(337252, 5)
(339587, 6)
(305811, 14)


In [11]:
##### AGGREGATE FEATURES: MAXS

### prev
max_prev = prev[list(set(prev.columns) - set(prev_factors))]
max_prev = max_prev.groupby("SK_ID_CURR").max()
missings = max_prev.isnull().sum()
nan_vars = list(missings[missings > 0].index)
for var in nan_vars:
    max_prev[var] = max_prev[var].fillna(0)
del max_prev["SK_ID_PREV"]
max_prev.columns = ["prev_" + str(col) + "_max" for col in max_prev.columns]
print(max_prev.shape)

### card
max_card = card[list(set(card.columns) - set(card_factors))]
max_card = max_card.groupby("SK_ID_CURR").max()
missings = max_card.isnull().sum()
nan_vars = list(missings[missings > 0].index)
for var in nan_vars:
    max_card[var] = max_card[var].fillna(0)
del max_card["SK_ID_PREV"]
max_card.columns = ["card_" + str(col) + "_max" for col in max_card.columns]
print(max_card.shape)

### poca
max_poca = poca[list(set(poca.columns) - set(poca_factors))]
max_poca = max_poca.groupby("SK_ID_CURR").max()
missings = max_poca.isnull().sum()
nan_vars = list(missings[missings > 0].index)
for var in nan_vars:
    max_poca[var] = max_poca[var].fillna(0)
del max_poca["SK_ID_PREV"]
max_poca.columns = ["poca_" + str(col) + "_max" for col in max_poca.columns]
print(max_poca.shape)

### inst
max_inst = inst[list(set(inst.columns) - set(inst_factors))]
max_inst = max_inst.groupby("SK_ID_CURR").max()
missings = max_inst.isnull().sum()
nan_vars = list(missings[missings > 0].index)
for var in nan_vars:
    max_inst[var] = max_inst[var].fillna(0)
del max_inst["SK_ID_PREV"]
max_inst.columns = ["inst_" + str(col) + "_max" for col in max_inst.columns]
print(max_inst.shape)

### buro + bbal
max_buro = buro[list(set(buro.columns) - set(buro_factors))]
max_buro = max_buro.groupby("SK_ID_CURR").max()
max_buro["SK_ID_CURR"] = max_buro.index
max_bbal = bbal[list(set(bbal.columns) - set(bbal_factors))]
max_bbal = max_bbal.groupby("SK_ID_BUREAU").max()
max_buro = max_buro.merge(right = max_bbal.reset_index(), how = "left", on = "SK_ID_BUREAU")
del max_buro["SK_ID_BUREAU"]
max_buro = max_buro[["SK_ID_CURR"] + [col for col in max_buro.columns if col not in "SK_ID_CURR"]]
max_buro.columns = ["SK_ID_CURR"] + ["buro_" + str(col) + "_max" for col in max_buro.columns if col not in "SK_ID_CURR"]
print(max_buro.shape)

(338857, 19)
(103558, 20)
(337252, 5)
(339587, 6)
(305811, 14)


In [12]:
##### AGGREGATE FEATURES: MODES

### prev
mod_prev = prev[prev_factors + ["SK_ID_CURR"]]
mod_prev = mod_prev.groupby("SK_ID_CURR").agg(lambda x: scipy.stats.mode(x)[0][0]).reset_index()
mod_prev.columns = ["SK_ID_CURR"] + ["prev_" + str(col) + "_mode" for col in mod_prev.columns if col not in "SK_ID_CURR"]
print(mod_prev.shape)

### card
mod_card = card[card_factors + ["SK_ID_CURR"]]
mod_card = mod_card.groupby("SK_ID_CURR").agg(lambda x: scipy.stats.mode(x)[0][0]).reset_index()
mod_card.columns = ["SK_ID_CURR"] + ["card_" + str(col) + "_mode" for col in mod_card.columns if col not in "SK_ID_CURR"]
print(mod_card.shape)

### poca
mod_poca = poca[poca_factors + ["SK_ID_CURR"]]
mod_poca = mod_poca.groupby("SK_ID_CURR").agg(lambda x: scipy.stats.mode(x)[0][0]).reset_index()
mod_poca.columns = ["SK_ID_CURR"] + ["poca_" + str(col) + "_mode" for col in mod_poca.columns if col not in "SK_ID_CURR"]
print(mod_poca.shape)

### inst
mod_inst = inst[inst_factors + ["SK_ID_CURR"]]
mod_inst = mod_inst.groupby("SK_ID_CURR").agg(lambda x: scipy.stats.mode(x)[0][0]).reset_index()
mod_inst.columns = ["SK_ID_CURR"] + ["inst_" + str(col) + "_mode" for col in mod_inst.columns if col not in "SK_ID_CURR"]
print(mod_inst.shape)

### buro + bbal
mod_buro = buro[buro_factors + ["SK_ID_CURR", "SK_ID_BUREAU"]]
mod_buro = mod_buro.groupby("SK_ID_CURR").agg(lambda x: scipy.stats.mode(x)[0][0]).reset_index()
mod_bbal = bbal[bbal_factors + ["SK_ID_BUREAU"]]
mod_bbal = mod_bbal.groupby("SK_ID_BUREAU").agg(lambda x: scipy.stats.mode(x)[0][0]).reset_index()
mod_buro = mod_buro.merge(right = mod_bbal.reset_index(), how = "left", on = "SK_ID_BUREAU")
del mod_buro["SK_ID_BUREAU"]
mod_buro.columns = ["SK_ID_CURR"] + ["buro_" + str(col) + "_mode" for col in mod_buro.columns if col not in "SK_ID_CURR"]
print(mod_buro.shape)

(338857, 17)
(103558, 2)
(337252, 2)
(339587, 1)
(305811, 6)


## 2.4. MERGING DATA

In [13]:
### DESCRIPTIVE STATS

# prev
all_prev = mod_prev.merge(right = avg_prev.reset_index(), how = "left", on = "SK_ID_CURR")
all_prev = all_prev.merge(right = std_prev.reset_index(), how = "left", on = "SK_ID_CURR")
all_prev = all_prev.merge(right = min_prev.reset_index(), how = "left", on = "SK_ID_CURR")
all_prev = all_prev.merge(right = max_prev.reset_index(), how = "left", on = "SK_ID_CURR")

# card
all_card = mod_card.merge(right = avg_card.reset_index(), how = "left", on = "SK_ID_CURR")
all_card = all_card.merge(right = std_card.reset_index(), how = "left", on = "SK_ID_CURR")
all_card = all_card.merge(right = min_card.reset_index(), how = "left", on = "SK_ID_CURR")
all_card = all_card.merge(right = max_card.reset_index(), how = "left", on = "SK_ID_CURR")

# poca
all_poca = mod_poca.merge(right = avg_poca.reset_index(), how = "left", on = "SK_ID_CURR")
all_poca = all_poca.merge(right = std_poca.reset_index(), how = "left", on = "SK_ID_CURR")
all_poca = all_poca.merge(right = min_poca.reset_index(), how = "left", on = "SK_ID_CURR")
all_poca = all_poca.merge(right = max_poca.reset_index(), how = "left", on = "SK_ID_CURR")

# inst
all_inst = mod_inst.merge(right = avg_inst.reset_index(), how = "left", on = "SK_ID_CURR")
all_inst = all_inst.merge(right = std_inst.reset_index(), how = "left", on = "SK_ID_CURR")
all_inst = all_inst.merge(right = min_inst.reset_index(), how = "left", on = "SK_ID_CURR")
all_inst = all_inst.merge(right = max_inst.reset_index(), how = "left", on = "SK_ID_CURR")

# buro
all_buro = mod_buro.merge(right = avg_buro, how = "left", on = "SK_ID_CURR")
all_buro = all_buro.merge(right = std_buro, how = "left", on = "SK_ID_CURR")
all_buro = all_buro.merge(right = min_buro, how = "left", on = "SK_ID_CURR")
all_buro = all_buro.merge(right = max_buro, how = "left", on = "SK_ID_CURR")

In [14]:
### TRAIN AND TEST

# train
train = train.merge(right = all_prev, how = "left", on = "SK_ID_CURR")
train = train.merge(right = all_card, how = "left", on = "SK_ID_CURR")
train = train.merge(right = all_poca, how = "left", on = "SK_ID_CURR")
train = train.merge(right = all_inst, how = "left", on = "SK_ID_CURR")
train = train.merge(right = all_buro, how = "left", on = "SK_ID_CURR")

# test
test = test.merge(right = all_prev, how = "left", on = "SK_ID_CURR")
test = test.merge(right = all_card, how = "left", on = "SK_ID_CURR")
test = test.merge(right = all_poca, how = "left", on = "SK_ID_CURR")
test = test.merge(right = all_inst, how = "left", on = "SK_ID_CURR")
test = test.merge(right = all_buro, how = "left", on = "SK_ID_CURR")

## 2.5. EXPORT

In [15]:
# check dimensions
print(train.shape)
print(test.shape)

(307511, 399)
(48744, 399)


In [16]:
# export CSV
train.to_csv("../data/prepared/train.csv", index = False, float_format = "%.8f")
test.to_csv("../data/prepared/test.csv",   index = False, float_format = "%.8f")
y.to_csv("../data/prepared/y.csv",         index = False, float_format = "%.8f")