# Libraries

In [2]:
from preprocessing import *
import os, joblib

# Data

In [3]:
# Root folder for every dataset read or written by this notebook.
STORAGE_PATH = '../data'

# Sub-folder of STORAGE_PATH holding the raw CSV files loaded below.
RAW_DATA_FOLDER = 'Projet+Mise+en+prod+-+home-credit-default-risk'
INPUT_PATH = os.path.join(STORAGE_PATH, RAW_DATA_FOLDER)

In [4]:
# Show what currently sits in the storage folder (one entry per line).
files = os.listdir(STORAGE_PATH)
separator = "-" * 36
print(f'File list\n{separator}')
for entry in files:
    print('-', entry)

File list
------------------------------------
- preprocessed_data
- data_train.f
- .DS_Store
- Projet+Mise+en+prod+-+home-credit-default-risk
- data_feature_selection.f
- data.f
- data_sample.csv
- data_sample_train.f
- X_sample.f
- data_train_sample.f
- data_sample.f


# Data for dashboard

In [5]:
# Column descriptions shipped with the raw data; the file is read as latin1
# and its 'Unnamed: 0' column is used as the index.
description_file = os.path.join(INPUT_PATH, 'HomeCredit_columns_description.csv')
description = pd.read_csv(
    description_file,
    encoding='latin1',
    index_col='Unnamed: 0',
)

In [6]:
# Raw training applications, loaded untouched for the dashboard export.
application_file = os.path.join(INPUT_PATH, 'application_train.csv')
application = pd.read_csv(application_file)

In [49]:
# Map a positional id (columns sorted alphabetically) to the column's name,
# its pandas dtype name, and the first matching text found in `description`.
feature_dict = {
    position: {
        'name': column_name,
        'type': application[column_name].dtype.name,
        'description': description.loc[
            description.Row == column_name, 'Description'
        ].iloc[0],
    }
    for position, column_name in enumerate(sorted(application.columns))
}

In [50]:
# Export the dashboard inputs: the raw application table (Feather) and the
# feature dictionary (joblib pickle).
# NOTE(review): '../api/inputs' is presumably the folder the API loads from — confirm.
API_INPUT_PATH = '../api/inputs'

# Create the target folder up front so the export also works on a fresh
# checkout; neither to_feather nor joblib.dump creates missing directories.
os.makedirs(API_INPUT_PATH, exist_ok=True)

application.to_feather(os.path.join(API_INPUT_PATH, 'application.f'))
# Kept as the last expression so the cell still displays the written file name.
joblib.dump(feature_dict, os.path.join(API_INPUT_PATH, 'feature_dict.p'))

['../api/inputs/feature_dict.p']

# Data for model

## Preprocessing

In [15]:
# Folder receiving the per-table aggregate CSVs written by the cells below.
# NOTE(review): the storage listing recorded earlier shows a 'preprocessed_data'
# folder but no 'preprocess_input' — confirm which name the merge step expects.
OUTPUT_PATH = os.path.join(STORAGE_PATH, 'preprocess_input')

# DataFrame.to_csv does not create missing directories, so make sure the
# folder exists before any aggregate is written.
os.makedirs(OUTPUT_PATH, exist_ok=True)

In [5]:
# Run preprocess_bureau_and_balance (from `preprocessing`), write its result
# to CSV in OUTPUT_PATH, then drop the frame and collect garbage.
bureau_csv_path = os.path.join(OUTPUT_PATH, 'bureau_agg.csv')
bureau_agg = preprocess_bureau_and_balance()
bureau_agg.to_csv(bureau_csv_path)
del bureau_agg
gc.collect()

In [7]:
# Run preprocess_previous_applications (from `preprocessing`), write its
# result to CSV in OUTPUT_PATH, then drop the frame and collect garbage.
prev_csv_path = os.path.join(OUTPUT_PATH, 'prev_agg.csv')
prev_agg = preprocess_previous_applications()
prev_agg.to_csv(prev_csv_path)
del prev_agg
gc.collect()

In [8]:
# Run preprocess_pos_cash (from `preprocessing`), write its result to CSV in
# OUTPUT_PATH, then drop the frame and collect garbage.
pos_csv_path = os.path.join(OUTPUT_PATH, 'pos_agg.csv')
pos_agg = preprocess_pos_cash()
pos_agg.to_csv(pos_csv_path)
del pos_agg
gc.collect()

In [12]:
# Run preprocess_installments_payments (from `preprocessing`), write its
# result to CSV in OUTPUT_PATH, then drop the frame and collect garbage.
ins_csv_path = os.path.join(OUTPUT_PATH, 'ins_agg.csv')
ins_agg = preprocess_installments_payments()
ins_agg.to_csv(ins_csv_path)
del ins_agg
gc.collect()

In [11]:
# Run preprocess_credit_card_balance (from `preprocessing`), write its result
# to CSV in OUTPUT_PATH, then drop the frame and collect garbage.
cc_csv_path = os.path.join(OUTPUT_PATH, 'cc_agg.csv')
cc_agg = preprocess_credit_card_balance()
cc_agg.to_csv(cc_csv_path)
del cc_agg
gc.collect()

In [20]:
# Report the shape of every saved aggregate.
# The in-memory frames were deleted right after being written to disk, so
# referencing them here raises NameError on a fresh kernel; read each shape
# back from the CSV saved in OUTPUT_PATH instead.
# NOTE(review): index_col=0 assumes each aggregate was written with a
# single-level index (to_csv default) — confirm, otherwise the column count is
# off by the number of extra index levels.
for agg_name in ("ins_agg", "cc_agg", "pos_agg", "prev_agg", "bureau_agg"):
    saved_agg = pd.read_csv(os.path.join(OUTPUT_PATH, f"{agg_name}.csv"), index_col=0)
    print(f"{agg_name}:", saved_agg.shape)
    # Release each frame before loading the next to keep peak memory low.
    del saved_agg

ins_agg: (339587, 26)
cc_agg: (103558, 141)
pos_agg: (337252, 18)
prev_agg: (338857, 249)
bureau_agg: (305811, 116)


In [3]:
# Load the raw training applications and run the application-level
# preprocessing (preprocess_application comes from `preprocessing`).
# The CSV lives in INPUT_PATH (the raw-data sub-folder), not directly in
# STORAGE_PATH: the storage listing recorded above has no application_train.csv
# and the dashboard section reads the same file from INPUT_PATH.
application_train = pd.read_csv(os.path.join(INPUT_PATH, 'application_train.csv'))
application_train_preprocess = preprocess_application(application_train)

## Merge files 

In [5]:
# Merge the preprocessed applications with the per-table aggregates.
# NOTE(review): merge_preprocess_files comes from `preprocessing`; presumably it
# reads the *_agg.csv files written above — confirm which directory it expects.
# The recorded output shows the row count staying at 307507 after every merge
# while the column count grows from 362 to 796.
data = merge_preprocess_files(application_train_preprocess)

Shape after merge with bureau_agg: (307507, 362)
Shape after merge with prev_agg: (307507, 611)
Shape after merge with pos_agg: (307507, 629)
Shape after merge with ins_agg: (307507, 655)
Shape after merge with cc_agg: (307507, 796)


## Save

In [4]:
# Persist the merged dataset as a Feather file in the storage folder.
merged_data_file = os.path.join(STORAGE_PATH, 'data.f')
data.to_feather(merged_data_file)