## Dataset and approach:
Data is from Kaggle competiotion [Home Credit Default Risk](https://www.kaggle.com/c/home-credit-default-risk). 

I implement an automated feature engineering approach with an open-source library [Featuretools](https://www.featuretools.com/). 


In [2]:
import pandas as pd
import numpy as np

In [3]:
import featuretools as ft

In [4]:
# Read in the datasets  
app_train = pd.read_csv('../../Home_Credit_data/data/application_train.csv', sep=',')
app_test = pd.read_csv('../../Home_Credit_data/data/application_test.csv')
bureau = pd.read_csv('../../Home_Credit_data/data/bureau.csv')
bureau_balance = pd.read_csv('../../Home_Credit_data/data/bureau_balance.csv')
cash = pd.read_csv('../../Home_Credit_data/data/POS_CASH_balance.csv')
credit = pd.read_csv('../../Home_Credit_data/data/credit_card_balance.csv')
previous = pd.read_csv('../../Home_Credit_data/data/previous_application.csv')
installments = pd.read_csv('../../Home_Credit_data/data/installments_payments.csv')

![](../images/home_credit_data.png)

In [23]:
datasets_list = [app_train, app_test, bureau, bureau_balance, cash, credit, previous, installments]

app_train.name = 'app_train'
app_test.name = 'app_test'
bureau.name = 'bureau'
bureau_balance.name = 'bureau_balance'
cash.name = 'cash'
credit.name = 'credit'
previous.name = 'previous'
installments.name = 'installments'

In [40]:
for ds in datasets_list:
    print('{}\t - \t{} rows'.format(ds.name , ds.iloc[:, 0].count()))

app_train	 - 	307510 rows
app_test	 - 	48744 rows
bureau	 - 	1716420 rows
bureau_balance	 - 	27299925 rows
cash	 - 	10001358 rows
credit	 - 	3840312 rows
previous	 - 	1670214 rows
installments	 - 	13605401 rows


In [6]:
# replace the anomalous values
for ds in datasets_list:
    ds.replace({365243: np.nan}, inplace=True)

In [7]:
# Join train and test set to make sure, that the same feature are created for each set. 
# Later it will be separated.

In [8]:
app_test['TARGET'] = np.nan
app = app_train.append(app_test, ignore_index=True)

### Featuretools

In [9]:
# Entity set to keep track of all the data
es = ft.EntitySet(id = 'clients')

#### Variable Types

In [10]:
import featuretools.variable_types as vtypes

In [11]:
app_types = {}

In [12]:
# Boolean variables

for col in app.columns:
    if (app[col].nunique() == 2) and (app[col].dtype == float):
        app_types[col] = vtypes.Boolean
        
del app_types['TARGET']

print('Number of Boolean variables: {}'.format(len(app_types)))

Number of Boolean variables: 32


In [13]:
# Ordinal variables
app_types['REGION_RATING_CLIENT'] = vtypes.Ordinal
app_types['REGION_RATING_CLIENT_W_CITY'] = vtypes.Ordinal
app_types['HOUR_APPR_PROCESS_START'] = vtypes.Ordinal

In [14]:
previous_types = {}

for col in previous.columns:
    if ( previous[col].nunique() == 2) and (previous[col].dtype == float):
        previous_types[col] = vtypes.Boolean
        
print('Number of Boolean variables: {}'.format(len(previous_types)))

Number of Boolean variables: 2


Drop `SK_ID_CURR` in `installments`, `credit`, `cash` because I will link to these dataset through `previous` and `SK_ID_PREV`.

To avoid `featuretools` to create useless statistical aggregations of ids.

In [15]:
installments = installments.drop(columns = ['SK_ID_CURR'])
credit = credit.drop(columns = ['SK_ID_CURR'])
cash = cash.drop(columns = ['SK_ID_CURR'])

In [16]:
# Add Entities to EntitySet

es = es.entity_from_dataframe(entity_id = 'app', dataframe = app, index = 'SK_ID_CURR', variable_types=app_types)
es = es.entity_from_dataframe(entity_id = 'bureau', dataframe = bureau, index = 'SK_ID_BUREAU')
es = es.entity_from_dataframe(entity_id = 'previous', dataframe = previous, index = 'SK_ID_PREV', variable_types= previous_types )


In [75]:
## Entities without unique index. We need to add.
#es = es.entity_from_dataframe(entity_id = 'bureau_balance', dataframe = bureau_balance, 
#                              make_index = True, index = 'bureaubalance_index')

#es = es.entity_from_dataframe(entity_id = 'cash', dataframe = cash, 
#                              make_index = True, index = 'cash_index')

#es = es.entity_from_dataframe(entity_id = 'installments', dataframe = installments,
#                              make_index = True, index = 'installments_index')

#es = es.entity_from_dataframe(entity_id = 'credit', dataframe = credit,
#                              make_index = True, index = 'credit_index')

In [45]:
# Define relationship
r_app_bureau = ft.Relationship(es['app']['SK_ID_CURR'], es['bureau']['SK_ID_CURR'])

#r_bureau_balance = ft.Relationship(es['bureau']['SK_ID_BUREAU'], es['bureau_balance']['SK_ID_BUREAU'])
r_app_previous = ft.Relationship(es['app']['SK_ID_CURR'], es['previous']['SK_ID_CURR'])
#r_previous_cash = ft.Relationship(es['previous']['SK_ID_PREV'], es['cash']['SK_ID_PREV'])
#r_previous_installments = ft.Relationship(es['previous']['SK_ID_PREV'], es['installments']['SK_ID_PREV'])
#r_previous_credit = ft.Relationship(es['previous']['SK_ID_PREV'], es['credit']['SK_ID_PREV'])

In [76]:
# Add relationships to EntitySet

In [46]:
es = es.add_relationships([r_app_bureau, r_app_previous])
#es = es.add_relationships([r_bureau_balance,  r_previous_cash, r_previous_installments, r_previous_credit])


In [47]:
es

Entityset: clients
  Entities:
    app [Rows: 356255, Columns: 122]
    bureau [Rows: 1716428, Columns: 17]
    previous [Rows: 1670214, Columns: 37]
  Relationships:
    bureau.SK_ID_CURR -> app.SK_ID_CURR
    previous.SK_ID_CURR -> app.SK_ID_CURR

#### Feature primitives

In [41]:
primitives = ft.list_primitives()

In [49]:
primitives.head(5)

Unnamed: 0,name,type,description
0,max,aggregation,Finds the maximum non-null value of a numeric ...
1,num_unique,aggregation,Returns the number of unique categorical varia...
2,last,aggregation,Returns the last value.
3,min,aggregation,Finds the minimum non-null value of a numeric ...
4,time_since_last,aggregation,Time since last related instance.


In [None]:
# feature_names = ft.dfs(entityset = es, target_entity = 'app',
#                        trans_primitives = default_trans_primitives,
#                        agg_primitives=default_agg_primitives, 
#                        where_primitives = [], seed_features = [],
#                        max_depth = 2, n_jobs = -1, verbose = 1,
#                        features_only=True)

In [51]:
# Define default premitives
default_agg_features = ["sum", "std", "max", "skew", "min", "mean", "count", "percent_true", "num_unique", "mode"]
default_trans_features = ["day", "year", "month", "weekday", "haversine", "numwords", "characters"]

#Deep Feature Synthesis for app

feature_names = 
