# Automated Feature Engineering
#### See https://www.kaggle.com/willkoehrsen/automated-feature-engineering-tutorial

In [None]:
# Required libraries
import pandas as pd
import numpy as np
import featuretools as ft
import warnings
# Suppress every warning for the rest of the session (presumably to keep the
# cell output uncluttered).
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# pandas / featuretools, so API drift will go unnoticed while it is active.
warnings.filterwarnings('ignore')

In [None]:
# Load the three related tables (parent -< child):
#   clients -< loans -< payments
# Date columns are converted to datetimes at read time.
clients = pd.read_csv(
    '../input/clients.csv',
    parse_dates=['joined'],
)
loans = pd.read_csv(
    '../input/loans.csv',
    parse_dates=['loan_start', 'loan_end'],
)
payments = pd.read_csv(
    '../input/payments.csv',
    parse_dates=['payment_date'],
)

In [3]:
# Roll up payments to loan_id, then merge the per-loan totals into loans.
# The columns are selected with a list (double brackets): indexing a GroupBy
# with a bare tuple of names is deprecated and is rejected by pandas 2.x.
stats = payments.groupby('loan_id')[['payment_amount', 'missed']].sum()
# Rename the totals so they read naturally once attached to a loan row.
stats.columns = ['payment', 'missed']

# stats is indexed by loan_id, hence right_index=True. The left join keeps
# every loan, including any loan that has no payments (its totals are NaN).
loans2 = loans.merge(stats, left_on='loan_id', right_index=True, how='left')
loans2.head()

Unnamed: 0,client_id,loan_type,loan_amount,repaid,loan_id,loan_start,loan_end,rate,payment,missed
0,46109,home,13672,0,10243,2002-04-16,2003-12-20,2.15,14008,4
1,46109,credit,9794,0,10984,2003-10-21,2005-07-17,1.25,7791,3
2,46109,home,12734,1,10990,2006-02-01,2007-07-05,0.68,12178,2
3,46109,cash,12518,1,10596,2010-12-08,2013-05-05,1.24,7681,4
4,46109,credit,14049,1,11415,2010-07-07,2012-05-21,3.13,16052,4


In [4]:
# Now roll up to client_id and merge the per-client totals into clients.
# The columns are selected with a list (double brackets): indexing a GroupBy
# with a bare tuple of names is deprecated and is rejected by pandas 2.x.
# .sum() keeps the original column names (loan_amount, payment, missed), so
# no renaming is needed afterwards.
stats = loans2.groupby('client_id')[['loan_amount', 'payment', 'missed']].sum()

# stats is indexed by client_id, hence right_index=True. The left join keeps
# every client, including any client without loans (their totals are NaN).
clients2 = clients.merge(stats, left_on='client_id', right_index=True, how='left')

# Then add some features manually
clients2['join_month'] = clients2['joined'].dt.month
# NOTE(review): np.log assumes income > 0; zero gives -inf, negatives give NaN.
clients2['log_income'] = np.log(clients2['income'])
clients2.head()
# 5 new features from a handful of hand-written lines

Unnamed: 0,client_id,joined,income,credit_score,loan_amount,payment,missed,join_month,log_income
0,46109,2002-04-16,172677,527,179032,193954,68,4,12.059178
1,49545,2007-11-14,104564,770,205786,282907,111,11,11.557555
2,41480,2013-03-11,122607,585,157897,188223,68,3,11.716739
3,46180,2001-11-06,43851,562,154017,176796,62,11,10.688553
4,25707,2006-10-06,211422,621,159279,189747,85,10,12.261611


## OK, let's use featuretools

In [5]:
# In featuretools an entity is simply a dataframe plus metadata, and an
# EntitySet is a named collection of such entities.
es = ft.EntitySet(id='myentityset')

# clients: keyed by client_id, time-indexed by the join date.
es = es.entity_from_dataframe(
    entity_id='clients',
    dataframe=clients,
    index='client_id',
    time_index='joined',
)

# loans: keyed by loan_id; 'repaid' is declared categorical explicitly.
es = es.entity_from_dataframe(
    entity_id='loans',
    dataframe=loans,
    index='loan_id',
    time_index='loan_start',
    variable_types={'repaid': ft.variable_types.Categorical},
)

# payments: the table has no payment_id column, so make_index=True asks
# featuretools to create one; 'missed' is declared categorical explicitly.
es = es.entity_from_dataframe(
    entity_id='payments',
    dataframe=payments,
    make_index=True,
    index='payment_id',
    time_index='payment_date',
    variable_types={'missed': ft.variable_types.Categorical},
)

In [6]:
# Right, let's have a look. It's just metadata in a dict
es['loans']

Entity: loans
  Variables:
    loan_id (dtype: index)
    client_id (dtype: numeric)
    loan_type (dtype: categorical)
    loan_amount (dtype: numeric)
    loan_start (dtype: datetime_time_index)
    loan_end (dtype: datetime)
    rate (dtype: numeric)
    repaid (dtype: categorical)
  Shape:
    (Rows: 443, Columns: 8)

In [7]:
# Tell featuretools how the entities are linked, using its parent/child
# metaphor: each Relationship pairs the parent's key with the matching
# column in the child.
r_c_l = ft.Relationship(es['clients']['client_id'], es['loans']['client_id'])
r_l_p = ft.Relationship(es['loans']['loan_id'], es['payments']['loan_id'])

# Register both links on the entity set (clients -< loans -< payments).
es = es.add_relationship(r_c_l).add_relationship(r_l_p)

In [8]:
# Another look at the whole lot: entities plus the relationships added above
es

Entityset: myentityset
  Entities:
    clients [Rows: 25, Columns: 4]
    loans [Rows: 443, Columns: 8]
    payments [Rows: 3456, Columns: 5]
  Relationships:
    loans.client_id -> clients.client_id
    payments.loan_id -> loans.loan_id

In [16]:
# OK, now let's make some features. Start with the building blocks:
# featuretools primitives are either aggregations or transformations.
primitives = ft.list_primitives()

# Widen the column display so the descriptions are not cut short.
pd.set_option('display.max_colwidth', 100)

# The aggregation primitives first.
primitives.loc[primitives['type'] == 'aggregation'].head(20)

Unnamed: 0,name,type,description
0,median,aggregation,Finds the median value of any feature with well-ordered values.
1,time_since_last,aggregation,Time since last related instance.
2,avg_time_between,aggregation,Computes the average time between consecutive events.
3,num_true,aggregation,Finds the number of 'True' values in a boolean.
4,all,aggregation,Test if all values are 'True'.
5,mode,aggregation,Finds the most common element in a categorical feature.
6,n_most_common,aggregation,Finds the N most common elements in a categorical feature.
7,num_unique,aggregation,Returns the number of unique categorical variables.
8,any,aggregation,Test if any value is 'True'.
9,mean,aggregation,Computes the average value of a numeric feature.


In [17]:
# And the transform primitives
primitives.loc[primitives['type'] == 'transform'].head(20)

Unnamed: 0,name,type,description
19,isin,transform,"For each value of the base feature, checks whether it is in a provided list."
20,time_since_previous,transform,Compute the time since the previous instance.
21,years,transform,Transform a Timedelta feature into the number of years.
22,mod,transform,Creates a transform feature that divides two features.
23,months,transform,Transform a Timedelta feature into the number of months.
24,multiply,transform,Creates a transform feature that multplies two features.
25,weeks,transform,Transform a Timedelta feature into the number of weeks.
26,and,transform,"For two boolean values, determine if both values are 'True'."
27,add,transform,Creates a transform feature that adds two features.
28,second,transform,Transform a Datetime feature into the second.


In [18]:
# OK, let's do it! Run deep feature synthesis to build features for each
# client from the chosen aggregation and transform primitives.
features, feature_names = ft.dfs(
    entityset=es,
    target_entity='clients',
    agg_primitives=[
        'median', 'mean', 'std', 'max',
        'percent_true', 'last', 'time_since_last',
    ],
    trans_primitives=['years', 'month', 'divide'],
)

In [20]:
# Wow! 408  features in 4 lines of code (140 new ones)
# I'm already sold on this!
# feature_names is the second value returned by ft.dfs; its length is the
# number of features that were generated.
len(feature_names)

408

In [21]:
# List the generated feature definitions (nested names show stacked primitives)
feature_names

[<Feature: income>,
 <Feature: credit_score>,
 <Feature: MEDIAN(loans.loan_amount)>,
 <Feature: MEDIAN(loans.rate)>,
 <Feature: MEAN(loans.loan_amount)>,
 <Feature: MEAN(loans.rate)>,
 <Feature: STD(loans.loan_amount)>,
 <Feature: STD(loans.rate)>,
 <Feature: MAX(loans.loan_amount)>,
 <Feature: MAX(loans.rate)>,
 <Feature: LAST(loans.loan_type)>,
 <Feature: LAST(loans.loan_amount)>,
 <Feature: LAST(loans.rate)>,
 <Feature: LAST(loans.repaid)>,
 <Feature: TIME_SINCE_LAST(loans.loan_start)>,
 <Feature: MEDIAN(payments.payment_amount)>,
 <Feature: MEAN(payments.payment_amount)>,
 <Feature: STD(payments.payment_amount)>,
 <Feature: MAX(payments.payment_amount)>,
 <Feature: LAST(payments.payment_amount)>,
 <Feature: LAST(payments.missed)>,
 <Feature: TIME_SINCE_LAST(payments.payment_date)>,
 <Feature: MONTH(joined)>,
 <Feature: income / credit_score>,
 <Feature: credit_score / income>,
 <Feature: MEDIAN(loans.MEDIAN(payments.payment_amount))>,
 <Feature: MEDIAN(loans.MEAN(payments.payment_a

In [13]:
# Some interesting column names!
# This one has depth=1, i.e. a single layer of primitives: the mean payment
# amount per client.
features['MEAN(payments.payment_amount)'].head().to_frame()

Unnamed: 0_level_0,MEAN(payments.payment_amount)
client_id,Unnamed: 1_level_1
25707,1178.552795
26326,1166.736842
26695,1207.433824
26945,1109.473214
29841,1439.433333


In [14]:
# A depth=2 feature: two primitives stacked (LAST over loans of the MEAN over
# each loan's payments).
features['LAST(loans.MEAN(payments.payment_amount))'].head().to_frame()

Unnamed: 0_level_0,LAST(loans.MEAN(payments.payment_amount))
client_id,Unnamed: 1_level_1
25707,293.5
26326,977.375
26695,1769.166667
26945,1598.666667
29841,1125.5


In [27]:
# Same primitives as before, but allowing stacks up to three deep:
# the number of features increases to 3530 with depth=3.
features, feature_names = ft.dfs(
    entityset=es,
    target_entity='clients',
    agg_primitives=[
        'median', 'mean', 'std', 'max',
        'percent_true', 'last', 'time_since_last',
    ],
    trans_primitives=['years', 'month', 'divide'],
    max_depth=3,
)

In [28]:
# Count the feature definitions returned by the max_depth=3 run
len(feature_names)

3530

In [29]:
# List the feature definitions returned by the max_depth=3 run
feature_names

[<Feature: income>,
 <Feature: credit_score>,
 <Feature: MEDIAN(loans.loan_amount)>,
 <Feature: MEDIAN(loans.rate)>,
 <Feature: MEAN(loans.loan_amount)>,
 <Feature: MEAN(loans.rate)>,
 <Feature: STD(loans.loan_amount)>,
 <Feature: STD(loans.rate)>,
 <Feature: MAX(loans.loan_amount)>,
 <Feature: MAX(loans.rate)>,
 <Feature: LAST(loans.loan_type)>,
 <Feature: LAST(loans.loan_amount)>,
 <Feature: LAST(loans.rate)>,
 <Feature: LAST(loans.repaid)>,
 <Feature: TIME_SINCE_LAST(loans.loan_start)>,
 <Feature: MEDIAN(payments.payment_amount)>,
 <Feature: MEAN(payments.payment_amount)>,
 <Feature: STD(payments.payment_amount)>,
 <Feature: MAX(payments.payment_amount)>,
 <Feature: LAST(payments.payment_amount)>,
 <Feature: LAST(payments.missed)>,
 <Feature: TIME_SINCE_LAST(payments.payment_date)>,
 <Feature: MONTH(joined)>,
 <Feature: income / credit_score>,
 <Feature: credit_score / income>,
 <Feature: MEDIAN(loans.MEDIAN(payments.payment_amount))>,
 <Feature: MEDIAN(loans.MEAN(payments.payment_a