In [2]:
import pandas as pd
import numpy as np

# featuretools for automated feature engineering
import featuretools as ft

# matplotlib and seaborn for visualizations
import matplotlib.pyplot as plt
plt.rcParams['font.size'] = 22
import seaborn as sns

# Suppress all Python warnings for the session (the filter is not limited to pandas)
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Directory that holds the raw CSV files; change it here rather than in every call
DATA_DIR = '../Data'


def load_sorted(filename, sort_keys):
    """Read one CSV from DATA_DIR, sort its rows, and renumber the index.

    Parameters
    ----------
    filename : str
        Name of the CSV file inside DATA_DIR.
    sort_keys : str or list of str
        Column name(s) passed to `DataFrame.sort_values`.

    Returns
    -------
    pandas.DataFrame
        The sorted rows with a fresh 0..n-1 index (the old index is dropped).
    """
    table = pd.read_csv('%s/%s' % (DATA_DIR, filename))
    return table.sort_values(sort_keys).reset_index(drop = True)


# Application tables are ordered by SK_ID_CURR
app_train = load_sorted('application_train.csv', 'SK_ID_CURR')
app_test = load_sorted('application_test.csv', 'SK_ID_CURR')

# Bureau tables
bureau = load_sorted('bureau.csv', ['SK_ID_CURR', 'SK_ID_BUREAU'])
bureau_balance = load_sorted('bureau_balance.csv', 'SK_ID_BUREAU')

# Remaining tables are ordered by SK_ID_CURR, then SK_ID_PREV
cash = load_sorted('POS_CASH_balance.csv', ['SK_ID_CURR', 'SK_ID_PREV'])
credit = load_sorted('credit_card_balance.csv', ['SK_ID_CURR', 'SK_ID_PREV'])
previous = load_sorted('previous_application.csv', ['SK_ID_CURR', 'SK_ID_PREV'])
installments = load_sorted('installments_payments.csv', ['SK_ID_CURR', 'SK_ID_PREV'])


In [1]:
# Summary statistics of the training table.
# pandas DataFrames have no `.desc()` method; `.describe()` is the summary call.
# This cell needs `app_train` to exist, so run it after the data-loading cell.
app_train.describe()

NameError: name 'app_train' is not defined

In [3]:
# Tag every row with the frame it came from
app_train['set'] = 'train'
app_test['set'] = 'test'
# Give app_test an all-NaN TARGET column
# (presumably so its columns line up with app_train -- confirm against the CSVs)
app_test["TARGET"] = np.nan

# Stack the train rows on top of the test rows with a fresh 0..n-1 index.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0;
# `pd.concat` is the supported way to do the same thing.
app = pd.concat([app_train, app_test], ignore_index = True)

In [4]:
# Create the EntitySet; its id is 'clients'
es = ft.EntitySet(id='clients')

In [5]:
# Tables registered with an existing column as their index:
# (entity id, dataframe, index column)
keyed_tables = [
    ('app', app, 'SK_ID_CURR'),
    ('bureau', bureau, 'SK_ID_BUREAU'),
    ('previous', previous, 'SK_ID_PREV'),
]
for entity_name, frame, index_column in keyed_tables:
    es = es.entity_from_dataframe(entity_id = entity_name,
                                  dataframe = frame,
                                  index = index_column)

# Tables with no unique column of their own: make_index=True asks
# featuretools to create the named index column
# (entity id, dataframe, name of the index to create)
unkeyed_tables = [
    ('bureau_balance', bureau_balance, 'bureaubalance_index'),
    ('cash', cash, 'cash_index'),
    ('installments', installments, 'installments_index'),
    ('credit', credit, 'credit_index'),
]
for entity_name, frame, created_index in unkeyed_tables:
    es = es.entity_from_dataframe(entity_id = entity_name,
                                  dataframe = frame,
                                  make_index = True,
                                  index = created_index)

In [6]:
def _relate(parent, child, key):
    """Return an ft.Relationship from `parent` to `child`, joined on column `key`.

    Both entities are looked up in the notebook-level EntitySet `es`, and the
    same column name is used on the parent and the child side.
    """
    return ft.Relationship(es[parent][key], es[child][key])


# Current application -> its bureau records and its previous applications
r_app_bureau = _relate('app', 'bureau', 'SK_ID_CURR')
r_app_previous = _relate('app', 'previous', 'SK_ID_CURR')

# Bureau record -> its bureau balance rows
r_bureau_balance = _relate('bureau', 'bureau_balance', 'SK_ID_BUREAU')

# Previous application -> its cash, installments, and credit rows
r_previous_cash = _relate('previous', 'cash', 'SK_ID_PREV')
r_previous_installments = _relate('previous', 'installments', 'SK_ID_PREV')
r_previous_credit = _relate('previous', 'credit', 'SK_ID_PREV')

In [7]:
# Register all six relationships with the EntitySet in one call
all_relationships = [
    r_app_bureau,
    r_bureau_balance,
    r_app_previous,
    r_previous_cash,
    r_previous_installments,
    r_previous_credit,
]
es = es.add_relationships(all_relationships)

# Show the EntitySet summary as the cell output
es

Entityset: clients
  Entities:
    app [Rows: 356255, Columns: 123]
    bureau [Rows: 1716428, Columns: 17]
    previous [Rows: 1670214, Columns: 37]
    bureau_balance [Rows: 27299925, Columns: 4]
    cash [Rows: 10001358, Columns: 9]
    installments [Rows: 13605401, Columns: 9]
    credit [Rows: 3840312, Columns: 24]
  Relationships:
    bureau.SK_ID_CURR -> app.SK_ID_CURR
    bureau_balance.SK_ID_BUREAU -> bureau.SK_ID_BUREAU
    previous.SK_ID_CURR -> app.SK_ID_CURR
    cash.SK_ID_PREV -> previous.SK_ID_PREV
    installments.SK_ID_PREV -> previous.SK_ID_PREV
    credit.SK_ID_PREV -> previous.SK_ID_PREV

In [8]:
# Allow up to 100 characters per cell so the descriptions are easier to read
pd.set_option('display.max_colwidth', 100)

# Table of primitives reported by featuretools
primitives = ft.list_primitives()

# First ten rows whose type is 'aggregation'
is_aggregation = primitives['type'] == 'aggregation'
primitives[is_aggregation].head(10)

Unnamed: 0,name,type,description
0,avg_time_between,aggregation,Computes the average time between consecutive events.
1,median,aggregation,Finds the median value of any feature with well-ordered values.
2,count,aggregation,Counts the number of non null values.
3,mean,aggregation,Computes the average value of a numeric feature.
4,time_since_last,aggregation,Time since last related instance.
5,n_most_common,aggregation,Finds the N most common elements in a categorical feature.
6,all,aggregation,Test if all values are 'True'.
7,sum,aggregation,Sums elements of a numeric or boolean feature.
8,time_since_first,aggregation,Time since first related instance.
9,percent_true,aggregation,Finds the percent of 'True' values in a boolean feature.


In [9]:
# First ten rows whose type is 'transform'
is_transform = primitives['type'] == 'transform'
primitives[is_transform].head(10)

Unnamed: 0,name,type,description
20,days_since,transform,"For each value of the base feature, compute the number of days between it"
21,modulo_by_feature,transform,
22,not_equal,transform,
23,is_weekend,transform,Transform Datetime feature into the boolean of Weekend.
24,num_words,transform,Returns the number of words in a given string by counting the spaces.
25,divide_by_feature,transform,
26,equal,transform,
27,modulo_numeric,transform,
28,month,transform,"Transform a Datetime feature into the ""month."
29,cum_max,transform,Returns the cumulative max after grouping


In [10]:
# Primitive names handed to DFS (described by the author as the featuretools defaults)
default_agg_primitives = [
    "sum", "std", "max", "skew", "min",
    "mean", "count", "percent_true", "num_unique", "mode",
]
default_trans_primitives = ["day", "year", "month", "weekday", "haversine"]

# features_only=True requests just the feature definitions for the 'app' entity
feature_names = ft.dfs(
    entityset = es,
    target_entity = 'app',
    agg_primitives = default_agg_primitives,
    trans_primitives = default_trans_primitives,
    max_depth = 2,
    features_only = True,
)

n_features = len(feature_names)
print('%d Total Features' % n_features)

1697 Total Features


In [None]:
# Same DFS configuration as the features-only call, but with
# features_only=False so the feature matrix is returned as well;
# verbose=True prints progress while it runs
feature_matrix, feature_names = ft.dfs(
    entityset = es,
    target_entity = 'app',
    agg_primitives = default_agg_primitives,
    trans_primitives = default_trans_primitives,
    max_depth = 2,
    features_only = False,
    verbose = True,
)
                       


Built 1697 features
Elapsed: 00:00 | Remaining: ? | Progress:   0%|          | Calculated: 0/11 chunks

In [None]:
# Preview the first five rows of the computed feature matrix
feature_matrix.head(n = 5)

In [None]:
# Reset the index to make SK_ID_CURR a column again.
# Guarded so re-running this cell is a no-op: an unguarded second
# reset_index() would add a spurious 'index' column instead.
if 'SK_ID_CURR' not in feature_matrix.columns:
    feature_matrix = feature_matrix.reset_index()