In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# install dependencies
!pip install dask
!pip install featuretools
!pip install -U -q PyDrive

from IPython.display import clear_output
clear_output()

In [None]:
# clean workspace
!rm -rf top_features
!rm -rf data
!rm -rf partitioned_data
!rm data.zip
!rm -rf __MACOSX
!ls

rm: cannot remove 'data.zip': No such file or directory
drive  sample_data


In [None]:
import pandas as pd
import numpy as np
import os
from random import sample
from tqdm import tqdm
import gc
import warnings
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')
import featuretools as ft
# Last expression of the cell: displays the installed featuretools version.
ft.__version__

'0.26.2'

In [None]:
# Data directory on the mounted Drive. The trailing slash matters: file names
# are appended to this string with `+` when the CSV files are read.
path = '/content/drive/MyDrive/Notebook/data_scientist/P7/'
# Names of the entries currently present in the data directory.
list_files = os.listdir(path)

In [None]:
print('Reading in data')

# Read in the full datasets (`path` ends with a slash, so file names are appended directly)
app_train = pd.read_csv(path + 'application_train.csv')
bureau = pd.read_csv(path + 'bureau.csv')
bureau_balance = pd.read_csv(path + 'bureau_balance.csv')
cash = pd.read_csv(path + 'POS_CASH_balance.csv')
credit = pd.read_csv(path + 'credit_card_balance.csv')
previous = pd.read_csv(path + 'previous_application.csv')
installments = pd.read_csv(path + 'installments_payments.csv')

# Only the training applications are loaded here; every row is tagged with a
# constant 'set' column.
app_train['set'] = 'train'

# Create the entity set with an id
es = ft.EntitySet(id = 'applications')

# Add in all the entities

# Entities whose table already has a unique key column
es = es.entity_from_dataframe(entity_id = 'app', dataframe = app_train, index = 'SK_ID_CURR')

es = es.entity_from_dataframe(entity_id = 'bureau', dataframe = bureau, index = 'SK_ID_BUREAU')

es = es.entity_from_dataframe(entity_id = 'previous', dataframe = previous, index = 'SK_ID_PREV')

# Entities that do not have a unique index: make_index = True asks featuretools
# to create the named index column
es = es.entity_from_dataframe(entity_id = 'bureau_balance', dataframe = bureau_balance,
                              make_index = True, index = 'bb_index')

es = es.entity_from_dataframe(entity_id = 'cash', dataframe = cash,
                              make_index = True, index = 'cash_index')

es = es.entity_from_dataframe(entity_id = 'installments', dataframe = installments,
                              make_index = True, index = 'in_index')

es = es.entity_from_dataframe(entity_id = 'credit', dataframe = credit,
                              make_index = True, index = 'credit_index')


# Each relationship is declared as (parent variable, child variable)

# Relationship between app and bureau
r_app_bureau = ft.Relationship(es['app']['SK_ID_CURR'], es['bureau']['SK_ID_CURR'])

# Relationship between bureau and bureau balance
r_bureau_balance = ft.Relationship(es['bureau']['SK_ID_BUREAU'], es['bureau_balance']['SK_ID_BUREAU'])

# Relationship between current app and previous apps
r_app_previous = ft.Relationship(es['app']['SK_ID_CURR'], es['previous']['SK_ID_CURR'])

# Relationships between previous apps and cash, installments, and credit
r_previous_cash = ft.Relationship(es['previous']['SK_ID_PREV'], es['cash']['SK_ID_PREV'])
r_previous_installments = ft.Relationship(es['previous']['SK_ID_PREV'], es['installments']['SK_ID_PREV'])
r_previous_credit = ft.Relationship(es['previous']['SK_ID_PREV'], es['credit']['SK_ID_PREV'])

# Add in the defined relationships
es = es.add_relationships([r_app_bureau, r_bureau_balance, r_app_previous,
                           r_previous_cash, r_previous_installments, r_previous_credit])

print(es)

print('Clearing up memory')

gc.enable()
# Drop the notebook-level names of the secondary tables and collect.
# `app_train` is intentionally left bound, so the application table itself
# stays reachable from the notebook namespace.
del bureau, bureau_balance, cash, credit, installments, previous
gc.collect()

print('Deep Feature Synthesis in Progress')

# Default primitives from featuretools
default_agg_primitives =  ["sum", "std", "max", "skew", "min", "mean", "count", "percent_true", "num_unique", "mode"]
default_trans_primitives =  ["day", "year", "month", "weekday", "haversine", "num_words", "num_characters"]

# DFS for application features using a max depth of 1.
# With features_only = False, dfs returns the computed matrix together with
# the list of feature definitions.
feature_matrix, feature_names = ft.dfs(entityset = es, target_entity = 'app',
                       trans_primitives = default_trans_primitives,
                       agg_primitives=default_agg_primitives,
                       max_depth = 1, features_only=False, verbose = True)

# Reset the index to make SK_ID_CURR a column again
feature_matrix = feature_matrix.reset_index()

print('Saving features')
# Write next to the input files, reusing `path` instead of repeating the directory literal
feature_matrix.to_csv(path + 'feature_matrix.csv', index = False)

Reading in data
Entityset: applications
  Entities:
    app [Rows: 307511, Columns: 123]
    bureau [Rows: 1716428, Columns: 17]
    previous [Rows: 1670214, Columns: 37]
    bureau_balance [Rows: 27299925, Columns: 4]
    cash [Rows: 10001358, Columns: 9]
    installments [Rows: 13605401, Columns: 9]
    credit [Rows: 3840312, Columns: 24]
  Relationships:
    bureau.SK_ID_CURR -> app.SK_ID_CURR
    bureau_balance.SK_ID_BUREAU -> bureau.SK_ID_BUREAU
    previous.SK_ID_CURR -> app.SK_ID_CURR
    cash.SK_ID_PREV -> previous.SK_ID_PREV
    installments.SK_ID_PREV -> previous.SK_ID_PREV
    credit.SK_ID_PREV -> previous.SK_ID_PREV
Clearing up memory
Deep Feature Synthesis in Progress
Built 348 features
Elapsed: 36:20 | Progress: 100%|██████████
Saving features
