# Titanicのデータでfeaturetoolsを試す

In [1]:
import featuretools
import pandas as pd

In [2]:
# Load the Titanic training CSV and remove the `Survived` column before the
# frame is handed to featuretools.
# `columns=` is used instead of the positional axis argument
# (`drop('Survived', 1)`): passing `axis` positionally was deprecated in
# pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
train = pd.read_csv('../input/train.csv').drop(columns='Survived')

# EntitySetの準備をする

In [3]:
# Create an EntitySet with id 'data'; the annotation documents the concrete
# type for readers and editor tooling.
es: featuretools.entityset.entityset.EntitySet = featuretools.EntitySet(id='data')

# Add the `train` DataFrame as the entity 'train', indexed by 'PassengerId'.
# This call is the cell's last expression, so its return value is what the
# notebook displays.
es.entity_from_dataframe(
    entity_id='train',
    dataframe=train,
    index='PassengerId',
)

Entityset: data
  Entities:
    train [Rows: 891, Columns: 11]
  Relationships:
    No relationships

# 利用可能なPrimitiveを見る

In [4]:
# Table of the primitives featuretools offers; the following cells filter it
# by its 'type' column and read its 'name' column.
p = featuretools.list_primitives()

In [5]:
# Keep only the rows whose 'type' column equals 'transform'.
p.loc[p['type'].eq('transform')]

Unnamed: 0,name,type,description
20,time_since,transform,Calculates time from a value to a specified cu...
21,divide_by_feature,transform,Divide a scalar by each value in the list.
22,day,transform,Determines the day of the month from a datetime.
23,greater_than,transform,Determines if values in one list are greater t...
24,less_than,transform,Determines if values in one list are less than...
25,haversine,transform,Calculates the approximate haversine distance ...
26,add_numeric_scalar,transform,Add a scalar to each value in the list.
27,num_characters,transform,Calculates the number of characters in a string.
28,time_since_previous,transform,Compute the time since the previous entry in a...
29,cum_min,transform,Calculates the cumulative minimum.


In [6]:
# Names of the transform primitives, sorted so the list is easy to copy and edit.
p.loc[p['type'].eq('transform'), 'name'].sort_values().values

array(['absolute', 'add_numeric', 'add_numeric_scalar', 'and',
       'cum_count', 'cum_max', 'cum_mean', 'cum_min', 'cum_sum', 'day',
       'days_since', 'diff', 'divide_by_feature', 'divide_numeric',
       'divide_numeric_scalar', 'equal', 'equal_scalar', 'greater_than',
       'greater_than_equal_to', 'greater_than_equal_to_scalar',
       'greater_than_scalar', 'haversine', 'hour', 'is_null',
       'is_weekend', 'isin', 'latitude', 'less_than',
       'less_than_equal_to', 'less_than_equal_to_scalar',
       'less_than_scalar', 'longitude', 'minute', 'modulo_by_feature',
       'modulo_numeric', 'modulo_numeric_scalar', 'month',
       'multiply_numeric', 'multiply_numeric_scalar', 'negate', 'not',
       'not_equal', 'not_equal_scalar', 'num_characters', 'num_words',
       'or', 'percentile', 'scalar_subtract_numeric_feature', 'second',
       'subtract_numeric', 'subtract_numeric_scalar', 'time_since',
       'time_since_previous', 'week', 'weekday', 'year'], dtype=object)

# どんな特徴量が生成されるかを試す

In [7]:
# Transform primitives to pass to dfs via `trans_primitives`.
# Uncomment entries below to experiment with the features each one generates.

trans = [
# 'absolute',
 'add_numeric',
# 'add_numeric_scalar',
# 'and', # no point in including this one
# 'cum_count',
# 'cum_max',
# 'cum_mean',
# 'cum_min',
# 'cum_sum',
# 'day',
# 'days_since',
# 'diff',
# 'divide_by_feature',
# 'divide_numeric',
# 'divide_numeric_scalar',
# 'equal',
# 'equal_scalar',
# 'greater_than',
# 'greater_than_equal_to',
# 'greater_than_equal_to_scalar',
# 'greater_than_scalar',
# 'haversine',
# 'hour',
# 'is_null',
# 'is_weekend',
# 'isin',
# 'latitude',
# 'less_than',
# 'less_than_equal_to',
# 'less_than_equal_to_scalar',
# 'less_than_scalar',
# 'longitude',
# 'minute',
# 'modulo_by_feature',
# 'modulo_numeric',
# 'modulo_numeric_scalar',
# 'month',
# 'multiply_numeric',
# 'multiply_numeric_scalar',
# 'negate',
# 'not',
# 'not_equal',
# 'not_equal_scalar',
# 'num_characters',
# 'num_words',
# 'or',
# 'percentile',
# 'scalar_subtract_numeric_feature',
# 'second',
# 'subtract_numeric',
# 'subtract_numeric_scalar',
# 'time_since',
# 'time_since_previous',
# 'week',
# 'weekday',
# 'year'
]


# Dry run: with features_only=True, dfs hands back only the list of feature
# definitions, which is displayed as the cell output.
# No aggregation primitives are used; only the transforms selected in `trans`.
feature_names = featuretools.dfs(
    entityset=es,
    target_entity='train',
    trans_primitives=trans,
    agg_primitives=[],
    max_depth=1,
    features_only=True,
    verbose=1,
)
feature_names

Built 20 features


[<Feature: Pclass>,
 <Feature: Name>,
 <Feature: Sex>,
 <Feature: Age>,
 <Feature: SibSp>,
 <Feature: Parch>,
 <Feature: Ticket>,
 <Feature: Fare>,
 <Feature: Cabin>,
 <Feature: Embarked>,
 <Feature: Pclass + SibSp>,
 <Feature: Age + Fare>,
 <Feature: Fare + Pclass>,
 <Feature: Parch + Pclass>,
 <Feature: Age + Pclass>,
 <Feature: Fare + Parch>,
 <Feature: Parch + SibSp>,
 <Feature: Age + Parch>,
 <Feature: Age + SibSp>,
 <Feature: Fare + SibSp>]

# 実際に特徴量を作って、処理時間と結果を見る

In [8]:
# Full run: with features_only=False, dfs returns two values, unpacked here
# as the computed feature matrix (df) and the feature definitions (fs).
# verbose=1 prints the progress/elapsed-time line shown in the output.
df, fs = featuretools.dfs(
    entityset=es,
    target_entity='train',
    trans_primitives=trans,
    agg_primitives=[],
    max_depth=1,
    features_only=False,
    verbose=1,
)

Built 20 features
Elapsed: 00:00 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 11/11 chunks


In [9]:
# Preview the first rows of the feature matrix returned by dfs.
df.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Pclass + SibSp,Age + Fare,Fare + Pclass,Parch + Pclass,Age + Pclass,Fare + Parch,Parch + SibSp,Age + Parch,Age + SibSp,Fare + SibSp
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,4,29.25,10.25,3,25.0,7.25,1,22.0,23.0,8.25
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2,109.2833,72.2833,1,39.0,71.2833,1,38.0,39.0,72.2833
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,3,33.925,10.925,3,29.0,7.925,0,26.0,26.0,7.925
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,2,88.1,54.1,1,36.0,53.1,1,35.0,36.0,54.1
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,3,43.05,11.05,3,38.0,8.05,0,35.0,35.0,8.05


In [10]:
# Display the feature definitions returned by dfs alongside the feature matrix.
fs

[<Feature: Pclass>,
 <Feature: Name>,
 <Feature: Sex>,
 <Feature: Age>,
 <Feature: SibSp>,
 <Feature: Parch>,
 <Feature: Ticket>,
 <Feature: Fare>,
 <Feature: Cabin>,
 <Feature: Embarked>,
 <Feature: Pclass + SibSp>,
 <Feature: Age + Fare>,
 <Feature: Fare + Pclass>,
 <Feature: Parch + Pclass>,
 <Feature: Age + Pclass>,
 <Feature: Fare + Parch>,
 <Feature: Parch + SibSp>,
 <Feature: Age + Parch>,
 <Feature: Age + SibSp>,
 <Feature: Fare + SibSp>]

# おわりに
- 1テーブルでも活用の道はありそう
- 意味のある変換(transform)プリミティブと意味のないもの(今回だとand)があるので、生成される特徴量をよく見たほうがいいね