# README
- [DataFrameで特徴量作るのめんどくさ過ぎる。。featuretoolsを使って自動生成したろ - Qiita](https://qiita.com/Hyperion13fleet/items/4eaca365f28049fe11c7) をまずはやったほうが良いかも。


# デモ用のデータを確認する

In [1]:
import featuretools as ft
import pandas as pd

# Load the mock customer demo dataset bundled with featuretools.
data = ft.demo.load_mock_customer()

In [2]:
# Check what kind of container was returned and which tables it holds.
type(data), data.keys()

(dict, dict_keys(['customers', 'sessions', 'transactions', 'products']))

## customers

In [3]:
# Summarize the customers table: columns, dtypes and non-null counts.
data['customers'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
customer_id      5 non-null int64
zip_code         5 non-null object
join_date        5 non-null datetime64[ns]
date_of_birth    5 non-null datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(1)
memory usage: 240.0+ bytes


In [4]:
# Display the whole customers table.
data['customers']

Unnamed: 0,customer_id,zip_code,join_date,date_of_birth
0,1,60091,2011-04-17 10:48:33,1994-07-18
1,2,13244,2012-04-15 23:31:04,1986-08-18
2,3,13244,2011-08-13 15:42:34,2003-11-21
3,4,60091,2011-04-08 20:08:14,2006-08-15
4,5,60091,2010-07-17 05:27:50,1984-07-28


## sessions

In [5]:
# Summarize the sessions table: columns, dtypes and non-null counts.
data['sessions'].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35 entries, 0 to 34
Data columns (total 4 columns):
session_id       35 non-null int64
customer_id      35 non-null int64
device           35 non-null object
session_start    35 non-null datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 1.4+ KB


In [6]:
# Preview the first rows of the sessions table.
data['sessions'].head()

Unnamed: 0,session_id,customer_id,device,session_start
0,1,2,desktop,2014-01-01 00:00:00
1,2,5,mobile,2014-01-01 00:17:20
2,3,4,mobile,2014-01-01 00:28:10
3,4,1,mobile,2014-01-01 00:44:25
4,5,4,mobile,2014-01-01 01:11:30


## transactions

In [7]:
# Summarize the transactions table: columns, dtypes and non-null counts.
data['transactions'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 5 columns):
transaction_id      500 non-null int64
session_id          500 non-null int64
transaction_time    500 non-null datetime64[ns]
product_id          500 non-null category
amount              500 non-null float64
dtypes: category(1), datetime64[ns](1), float64(1), int64(2)
memory usage: 16.4 KB


In [8]:
# Preview the first rows of the transactions table.
data['transactions'].head()

Unnamed: 0,transaction_id,session_id,transaction_time,product_id,amount
0,298,1,2014-01-01 00:00:00,5,127.64
1,2,1,2014-01-01 00:01:05,2,109.48
2,308,1,2014-01-01 00:02:10,3,95.06
3,116,1,2014-01-01 00:03:15,4,78.92
4,371,1,2014-01-01 00:04:20,3,31.54


# featuretoolsの流れ
1. EntitySetの生成とEntityの追加
2. Relationshipの定義
3. DFS(Deep Feature Synthesis)の実行

# EntitySetを作成し、Entityを追加する

In [9]:
# Start from an empty EntitySet and register the three demo tables in it.
es = ft.EntitySet(id='demodat')

# One row per entity:
# (entity id, key of the table in `data`, column used as the unique index)
entity_specs = [
    ('cust', 'customers', 'customer_id'),
    ('session', 'sessions', 'session_id'),
    ('trans', 'transactions', 'transaction_id'),
]

for entity_id, table_key, index_column in entity_specs:
    es.entity_from_dataframe(entity_id=entity_id,
                             dataframe=data[table_key],
                             index=index_column)

# Final expression of the cell, so the notebook displays the EntitySet summary.
es

Entityset: demodat
  Entities:
    cust [Rows: 5, Columns: 4]
    session [Rows: 35, Columns: 4]
    trans [Rows: 500, Columns: 5]
  Relationships:
    No relationships

## ちなみに
- ちなみに**Entityは必ずユニークに識別出来る列を持っておく必要がある**ので、そこは注意が必要。

In [10]:
# Note: every Entity must have a column that uniquely identifies each row.
# Using a non-unique column (such as zip_code) as the index raises:
#   AssertionError: Index is not unique on dataframe (Entity cust)
# At least it fails loudly with an exception, which is helpful.
# es.entity_from_dataframe(entity_id='cust',
#                          dataframe=data['customers'],
#                          index='zip_code')

# Relationshipを生成する
- 3つのEntityを、それぞれRelationshipを定義して結合する

In [11]:
# Describe how the three entities join to each other.
# ft.Relationship takes the parent variable (the unique key) first, then the
# child variable that refers to it.
cust_key = es['cust']['customer_id']
session_cust_ref = es['session']['customer_id']
r_cust_session = ft.Relationship(cust_key, session_cust_ref)

session_key = es['session']['session_id']
trans_session_ref = es['trans']['session_id']
r_session_trans = ft.Relationship(session_key, trans_session_ref)

# Attach (link) both relationships to the EntitySet. This call is the cell's
# final expression, so its return value is what the notebook displays.
es.add_relationships(relationships=[r_cust_session, r_session_trans])

Entityset: demodat
  Entities:
    cust [Rows: 5, Columns: 4]
    session [Rows: 35, Columns: 4]
    trans [Rows: 500, Columns: 5]
  Relationships:
    session.customer_id -> cust.customer_id
    trans.session_id -> session.session_id

# DataTypeを確認する
- 自動で変数を作成する処理に移る前に、featuretoolsが定義するデータのタイプについて把握しておく必要がある。
- いや、pandasの定義と同じやろ！！と思っていたのだが、意外と細かく設定されている。

## ポイント
IdやIndexのようなデータ型が存在し、Entityのユニークキーや、他のEntityと紐づけるためのキーを明示的に定義している。**IndexやIdのようなインデックス系のデータ定義がされているものは集約関数が適用されない**。



In [12]:
# How to check the variable types featuretools assigned to the cust entity
es['cust'].variables

[<Variable: customer_id (dtype = index)>,
 <Variable: zip_code (dtype = categorical)>,
 <Variable: join_date (dtype: datetime, format: None)>,
 <Variable: date_of_birth (dtype: datetime, format: None)>]

In [13]:
# How to check the variable types featuretools assigned to the session entity
es['session'].variables

[<Variable: session_id (dtype = index)>,
 <Variable: customer_id (dtype = id)>,
 <Variable: device (dtype = categorical)>,
 <Variable: session_start (dtype: datetime, format: None)>]

In [14]:
# How to check the variable types featuretools assigned to the trans entity
es['trans'].variables

[<Variable: transaction_id (dtype = index)>,
 <Variable: session_id (dtype = id)>,
 <Variable: transaction_time (dtype: datetime, format: None)>,
 <Variable: product_id (dtype = categorical)>,
 <Variable: amount (dtype = numeric)>]

# DFS(Deep Feature Synthesis)を実行する

## その1

In [15]:
# Aggregation primitives: summarize the rows of a related child entity
# (in this run they yield SUM/MIN/MAX(trans.amount) and COUNT(trans)).
list_agg = ['sum', 'min', 'max', 'count']

# Transform primitives: derive new columns from a column of the target entity
# (in this run they split the datetime column session_start into
# YEAR/MONTH/DAY).
list_trans = ['year', 'month', 'day']

# Collect the DFS settings first, then run Deep Feature Synthesis.
dfs_settings = {
    'entityset': es,
    'target_entity': 'session',        # the feature matrix is indexed by session_id
    'agg_primitives': list_agg,
    'trans_primitives': list_trans,
    'max_depth': 1,                    # upper bound on the depth of generated features
}

# ft.dfs returns the feature matrix and the list of feature definitions.
df_feature, features_defs = ft.dfs(**dfs_settings)

In [16]:
# Number of columns before vs. after feature generation
len(data['sessions'].columns), len(df_feature.columns)

(4, 10)

In [17]:
# Columns present in the feature matrix but not in the original sessions table
set(df_feature.columns) - set(data['sessions'].columns)

{'COUNT(trans)',
 'DAY(session_start)',
 'MAX(trans.amount)',
 'MIN(trans.amount)',
 'MONTH(session_start)',
 'SUM(trans.amount)',
 'YEAR(session_start)',
 'cust.zip_code'}

In [18]:
# Preview the first rows of the generated feature matrix.
df_feature.head()

Unnamed: 0_level_0,customer_id,device,SUM(trans.amount),MIN(trans.amount),MAX(trans.amount),COUNT(trans),YEAR(session_start),MONTH(session_start),DAY(session_start),cust.zip_code
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,2,desktop,1229.01,20.91,141.66,16,2014,1,1,13244
2,5,mobile,746.96,9.32,135.25,10,2014,1,1,60091
3,4,mobile,1329.0,8.7,147.73,15,2014,1,1,60091
4,1,mobile,1613.93,6.29,129.0,25,2014,1,1,60091
5,4,mobile,777.02,7.43,139.2,11,2014,1,1,60091


In [19]:
# Show the feature definitions returned by ft.dfs alongside the feature matrix.
features_defs

[<Feature: customer_id>,
 <Feature: device>,
 <Feature: SUM(trans.amount)>,
 <Feature: MIN(trans.amount)>,
 <Feature: MAX(trans.amount)>,
 <Feature: COUNT(trans)>,
 <Feature: YEAR(session_start)>,
 <Feature: MONTH(session_start)>,
 <Feature: DAY(session_start)>,
 <Feature: cust.zip_code>]

## その2: depth=0 の場合
- depth=0の場合には、自分自身のEntityの変数のうち、IndexやIdでないものだけが特徴量として生成される(下の結果の通り、datetime型のjoin_date・date_of_birthもそのままでは特徴量にならず、zip_codeだけが残る)。

In [20]:
# depth=0: run DFS on the cust entity with max_depth=0.
# The same primitive lists as the deeper runs are passed, so only max_depth
# differs between the experiments.
list_agg = ['sum', 'median', 'count', 'std']    # aggregation primitives
list_trans = ['year', 'month', 'day']           # transform primitives

depth0_settings = {
    'entityset': es,
    'target_entity': 'cust',
    'agg_primitives': list_agg,
    'trans_primitives': list_trans,
    'max_depth': 0,
}

# Only the feature matrix is kept; the feature definitions are discarded.
df_feature_depth0, _ = ft.dfs(**depth0_settings)

In [21]:
# Number of columns before vs. after feature generation
len(data['customers'].columns), len(df_feature_depth0.columns)

(4, 1)

In [22]:
# Display the whole depth-0 feature matrix.
df_feature_depth0

Unnamed: 0_level_0,zip_code
customer_id,Unnamed: 1_level_1
1,60091
2,13244
3,13244
4,60091
5,60091


## その3: depth=1
- depth=1の場合には、自分自身のEntityのうち、Transformの対象となっている変数と1階層下の子Entityに集計関数を適用した結果が特徴量として生成されている

In [23]:
# depth=1: same primitives as the depth-0 run, only max_depth is raised to 1.
list_agg = ['sum', 'median', 'count', 'std']    # aggregation primitives
list_trans = ['year', 'month', 'day']           # transform primitives

depth1_settings = {
    'entityset': es,
    'target_entity': 'cust',
    'agg_primitives': list_agg,
    'trans_primitives': list_trans,
    'max_depth': 1,
}

# Only the feature matrix is kept; the feature definitions are discarded.
df_feature_depth1, _ = ft.dfs(**depth1_settings)

# Report how many features were generated and what they are called.
depth1_columns = list(df_feature_depth1.columns)
print(len(depth1_columns))
print(depth1_columns)

8
['zip_code', 'COUNT(session)', 'YEAR(join_date)', 'YEAR(date_of_birth)', 'MONTH(join_date)', 'MONTH(date_of_birth)', 'DAY(join_date)', 'DAY(date_of_birth)']


In [24]:
# Show only the first row of the depth-1 feature matrix.
df_feature_depth1.head(1)

Unnamed: 0_level_0,zip_code,COUNT(session),YEAR(join_date),YEAR(date_of_birth),MONTH(join_date),MONTH(date_of_birth),DAY(join_date),DAY(date_of_birth)
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,60091,8,2011,1994,4,7,17,18


## その4: depth=2
- depth=2の場合には、当然孫Entityまで含めて作成される。



In [25]:
# depth=2: same primitives again, with max_depth raised to 2 so that features
# built on top of other features (e.g. SUM(session.MEDIAN(trans.amount)))
# can be generated.
list_agg = ['sum', 'median', 'count', 'std']    # aggregation primitives
list_trans = ['year', 'month', 'day']           # transform primitives

depth2_settings = {
    'entityset': es,
    'target_entity': 'cust',
    'agg_primitives': list_agg,
    'trans_primitives': list_trans,
    'max_depth': 2,
}

# Only the feature matrix is kept; the feature definitions are discarded.
df_feature_depth2, _ = ft.dfs(**depth2_settings)

# Report how many features were generated and what they are called.
depth2_columns = list(df_feature_depth2.columns)
print(len(depth2_columns))
print(depth2_columns)

21
['zip_code', 'COUNT(session)', 'SUM(trans.amount)', 'MEDIAN(trans.amount)', 'COUNT(trans)', 'STD(trans.amount)', 'YEAR(join_date)', 'YEAR(date_of_birth)', 'MONTH(join_date)', 'MONTH(date_of_birth)', 'DAY(join_date)', 'DAY(date_of_birth)', 'SUM(session.MEDIAN(trans.amount))', 'SUM(session.STD(trans.amount))', 'MEDIAN(session.SUM(trans.amount))', 'MEDIAN(session.MEDIAN(trans.amount))', 'MEDIAN(session.COUNT(trans))', 'MEDIAN(session.STD(trans.amount))', 'STD(session.SUM(trans.amount))', 'STD(session.MEDIAN(trans.amount))', 'STD(session.COUNT(trans))']


In [26]:
# Show only the first row of the depth-2 feature matrix.
df_feature_depth2.head(1)

Unnamed: 0_level_0,zip_code,COUNT(session),SUM(trans.amount),MEDIAN(trans.amount),COUNT(trans),STD(trans.amount),YEAR(join_date),YEAR(date_of_birth),MONTH(join_date),MONTH(date_of_birth),...,DAY(date_of_birth),SUM(session.MEDIAN(trans.amount)),SUM(session.STD(trans.amount)),MEDIAN(session.SUM(trans.amount)),MEDIAN(session.MEDIAN(trans.amount)),MEDIAN(session.COUNT(trans)),MEDIAN(session.STD(trans.amount)),STD(session.SUM(trans.amount)),STD(session.MEDIAN(trans.amount)),STD(session.COUNT(trans))
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60091,8,9025.62,69.715,126,40.442059,2011,1994,4,7,...,18,551.405,312.745952,1038.83,63.88,15.0,40.006227,279.510713,19.024655,4.062019


# 遊んでみようとおもったけど、別ノートにしよう