# README
- [DataFrameで特徴量作るのめんどくさ過ぎる。。featuretoolsを使って自動生成したろ - Qiita](https://qiita.com/Hyperion13fleet/items/4eaca365f28049fe11c7)
- 上記はコピペで試したので、今度は自分でやって理解を深めるノート
- 特に名前の省略が気に食わない〜〜〜！！！


# データの準備

In [1]:
import featuretools
import pandas as pd
import copy


# Load the mock customer demo dataset that ships with featuretools.
# The returned object is indexed by table name ('customers', 'sessions',
# 'transactions') in the following cell.
data = featuretools.demo.load_mock_customer()

In [2]:
# Typing the dict key every time is tedious and hides what type each value is,
# so bind every table to its own descriptive variable up front.
df_customers, df_sessions, df_transactions = (
    data[table] for table in ('customers', 'sessions', 'transactions')
)

# データの確認
- 3つの関係性をよくみてくれ
- 親: Customers
    - 子: Sessions
        - 孫: Transactions

In [3]:
# Display the customers table (the top-level parent).
df_customers

Unnamed: 0,customer_id,zip_code,join_date,date_of_birth
0,1,60091,2011-04-17 10:48:33,1994-07-18
1,2,13244,2012-04-15 23:31:04,1986-08-18
2,3,13244,2011-08-13 15:42:34,2003-11-21
3,4,60091,2011-04-08 20:08:14,2006-08-15
4,5,60091,2010-07-17 05:27:50,1984-07-28


In [4]:
# Preview the first rows of the sessions table (child of customers).
df_sessions.head()

Unnamed: 0,session_id,customer_id,device,session_start
0,1,2,desktop,2014-01-01 00:00:00
1,2,5,mobile,2014-01-01 00:17:20
2,3,4,mobile,2014-01-01 00:28:10
3,4,1,mobile,2014-01-01 00:44:25
4,5,4,mobile,2014-01-01 01:11:30


In [5]:
# Note that 'amount' is the numeric column in this table!
df_transactions.head()

Unnamed: 0,transaction_id,session_id,transaction_time,product_id,amount
0,298,1,2014-01-01 00:00:00,5,127.64
1,2,1,2014-01-01 00:01:05,2,109.48
2,308,1,2014-01-01 00:02:10,3,95.06
3,116,1,2014-01-01 00:03:15,4,78.92
4,371,1,2014-01-01 00:04:20,3,31.54


# featuretoolsでつくる

## EntitySetの作成とEntityの登録

In [6]:
# Naming is a bit awkward here: the tables are plural, the entities singular.
entity_set = featuretools.EntitySet(id='data')

# (entity id, source DataFrame, index column) for every table, listed in
# parent -> child -> grandchild order.
entity_specs = [
    ('customer', df_customers, 'customer_id'),
    ('session', df_sessions, 'session_id'),
    ('transaction', df_transactions, 'transaction_id'),
]

# Register each DataFrame as an entity of the set.
for entity_name, frame, index_column in entity_specs:
    entity_set.entity_from_dataframe(
        entity_id=entity_name,
        dataframe=frame,
        index=index_column,
    )

# Last expression of the cell: show the resulting entity set.
entity_set

Entityset: data
  Entities:
    customer [Rows: 5, Columns: 4]
    session [Rows: 35, Columns: 4]
    transaction [Rows: 500, Columns: 5]
  Relationships:
    No relationships

## Relationshipの作成とRelationの登録

In [7]:
# Relationship linking each session (child) to its customer (parent)
# through the customer_id column present in both entities.
relation_customer_and_session = featuretools.Relationship(
    parent_variable=entity_set['customer']['customer_id'],
    child_variable=entity_set['session']['customer_id'])

# Relationship linking each transaction (child) to its session (parent)
# through the session_id column present in both entities.
relation_session_and_transaction = featuretools.Relationship(
    parent_variable=entity_set['session']['session_id'],
    child_variable=entity_set['transaction']['session_id'])


# Both can be registered in one call with `add_relationships()` -- note the
# plural method name.
entity_set.add_relationships(relationships=[relation_customer_and_session,
                                            relation_session_and_transaction])
# To register them one at a time instead, use the singular method:
# entity_set.add_relationship(relationship=relation_customer_and_session)
# entity_set.add_relationship(relationship=relation_session_and_transaction)

Entityset: data
  Entities:
    customer [Rows: 5, Columns: 4]
    session [Rows: 35, Columns: 4]
    transaction [Rows: 500, Columns: 5]
  Relationships:
    session.customer_id -> customer.customer_id
    transaction.session_id -> session.session_id

In [8]:
# Check how the "Relationships" section of the summary has changed.
entity_set

Entityset: data
  Entities:
    customer [Rows: 5, Columns: 4]
    session [Rows: 35, Columns: 4]
    transaction [Rows: 500, Columns: 5]
  Relationships:
    session.customer_id -> customer.customer_id
    transaction.session_id -> session.session_id

# DFSを実行する

## 集約関数を適用しないでやってみる
- デフォルトの挙動がよくわかる
- **Pandasでは Numeric と Categorical を区別したし、featuretoolsでの型も区別されているが、 どちらの集約関数も `agg_primitives` にわたすので注意ね**
- Numeric型に適用される関数
    - SUM(transaction.amount)
    - STD(transaction.amount)
    - MAX(transaction.amount)
    - SKEW(transaction.amount)
    - MIN(transaction.amount)
    - MEAN(transaction.amount)
- 子のidは回数とか頻度が集約される
    - COUNT(transaction)
    - NUM_UNIQUE(transaction.product_id)
    - MODE(transaction.product_id)
- 親に関しては、KEYのみ
    - customer.zip_code
- datetime型は前回の通り(今回は自身のテーブルのみ)
    - DAY(session_start)
    - YEAR(session_start)
    - MONTH(session_start)
    - WEEKDAY(session_start)

In [9]:
# Run Deep Feature Synthesis for the 'session' entity.
# Both primitive lists are left as None so that featuretools falls back to
# its default primitives, which makes the default behaviour easy to inspect.
# The call yields the feature matrix and the list of feature definitions.
df_feature_0, feature_defs_0 = featuretools.dfs(
    entityset=entity_set,
    target_entity='session',
    agg_primitives=None,
    trans_primitives=None,
    max_depth=1,
)

### DataFrameの比較


In [10]:
# First two rows of the generated feature matrix, transposed so that each
# feature is shown on its own row.
df_feature_0.head(2).T

session_id,1,2
customer_id,2,5
device,desktop,mobile
SUM(transaction.amount),1229.01,746.96
STD(transaction.amount),41.601,45.8936
MAX(transaction.amount),141.66,135.25
SKEW(transaction.amount),0.295458,-0.16055
MIN(transaction.amount),20.91,9.32
MEAN(transaction.amount),76.8131,74.696
COUNT(transaction),16,10
NUM_UNIQUE(transaction.product_id),5,5


In [11]:
# First two rows of the raw sessions table (transposed), to compare against
# the generated features.
df_sessions.head(2).T

Unnamed: 0,0,1
session_id,1,2
customer_id,2,5
device,desktop,mobile
session_start,2014-01-01 00:00:00,2014-01-01 00:17:20


In [12]:
# List the variables of the 'transaction' entity together with the types
# featuretools assigned to them.
entity_set['transaction'].variables

[<Variable: transaction_id (dtype = index)>,
 <Variable: session_id (dtype = id)>,
 <Variable: transaction_time (dtype: datetime, format: None)>,
 <Variable: product_id (dtype = categorical)>,
 <Variable: amount (dtype = numeric)>]

## datetimeに対して変換関数(`trans_primitives`)を使ってみる
- 予想通りの動き

In [13]:
# Aggregation primitives applied to the child rows of each session.
agg_numeric = [
    'sum',
    'max',
    'count',
    'mode',  # this is the one that works on categorical columns
]
# Transform primitives; try swapping in different ones to see the effect.
agg_trans = ['year', 'month']

# Deep Feature Synthesis for 'session' with the explicitly chosen primitives.
df_feature_1, feature_defs_1 = featuretools.dfs(
    entityset=entity_set,
    target_entity='session',
    agg_primitives=agg_numeric,
    trans_primitives=agg_trans,
    max_depth=1,
)

In [14]:
# First three rows of the features built with the explicit primitives,
# transposed so that each feature is shown on its own row.
df_feature_1.head(3).T

session_id,1,2,3
customer_id,2,5,4
device,desktop,mobile,mobile
SUM(transaction.amount),1229.01,746.96,1329
MAX(transaction.amount),141.66,135.25,147.73
COUNT(transaction),16,10,15
MODE(transaction.product_id),3,5,1
YEAR(session_start),2014,2014,2014
MONTH(session_start),1,1,1
customer.zip_code,13244,60091,60091


In [15]:
# Same three rows from the default-primitive run, for side-by-side comparison.
df_feature_0.head(3).T

session_id,1,2,3
customer_id,2,5,4
device,desktop,mobile,mobile
SUM(transaction.amount),1229.01,746.96,1329
STD(transaction.amount),41.601,45.8936,46.24
MAX(transaction.amount),141.66,135.25,147.73
SKEW(transaction.amount),0.295458,-0.16055,-0.324012
MIN(transaction.amount),20.91,9.32,8.7
MEAN(transaction.amount),76.8131,74.696,88.6
COUNT(transaction),16,10,15
NUM_UNIQUE(transaction.product_id),5,5,5


## 親(Customers)に対してDFS

In [16]:
# Aggregation primitives applied to the child rows of each customer.
agg_numeric = [
    'sum',
    'max',
    'count',
    'mode',  # this is the one that works on categorical columns
]

# Transform primitives; try swapping in different ones to see the effect.
agg_trans = ['year']

# Deep Feature Synthesis with the parent entity 'customer' as the target.
df_feature_2, feature_defs_2 = featuretools.dfs(
    entityset=entity_set,
    target_entity='customer',
    agg_primitives=agg_numeric,
    trans_primitives=agg_trans,
    max_depth=1,
)

In [17]:
# First three rows of the raw customers table (transposed), as the baseline.
df_customers.head(3).T

Unnamed: 0,0,1,2
customer_id,1,2,3
zip_code,60091,13244,13244
join_date,2011-04-17 10:48:33,2012-04-15 23:31:04,2011-08-13 15:42:34
date_of_birth,1994-07-18 00:00:00,1986-08-18 00:00:00,2003-11-21 00:00:00


In [18]:
# First three rows of the features generated for the customer entity,
# transposed so that each feature is shown on its own row.
df_feature_2.head(3).T

customer_id,1,2,3
zip_code,60091,13244,13244
COUNT(session),8,7,6
MODE(session.device),mobile,desktop,desktop
YEAR(join_date),2011,2012,2011
YEAR(date_of_birth),1994,1986,2003


## 孫(Transactions)にDFS

In [19]:
# Aggregation primitives requested for this run.
agg_numeric = [
    'sum',
    'max',
    'count',
    'mode',  # this is the one that works on categorical columns
]

# Transform primitives; try swapping in different ones to see the effect.
agg_trans = ['year']

# Deep Feature Synthesis with the grandchild entity 'transaction' as target.
df_feature_3, feature_defs_3 = featuretools.dfs(
    entityset=entity_set,
    target_entity='transaction',
    agg_primitives=agg_numeric,
    trans_primitives=agg_trans,
    max_depth=1,
)

In [20]:
# First three rows of the raw transactions table (transposed), as the baseline.
df_transactions.head(3).T

Unnamed: 0,0,1,2
transaction_id,298,2,308
session_id,1,1,1
transaction_time,2014-01-01 00:00:00,2014-01-01 00:01:05,2014-01-01 00:02:10
product_id,5,2,3
amount,127.64,109.48,95.06


In [21]:
# First three rows of the features generated for the transaction entity,
# transposed so that each feature is shown on its own row.
df_feature_3.head(3).T

transaction_id,1,2,3
session_id,31,1,35
product_id,2,2,3
amount,21.77,109.48,62.49
YEAR(transaction_time),2014,2014,2014
session.customer_id,2,2,3
session.device,mobile,desktop,mobile


# おわりに
- だいぶ挙動は理解してきた
- pd.mergeとかを連発することを考えるとかなり強力だ
    - ってか、素で書くのたぶんすぐにできないよ