# 基本設定

In [1]:
import setting

import pandas as pd
import numpy as np
import math
import pathlib as Path

import scipy as sc
import sklearn
import pickle
import matplotlib.pyplot as plt
import japanize_matplotlib
import seaborn as sb
# Apply seaborn's theme with IPAexGothic as the font family
# (presumably so that Japanese labels render correctly — confirm).
sb.set(font='IPAexGothic')

# Directory that holds the input and output data files.
# NOTE: `Path` is the pathlib *module* here (imported as `import pathlib as Path`),
# so `Path.Path` is the pathlib.Path class.
data_path = Path.Path('../data')
# Prefix used when naming the file this notebook writes.
prefix = 'ana301'

# データ抽出
> 軸データ（ana103_base_smpl.pkl）\
> 決済加工情報（ana201_fix_rec.pkl）

In [2]:
# Base ("axis") table that the other data is joined onto.
# NOTE: read_pickle executes whatever the pickle contains; only load trusted files.
base = pd.read_pickle(data_path / 'ana103_base_smpl.pkl')
# Processed payment (settlement) records.
df = pd.read_pickle(data_path / 'ana201_fix_rec.pkl')

In [3]:
# Show the (rows, columns) size of the base table.
base.shape

(3364, 3)

# データ結合

In [4]:
# Left (outer) join of the payment records onto the base table: every row of
# `base` is kept, and base customers without a match get NaN in the record columns.
# reset_index() turns df's index into ordinary columns before merging
# (presumably that is where customer_id_nys lives — confirm against the ana201 output).
_df = pd.merge(
    base,
    df.reset_index(),
    how='left',
    on='customer_id_nys',
)

In [5]:
# Sanity check: number of distinct customers after the join
# (the recorded output, 3364, equals the row count of `base`).
_df['customer_id_nys'].nunique()

3364

# 特徴量作成

In [6]:
# Keep only the rows whose sales_ym is one of the values in setting.ym_bef
# (the observation period). Rows with a missing sales_ym never match and are dropped.
# NOTE(review): a base customer with no record in the period therefore has no row
# in _rec — confirm that downstream steps expect this.
_rec = _df.loc[_df['sales_ym'].isin(setting.ym_bef.values())]
_rec

Unnamed: 0,customer_id_nys,flg_sts,flg_train,sales_ym,sales_date,sum_amt,sum_qnt
0,CS005415000212,0,1,201701,3,188,2
1,CS005415000212,0,1,201701,99,188,2
2,CS005415000212,0,1,201703,1,338,2
3,CS005415000212,0,1,201703,3,526,2
4,CS005415000212,0,1,201703,99,864,4
...,...,...,...,...,...,...,...
43209,CS014414000069,0,0,201704,99,273,2
43210,CS014414000069,0,0,201706,3,455,2
43211,CS014414000069,0,0,201706,99,455,2
43212,CS014414000069,0,0,201712,6,335,2


## 顧客×購入曜日（全体）ごとに集約統計量を計算

In [7]:
# Aggregation helper
def f_grp(x, col):
    """Summarise one column of a group as a dict of named statistics.

    Parameters
    ----------
    x : mapping of column name to pandas Series (e.g. a DataFrame group)
        Object indexed with ``x[col]`` to obtain the values to summarise.
    col : str
        Name of the column to summarise; it is also embedded in the result keys.

    Returns
    -------
    dict
        Keys, in this order: ``FTR_avg_<col>``, ``FTR_sum_<col>``,
        ``FTR_min_<col>``, ``FTR_max_<col>``, ``FTR_rng_<col>`` (max - min)
        and ``FTR_std_<col>`` (``Series.std()``, which is NaN when the
        group has a single row).
    """
    values = x[col]
    lowest = values.min()
    highest = values.max()

    # (key template, statistic) pairs; the order here fixes the key order of the dict.
    stats = [
        ('FTR_avg_%s', values.mean()),
        ('FTR_sum_%s', values.sum()),
        ('FTR_min_%s', lowest),
        ('FTR_max_%s', highest),
        ('FTR_rng_%s', highest - lowest),
        ('FTR_std_%s', values.std()),
    ]
    return {template % col: value for template, value in stats}

In [8]:
# One row per (customer_id_nys, sales_date) pair holding the f_grp statistics of
# sum_amt followed by those of sum_qnt. NaN results (e.g. the standard deviation
# of a single-row group) are replaced with 0.
ftr_rec = (
    _rec
    .groupby(['customer_id_nys', 'sales_date'])
    .apply(lambda grp: pd.Series({**f_grp(grp, 'sum_amt'), **f_grp(grp, 'sum_qnt')}))
    .fillna(0)
)

In [9]:
# Pivot to one row per customer: the innermost index level (sales_date) moves
# into the columns, and combinations that did not occur are filled with 0.
_tmp = (
    ftr_rec
    .unstack(level=-1)
    .fillna(0)
)

In [10]:
# Flatten the two-level (statistic name, sales_date value) column labels into
# single strings of the form '<statistic name>_<sales_date value>'.
# NOTE: re-running this cell after the columns are already flat will fail,
# because each label is then a plain string rather than a 2-tuple.
_tmp.columns = [
    '_'.join((stat_name, str(date_code)))
    for stat_name, date_code in _tmp.columns
]
_tmp

Unnamed: 0_level_0,FTR_avg_sum_amt_0,FTR_avg_sum_amt_1,FTR_avg_sum_amt_2,FTR_avg_sum_amt_3,FTR_avg_sum_amt_4,FTR_avg_sum_amt_5,FTR_avg_sum_amt_6,FTR_avg_sum_amt_99,FTR_sum_sum_amt_0,FTR_sum_sum_amt_1,...,FTR_rng_sum_qnt_6,FTR_rng_sum_qnt_99,FTR_std_sum_qnt_0,FTR_std_sum_qnt_1,FTR_std_sum_qnt_2,FTR_std_sum_qnt_3,FTR_std_sum_qnt_4,FTR_std_sum_qnt_5,FTR_std_sum_qnt_6,FTR_std_sum_qnt_99
customer_id_nys,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CS001115000010,0.0,0.0,0.0,2320.0,0.0,0.0,0.0,2320.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
CS001205000004,0.0,0.0,0.0,850.0,0.0,0.0,0.0,850.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
CS001214000009,1474.5,0.0,0.0,198.0,0.0,0.0,0.0,1573.5,2949.0,0.0,...,0.0,1.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.707107
CS001214000048,0.0,0.0,0.0,283.0,0.0,0.0,0.0,283.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
CS001215000005,286.0,0.0,0.0,0.0,0.0,0.0,0.0,286.0,286.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CS043415000010,1285.0,0.0,1328.0,0.0,0.0,0.0,0.0,1306.5,1285.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
CS043514000010,0.0,0.0,0.0,0.0,1100.0,0.0,0.0,1100.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
CS044414000004,228.0,0.0,0.0,0.0,0.0,0.0,0.0,228.0,228.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
CS044415000001,0.0,0.0,0.0,2535.0,0.0,0.0,0.0,2535.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000


# データ保存

In [11]:
# Save the customer-level feature table as '<prefix>_ftr_rec.pkl' under data_path.
_tmp.to_pickle(data_path / ('%s_ftr_rec.pkl' % prefix))