# 基本設定

In [1]:
import setting

import pandas as pd
import numpy as np
import math
import pathlib as Path

import scipy as sc
import sklearn
import pickle
import matplotlib.pyplot as plt
import japanize_matplotlib
import seaborn as sb
sb.set(font='IPAexGothic')

# データ格納先設定
data_path = Path.Path('../data')
# 接頭辞
prefix = 'ana201'

# データ抽出
> 決済情報（df_receipt.pkl）\
> 商品情報（df_product）\
> カテゴリ情報（df_category)\
> 顧客情報（名寄せ用）\

In [2]:
# 決済情報
df_receipt = pd.read_pickle(data_path / 'df_receipt.pkl')

# 商品情報
df_product = pd.read_pickle(data_path / 'df_product.pkl')

# カテゴリ情報
df_category = pd.read_pickle(data_path / 'df_category.pkl')

# 顧客情報（名寄せ）
df_customer = pd.read_pickle(data_path / 'ana101_df_customer_nys.pkl')

# データ結合

In [3]:
# 決済 + （商品 + カテゴリ）
df = df_receipt[

    # 非会員を除外
    ~df_receipt['customer_id'].str.startswith('Z')

].merge(
    
    # 商品コード + カテゴリコード(category_small_cd)
    df_product[['product_cd', 'category_small_cd']].merge(
        
        # カテゴリコード + カテゴリ名
        df_category[['category_small_cd', 'category_major_name', 'category_medium_name', 'category_small_name', ]],
        on='category_small_cd',
        how='inner'
        
    # カテゴリコードは要らない
    ).drop('category_small_cd', axis=1),
    
    on='product_cd',
    how='left'
    
)
df.shape

(65682, 12)

# 名寄せ

In [4]:
df_nys = df.merge(df_customer[['customer_id', 'customer_id_nys']], on='customer_id', how='inner')

# データ加工

In [5]:
# 決済年月日をdate型に変換
df_nys['sales_ymd'] = pd.to_datetime(df_nys['sales_ymd'].astype('str'))

# 決済年月を計算
df_nys['sales_ym'] = df_nys['sales_ymd'].dt.strftime('%Y%m')

# 決済曜日を計算
# 月～日（0 ~ 6）
df_nys['sales_date'] = df_nys['sales_ymd'].dt.weekday

df_nys.head()

Unnamed: 0,sales_ymd,sales_epoch,store_cd,receipt_no,receipt_sub_no,customer_id,product_cd,quantity,amount,category_major_name,category_medium_name,category_small_name,customer_id_nys,sales_ym,sales_date
0,2018-11-03,1541203200,S14006,112,1,CS006214000001,P070305012,1,158,瓶詰缶詰,和風調味料,砂糖・甘味料,CS006214000001,201811,5
1,2017-05-09,1494288000,S14006,112,1,CS006214000001,P071401004,1,1100,瓶詰缶詰,タバコ,タバコ,CS006214000001,201705,1
2,2017-06-08,1496880000,S14006,112,1,CS006214000001,P060104021,1,120,パン・乳製品,パン,その他パン,CS006214000001,201706,3
3,2017-06-08,1496880000,S14006,112,2,CS006214000001,P080403001,1,175,菓子,ビスケット,油菓子,CS006214000001,201706,3
4,2018-10-28,1540684800,S14006,112,2,CS006214000001,P050102004,1,188,和日配,水物,油揚げ・がんも,CS006214000001,201810,6


## 顧客×年月×購入曜日ごとに集約

In [6]:
%%time
# 集約関数を定義
def f_grp(x):
    d = {}
    
    # 売上金額合計
    d['sum_amt'] = x['amount'].sum()
    # 購入商品数
    d['sum_qnt'] = x['quantity'].sum()
    # 決済回数
    # d['cnt_pay'] = x['customer_id_nys'].count()
    
    return pd.Series(d)
    
_rec  = pd.concat(
    [
        # 顧客×年月×曜日
        df_nys.groupby(['customer_id_nys', 'sales_ym', 'sales_date']).apply(f_grp).reset_index(),
        # 顧客×年月
        df_nys.groupby(['customer_id_nys', 'sales_ym']).apply(f_grp).reset_index()
        
    ],axis=0
    
# 顧客×年月の曜日は全体合計
).fillna({'sales_date':99}).astype({'sales_date':'Int64'}).sort_values(['customer_id_nys', 'sales_ym', 'sales_date'])

CPU times: user 31 s, sys: 646 ms, total: 31.7 s
Wall time: 30.3 s


In [7]:
# 顧客単位に横持
fix_rec = _rec.set_index(['customer_id_nys', 'sales_ym', 'sales_date'])
fix_rec

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sum_amt,sum_qnt
customer_id_nys,sales_ym,sales_date,Unnamed: 3_level_1,Unnamed: 4_level_1
CS001113000004,201903,4,1298,2
CS001113000004,201903,99,1298,2
CS001114000005,201805,3,438,2
CS001114000005,201805,99,438,2
CS001114000005,201907,2,188,2
...,...,...,...,...
CS051515000002,201910,99,265,2
CS052212000002,201910,3,192,2
CS052212000002,201910,99,192,2
CS052514000001,201908,3,178,2


# データ保存

In [8]:
fix_rec.to_pickle(data_path / ('%s_fix_rec.pkl' % prefix))