# 基本設定

In [1]:
import setting

import pandas as pd
import numpy as np
import math
import pathlib as Path

import scipy as sc
import sklearn
import pickle
import matplotlib.pyplot as plt
import japanize_matplotlib
import seaborn as sb
# Directory holding the input/output pickles.
# NOTE: `Path` is the pathlib *module* here (imported as `import pathlib as Path`),
# hence the `Path.Path(...)` spelling.
data_path = Path.Path('../data')
# Prefix attached to the files this notebook writes.
prefix = 'ana102'

# Use a Japanese-capable font so seaborn/matplotlib can render Japanese labels.
sb.set(font='IPAexGothic')

# データ抽出

In [2]:
# Settlement (receipt) records.
_receipt_pkl = data_path.joinpath('df_receipt.pkl')
df_receipt = pd.read_pickle(_receipt_pkl)

# Customer master carrying the name-merged customer id (customer_id_nys).
_customer_pkl = data_path.joinpath('ana101_df_customer_nys.pkl')
df_customer = pd.read_pickle(_customer_pkl)

# 名寄せ

In [3]:
# Attach the name-merged customer id to every receipt row.
# Inner join: receipts whose customer_id is absent from df_customer are dropped.
_customer_keys = df_customer[['customer_id', 'customer_id_nys']]
df = pd.merge(df_receipt, _customer_keys, on='customer_id', how='inner')

# データ加工

In [4]:
# Derive the settlement year-month string ('%Y%m') from the settlement date.
# The date column is stringified first and parsed with pandas' format inference.
_sales_date = pd.to_datetime(df['sales_ymd'].astype('str'))
df['sales_ym'] = _sales_date.dt.strftime('%Y%m')

In [5]:
# Total sales amount per (name-merged customer id, settlement year-month).
_monthly_amount = df.groupby(['customer_id_nys', 'sales_ym'])['amount']
_rec = _monthly_amount.agg(sum_amt='sum').reset_index()
_rec

Unnamed: 0,customer_id_nys,sales_ym,sum_amt
0,CS001113000004,201903,1298
1,CS001114000005,201805,438
2,CS001114000005,201907,188
3,CS001115000010,201712,2320
4,CS001115000010,201807,146
...,...,...,...
29408,CS051212000001,201910,336
29409,CS051513000004,201907,551
29410,CS051515000002,201910,265
29411,CS052212000002,201910,192


# 観測期間と予測期間の設定

In [6]:
def _period_flag(ym):
    """Classify a year-month value by analysis period.

    Returns 1 when `ym` is one of the values of `setting.ym_aft`
    (prediction period), 0 when it is one of the values of
    `setting.ym_bef` (observation period), and NaN otherwise
    (month outside the analysis scope).
    """
    if ym in setting.ym_aft.values():
        return 1
    if ym in setting.ym_bef.values():
        return 0
    return np.nan


# Nullable integer dtype so out-of-scope months can stay missing (NA).
_rec['flg_predict'] = _rec['sales_ym'].apply(_period_flag).astype('Int64')

# 目的変数の設定
> 利用金額合計が、観測期間に対して予測期間で増加している先が1、変化なしまたは減少している先が0

In [7]:
# Total amount per customer for the observation (0) and prediction (1) periods.

# Drop rows with missing values, i.e. months outside the analysis scope.
_rec_in_scope = _rec.dropna()

# One row per customer, one column per period flag, summing the monthly totals.
_period_totals = _rec_in_scope.pivot_table(
    index='customer_id_nys',
    columns='flg_predict',
    values='sum_amt',
    aggfunc='sum',
)

# A customer with no usage in a period gets 0 for that period.
_tmp2 = _period_totals.fillna(0).astype('Int64')

In [8]:
# Rename the pivoted period columns: flag 0 -> 'sum_samt_0', flag 1 -> 'sum_samt_1'.
# NOTE: this renames in place and then resets the index, so re-running this cell
# without re-running the pivot cell first would rename the columns a second time.
_period_columns = ['sum_samt_%s' % flag for flag in _tmp2.columns]
_tmp2.columns = _period_columns
_tmp2 = _tmp2.reset_index()

# Ratio of the prediction-period total to the observation-period total.
# A zero observation-period total yields inf (visible in the displayed output).
_obs_total = _tmp2['sum_samt_0']
_pred_total = _tmp2['sum_samt_1']
_tmp2['rat_samt'] = np.divide(_pred_total, _obs_total)

# Target flag: 1 when the prediction period exceeds the observation period, else 0.
_grew = _tmp2['rat_samt'] > 1
_tmp2['flg_sts'] = np.where(_grew, 1, 0)
_tmp2

Unnamed: 0,customer_id_nys,sum_samt_0,sum_samt_1,rat_samt,flg_sts
0,CS001114000005,0,438,inf,1
1,CS001115000010,2320,146,0.062931,0
2,CS001205000004,850,436,0.512941,0
3,CS001205000006,0,2851,inf,1
4,CS001212000027,448,0,0.0,0
...,...,...,...,...,...
7155,CS046615000004,0,190,inf,1
7156,CS047211000001,0,751,inf,1
7157,CS049115000001,0,846,inf,1
7158,CS049513000008,0,1130,inf,1


# 分析対象外データを除外
> ・観測期間の利用金額合計が0円\
> ・予測期間の利用金額合計が0円\
> 上記いずれかに該当する先、および非会員（顧客IDが「Z」で始まる先）を分析対象外として除外する。

In [9]:
# Exclude customers that are out of scope for the analysis.

# Non-members: ids starting with 'Z'.
_is_member = ~_tmp2['customer_id_nys'].str.startswith('Z')

# Require a non-zero total in BOTH periods. The totals are tested directly
# instead of comparing rat_samt with inf / 0: a customer with 0 in both
# periods has ratio 0/0, which is neither inf nor 0 and would otherwise
# slip through the filter.
_has_both_periods = (_tmp2['sum_samt_0'] != 0) & (_tmp2['sum_samt_1'] != 0)

# Largest increase first, for inspection.
_tmp3 = _tmp2[_is_member & _has_both_periods].sort_values('rat_samt', ascending=False)
_tmp3

Unnamed: 0,customer_id_nys,sum_samt_0,sum_samt_1,rat_samt,flg_sts
3659,CS019415000156,140,9276,66.257143,1
6666,CS038415000123,202,7940,39.306931,1
5142,CS028515000004,348,11047,31.744253,1
4308,CS023515000153,249,7033,28.24498,1
4210,CS023414000011,178,4672,26.247191,1
...,...,...,...,...,...
1366,CS006415000147,9887,346,0.034995,0
2641,CS013515000164,5584,188,0.033668,0
2957,CS015515000034,10818,358,0.033093,0
4730,CS026414000059,7828,158,0.020184,0


# データ保存

In [10]:
# Persist the customer id and target flag, ordered by customer id,
# as '<prefix>_base.pkl' under data_path.
_base = _tmp3[['customer_id_nys', 'flg_sts']].sort_values('customer_id_nys')
_base_pkl = data_path / ('%s_base.pkl' % prefix)
_base.to_pickle(_base_pkl)