データを読み込む

In [None]:
import pandas as pd

In [None]:
# Read the usage log CSV and preview its first rows.
use_log_path = './input/03/use_log.csv'
use_log = pd.read_csv(use_log_path)
use_log.head(n=5)

In [None]:
# Read the customer master CSV and preview its first rows.
customer_master_path = './input/03/customer_master.csv'
customer_master = pd.read_csv(customer_master_path)
customer_master.head(n=5)

In [None]:
# Read the class master CSV and preview its first rows.
class_master_path = './input/03/class_master.csv'
class_master = pd.read_csv(class_master_path)
class_master.head(n=5)

In [None]:
# Read the campaign master CSV and preview its first rows.
campaign_master_path = './input/03/campaign_master.csv'
campaign_master = pd.read_csv(campaign_master_path)
campaign_master.head(n=5)

顧客情報を可視化

In [None]:
# Left-join the campaign and class masters onto the customer master so that
# every customer row is kept even when a lookup key has no match.
customer_join = (
    customer_master
    .merge(campaign_master, on='campaign_id', how='left')
    .merge(class_master, on='class', how='left')
)
customer_join.head()

欠損の確認

In [None]:
# Count missing values per column of the joined customer table.
customer_join.isna().sum()

退会していない顧客の`end_date`は`NaN`なので、欠損値として数えられる

クラス単位で顧客を分類  
可視化を行う

In [None]:
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline

import seaborn as sns
# Collect the distinct names of the TTF fonts known to matplotlib's font manager.
fonts = {font.name for font in matplotlib.font_manager.fontManager.ttflist}
print(fonts)

In [None]:

# Number of customer_id entries per gender, printed and drawn as a bar chart.
gender_class = customer_join.groupby('gender')['customer_id'].count()
print(gender_class)
gender_class.plot.bar()
plt.show()

In [None]:
# Histogram of the gender column drawn with seaborn.
sns.histplot(data=customer_join, x='gender')

In [None]:
# Number of customer_id entries per is_deleted value.
customer_join.groupby('is_deleted')['customer_id'].count()

In [None]:
# Number of customer_id entries per class_name.
customer_join.groupby('class_name')['customer_id'].count()

In [None]:
# Number of customer_id entries per campaign_name.
customer_join.groupby('campaign_name')['customer_id'].count()

In [None]:
# Number of customer_id entries per gender.
customer_join.groupby('gender')['customer_id'].count()

最新のユーザ数を集計

In [None]:
# Parse both membership date columns, then count the customers whose
# start_date is strictly after 2018-04-01.
for date_column in ('start_date', 'end_date'):
    customer_join[date_column] = pd.to_datetime(customer_join[date_column])
joined_after_cutoff = customer_join['start_date'] > pd.to_datetime('20180401')
customer_start = customer_join.loc[joined_after_cutoff]
print(len(customer_start))

In [None]:
# Number of customer_id entries per end_date (rows with a missing end_date are not grouped).
customer_join.groupby('end_date')['customer_id'].count()

In [None]:
# Number of customer_id entries per start_date.
customer_join.groupby('start_date')['customer_id'].count()

最新顧客データで集計を行う  
今の顧客データから`2019/3/31`時点に在籍している顧客のみに限定

In [None]:
# Keep customers whose end_date is on/after 2019-03-31 or is missing,
# then show how many remain and which end_date values they carry.
reference_date = pd.to_datetime('20190331')
ends_on_or_after_reference = customer_join['end_date'] >= reference_date
has_no_end_date = customer_join['end_date'].isna()
customer_current = customer_join.loc[ends_on_or_after_reference | has_no_end_date]
print(len(customer_current))
customer_current['end_date'].unique()

最新の顧客情報の集計

In [None]:
# Number of current customers (customer_id entries) per gender.
customer_current.groupby('gender')['customer_id'].count()

In [None]:
# Number of current customers (customer_id entries) per campaign_name.
customer_current.groupby('campaign_name')['customer_id'].count()

In [None]:
# Number of current customers (customer_id entries) per class_name.
customer_current.groupby('class_name')['customer_id'].count()

利用履歴データを使用する

月別の`平均値`、`中央値`、`最大値`、`最小値`を集計してみる

In [None]:
# Print the column dtypes and non-null counts of the usage log.
use_log.info()

In [None]:
# Display the usage log frame as this cell's output.
use_log

In [None]:
# Parse usedate and derive a YYYYMM key in the '年月' column.
use_log['usedate'] = pd.to_datetime(use_log['usedate'])
use_log['年月'] = use_log['usedate'].dt.strftime("%Y%m")

# Count log rows per (年月, customer_id); the log_id count becomes 'count'
# and the redundant usedate count is dropped.
use_log_month = (
    use_log
    .groupby(['年月', 'customer_id'], as_index=False)
    .count()
    .rename(columns={'log_id': 'count'})
    .drop(columns='usedate')
)
use_log_month.head()

In [None]:
# Display the per-month, per-customer usage counts.
use_log_month

ここから顧客毎に月の利用回数の`平均値`、`中央値`、`最大値`、`最小値`を集計する

In [None]:
# Per-customer statistics of the monthly usage count.
# - Select 'count' BEFORE aggregating: aggregating every column would also
#   try to take the mean/median of the string '年月' column, which recent
#   pandas versions reject instead of silently skipping.
# - reset_index returns a new frame, so its result must be kept; this turns
#   customer_id back into a regular column.
use_log_customer = (
    use_log_month
    .groupby('customer_id')['count']
    .agg(['mean', 'median', 'max', 'min'])
    .reset_index(drop=False)
)
use_log_customer.info()
use_log_customer

その顧客が定期利用者かどうかを判断するフラグを作成  
毎週同じ曜日に利用しているかどうかで判断する  
`0`から`6`で、月曜から日曜に対応する

In [None]:
# Tag each log row with its weekday number, then count log rows per
# (customer_id, 年月, weekday); the log_id count is exposed as 'count'.
use_log['weekday'] = use_log['usedate'].dt.weekday
weekday_keys = ['customer_id', '年月', 'weekday']
use_log_weekday = (
    use_log
    .groupby(weekday_keys, as_index=False)
    .count()[weekday_keys + ['log_id']]
    .rename(columns={'log_id': 'count'})
)

In [None]:
# Display the per-customer, per-month, per-weekday usage counts.
use_log_weekday

`AS002855`は2018年4月の土曜日(5)に4回ジムに来ており、5月にも土曜日に4回来ている。

In [None]:
# Reduce to one row per customer holding the largest per-month, per-weekday count.
use_log_weekday = (
    use_log_weekday
    .groupby('customer_id', as_index=False)
    .max()[['customer_id', 'count']]
)
# routine_flag stays 0 while count < 4 and becomes 1 otherwise.
not_below_four = ~(use_log_weekday['count'] < 4)
use_log_weekday['routine_flag'] = not_below_four.astype('int64')

In [None]:
# Display the per-customer maximum count together with routine_flag.
use_log_weekday

`use_log_weekday`と`use_log_customer`を結合して`customer_join`を作成する

In [None]:
# Attach the routine flag and the monthly usage statistics to each customer;
# left joins keep customers that have no matching usage rows.
customer_join = (
    customer_join
    .merge(use_log_weekday, on='customer_id', how='left')
    .merge(use_log_customer, on='customer_id', how='left')
)
customer_join

In [None]:
# Count missing values per column after the usage features were merged in.
customer_join.isna().sum()

会員期間の計算  
まだ退会していない会員は`end_date`が入ってないので、`2019-04-30`として計算する

In [None]:
from dateutil.relativedelta import relativedelta


def _whole_months_between(start, end):
    """Return the number of whole months from ``start`` to ``end``.

    Uses ``relativedelta`` so that month-end dates are handled exactly as
    dateutil defines them.
    """
    delta = relativedelta(end, start)
    return delta.years * 12 + delta.months


# Customers without an end_date are measured up to 2019-04-30.
customer_join['calc_date'] = customer_join['end_date'].fillna(pd.to_datetime('20190430'))

# Build the whole column at once and assign it in a single step. Writing
# through customer_join['membership_period'].iloc[i] is chained assignment:
# it emits SettingWithCopyWarning and has no effect under pandas Copy-on-Write.
customer_join['membership_period'] = pd.Series(
    [
        _whole_months_between(start, end)
        for start, end in zip(customer_join['start_date'], customer_join['calc_date'])
    ],
    index=customer_join.index,
    dtype='int64',
)

In [None]:
# Display the customer table including calc_date and membership_period.
customer_join

顧客行動の統計量をはかる

In [None]:
# Summary statistics of the per-customer monthly usage columns.
usage_stat_columns = ['mean', 'median', 'max', 'min']
customer_join[usage_stat_columns].describe()

In [None]:
# Number of customer_id entries per routine_flag value.
customer_join.groupby('routine_flag')['customer_id'].count()

定期的に利用している顧客の方がはるかに多い

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Histogram of membership_period drawn with matplotlib.
membership_months = customer_join['membership_period']
plt.hist(membership_months)

In [None]:
# Histogram of membership_period drawn with seaborn.
sns.histplot(data=customer_join['membership_period'])

10ヶ月以下の加入顧客が多く、10ヶ月以上になると横ばいになる。つまり、短期で離れていく顧客が多い一方で、続ける顧客は長く加入していることを示す。

In [None]:
# Customers that have an end_date, summarised with describe().
has_end_date = customer_join['end_date'].notna()
customer_end = customer_join.loc[has_end_date]
customer_end.describe()

In [None]:
# Customers without an end_date, summarised with describe().
lacks_end_date = customer_join['end_date'].isna()
customer_stay = customer_join.loc[lacks_end_date]
customer_stay.describe()

In [None]:
# Write the enriched customer table to CSV without the index column.
customer_join_output_path = './input/03/customer_join.csv'
customer_join.to_csv(customer_join_output_path, index=False)