## 031 データを読み込んで確認する

In [None]:
import pandas as pd

# Read the usage log CSV and show, per column, how many values are missing.
use_log_csv = './input/100knoks/03/use_log.csv'
uselog = pd.read_csv(use_log_csv)
uselog.isnull().sum()

In [None]:
# Read the customer CSV and show, per column, how many values are missing.
customer_csv = './input/100knoks/03/customer_join.csv'
customer = pd.read_csv(customer_csv)
customer.isnull().sum()

## 032 クラスタリングで顧客をグループ化

In [None]:
# Take an independent copy of the columns used as clustering features so that
# later edits to customer_clustering do not touch `customer`.
clustering_features = ['mean', 'median', 'max', 'min', 'membership_period']
customer_clustering = customer.loc[:, clustering_features].copy()
customer_clustering.head()

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Standardize every clustering feature; the scaler is fitted on the same
# data it then transforms.
sc = StandardScaler()
sc.fit(customer_clustering)
customer_clustering_sc = sc.transform(customer_clustering)
customer_clustering_sc

In [None]:
# Split the standardized customers into 4 clusters with k-means.
# random_state fixes the seed. n_init is pinned explicitly because
# scikit-learn changed its default from 10 to 'auto' in version 1.4; leaving
# it implicit makes the result depend on the installed library version.
kmeans = KMeans(n_clusters=4, random_state=0, n_init=10)
# fit() returns the fitted estimator itself, so `clusters` is the same object
# as `kmeans`.
clusters = kmeans.fit(customer_clustering_sc)
# labels_ holds one cluster id per input row, in row order.
customer_clustering['cluster'] = clusters.labels_
print(customer_clustering['cluster'].unique())
customer_clustering.head()

## 033 クラスタリング結果の分析

In [None]:
# Replace the column labels positionally with Japanese display names
# (monthly mean / median / max / min, membership period); 'cluster' is kept.
japanese_labels = ['月内平均', '月内中央値', '月内最大値', '月内最小値', '会員期間', 'cluster']
customer_clustering.columns = japanese_labels

# Number of non-null values in each column, per cluster.
by_cluster = customer_clustering.groupby('cluster')
by_cluster.count()

In [None]:
# Average of every feature column within each cluster.
cluster_means = customer_clustering.groupby('cluster').mean()
cluster_means

### 解析
グループ0: 会員期間が長いが利用率が高いグループ  
グループ1: 会員期間が最も長い  
グループ2: 会員期間が短く利用率も低いグループ  
グループ3: 会員期間が平均的に長く、利用率も平均的なグループ

## 034  クラスタリング結果の可視化
主成分分析を行い次元を削減し2次元での可視化を行う。

In [None]:
from sklearn.decomposition import PCA

# Project the standardized features onto their first two principal components.
X = customer_clustering_sc
pca = PCA(n_components=2)
pca.fit(X)
x_pca = pca.transform(X)

# The component columns are labelled 0 and 1; the cluster id is attached by
# index alignment with customer_clustering.
pca_df = pd.DataFrame(x_pca).assign(cluster=customer_clustering['cluster'])
pca_df

In [None]:
import seaborn as sns

# Plot principal component 0 against component 1, coloured by cluster id.
pca_plot_args = dict(data=pca_df, x=0, y=1, hue='cluster')
sns.scatterplot(**pca_plot_args)

In [None]:
import seaborn as sns

# NOTE(review): the 'Ricty' font is presumably chosen so the Japanese axis
# labels render; it has to be installed on the machine -- confirm.
sns.set(font='Ricty')

# Membership period (x) against monthly mean usage (y), coloured by cluster id.
feature_plot_args = dict(data=customer_clustering, x='会員期間', y='月内平均', hue='cluster')
sns.scatterplot(**feature_plot_args)

## 035 クラスタリング結果から退会顧客の傾向を把握

In [None]:
# Put the full customer table next to the clustering columns (axis=1, rows
# matched on index). This rebinds customer_clustering, so running the cell a
# second time would append the customer columns again.
customer_clustering = pd.concat([customer_clustering, customer], axis=1)

# Number of non-null customer_id values for each (cluster, is_deleted) pair.
deleted_group_keys = ['cluster', 'is_deleted']
deleted_counts = customer_clustering.groupby(deleted_group_keys, as_index=False).count()
deleted_counts[['cluster', 'is_deleted', 'customer_id']]

In [None]:
# Number of non-null customer_id values for each (cluster, routine_flg) pair.
routine_group_keys = ['cluster', 'routine_flg']
routine_counts = customer_clustering.groupby(routine_group_keys, as_index=False).count()
routine_counts[['cluster', 'routine_flg', 'customer_id']]

## 036 翌月の利用回数予測を行う準備

In [None]:
# Parse the usage dates and derive a 'YYYYMM' year-month key ('年月').
uselog['usedate'] = pd.to_datetime(uselog['usedate'])
uselog['年月'] = uselog['usedate'].dt.strftime("%Y%m")

# count() tallies the non-null values per (year-month, customer) pair; the
# log_id tally is kept as 'count' and the redundant usedate tally is dropped.
monthly_tally = uselog.groupby(['年月', 'customer_id'], as_index=False).count()
uselog_months = monthly_tally.rename(columns={'log_id': 'count'}).drop(columns='usedate')
uselog_months

In [None]:
# Year-month keys in order of first appearance in uselog_months.
year_months = list(uselog_months['年月'].unique())
print(year_months)

# For every month that has six earlier months in year_months, build one row
# per customer active in that month:
#   count_pred           -- usage count in the target month
#   count_0 ... count_5  -- usage counts of the 1st ... 6th month before it
# The left merge leaves NaN where a customer has no row for an earlier month.
#
# Frames are collected in a list and concatenated once: calling pd.concat
# inside the loop re-copies the accumulated rows on every iteration and
# starts from an empty DataFrame, which newer pandas versions warn about.
monthly_frames = []
for i in range(6, len(year_months)):
    tmp = uselog_months.loc[uselog_months['年月'] == year_months[i]].copy()
    tmp.rename(columns={'count': 'count_pred'}, inplace=True)
    for j in range(1, 7):
        tmp_before = uselog_months.loc[uselog_months['年月'] == year_months[i-j]].copy()
        del tmp_before['年月']
        tmp_before.rename(columns={'count': "count_{}".format(j-1)}, inplace=True)
        tmp = pd.merge(tmp, tmp_before, on='customer_id', how='left')
    monthly_frames.append(tmp)

# pd.concat raises on an empty list, so fall back to an empty frame when
# fewer than seven months of data are available.
if monthly_frames:
    predict_date = pd.concat(monthly_frames, ignore_index=True)
else:
    predict_date = pd.DataFrame()
predict_date.head()

### 6ヶ月以上利用者に限定

In [None]:
# Discard rows with any missing value (i.e. a missing count for one of the six
# previous months) and renumber the remaining rows from 0.
predict_date = predict_date.dropna().reset_index(drop=True)
predict_date.head()

In [None]:
# Attach each customer's start_date; the left join keeps every predict_date row.
start_dates = customer[['customer_id', 'start_date']]
predict_date = predict_date.merge(start_dates, on='customer_id', how='left')
predict_date.head()

In [None]:
from dateutil.relativedelta import relativedelta


def _whole_months_between(later, earlier):
    """Return the number of whole months from *earlier* to *later*."""
    delta = relativedelta(later, earlier)
    return delta.years * 12 + delta.months


# '年月' is a 'YYYYMM' string, so now_date is the first day of the target month.
predict_date['now_date'] = pd.to_datetime(predict_date['年月'], format="%Y%m")
predict_date['start_date'] = pd.to_datetime(predict_date['start_date'])

# Membership length in whole months at the target month.
# Building the column in one pass avoids a per-row .loc write, does not depend
# on the index being exactly 0..n-1, and produces an integer column rather
# than an object column seeded with None.
predict_date['period'] = [
    _whole_months_between(now, start)
    for now, start in zip(predict_date['now_date'], predict_date['start_date'])
]
predict_date.head()

## 038 来月の利用回数予測モデルを作成


In [None]:
# Keep only rows whose customer started on or after 2018-04-01.
start_cutoff = pd.to_datetime('20180401')
started_on_or_after_cutoff = predict_date['start_date'] >= start_cutoff
predict_date = predict_date.loc[started_on_or_after_cutoff]
predict_date.head()

In [None]:
from sklearn import linear_model
import sklearn.model_selection

# Linear regression of the target-month usage count (count_pred) on the six
# previous monthly counts and the membership period.
model = linear_model.LinearRegression()
X = predict_date[['count_0', 'count_1', 'count_2', 'count_3', 'count_4', 'count_5', 'period']]
y = predict_date['count_pred']
# random_state pins the shuffle so the split -- and therefore the scores and
# coefficients reported below -- are the same on every run, as is already the
# case for the KMeans step.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, random_state=0
)
model.fit(X_train, y_train)


In [None]:
# score() is the coefficient of determination (R^2); print it for the training
# split first, then for the held-out split.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(train_score)
print(test_score)

## 039 モデルに寄与している変数を確認

In [None]:
# One row per input feature, paired with its fitted regression coefficient.
coef = pd.DataFrame(dict(feature_names=X.columns, coefficient=model.coef_))
coef

In [None]:
# Two hand-written feature rows, in the column order the model was fitted on:
# count_0 ... count_5 followed by period.
x_pred = [
    [3, 4, 4, 6, 8, 7, 8],
    [2, 2, 3, 3, 4, 6, 8],
]
x1, x2 = x_pred

In [None]:
# Predicted usage count for each of the sample rows.
predicted_counts = model.predict(x_pred)
predicted_counts

In [None]:
# Save the per-month, per-customer usage counts as CSV, without the index column.
use_log_months_csv = './input/100knoks/03/use_log_months.csv'
uselog_months.to_csv(use_log_months_csv, index=False)