In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import seaborn as sns
# seabornのデフォルトパラメータに設定するコマンド
sns.set()

# 訓練データと検証データに分ける関数
from sklearn.model_selection import train_test_split 
# 混同行列, 予測正解率を求める関数
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score

In [2]:
%matplotlib inline

In [3]:
tran_blocks = pd.read_csv('C:/Users/hiroshi_kuriyama/Desktop/新人研修/08_データ分析演習1/01_データ/analysis/transactions_last_blocks_with_mem_logs_median.txt', engine='python')

In [4]:
# 日付型に変換
tran_blocks['initial_date'] = pd.to_datetime(tran_blocks['initial_date'])
tran_blocks['last_date'] = pd.to_datetime(tran_blocks['last_date'])
tran_blocks['registration_init_time'] = pd.to_datetime(tran_blocks['registration_init_time'])

In [5]:
# プラン料金に対して実際に支払った金額の割合

tran_blocks['paid_ratio'] = tran_blocks['paid_mean'] / tran_blocks['price_mean']
# プラン料金が0のときは0を代入
tran_blocks['paid_ratio'] = tran_blocks['paid_ratio'].replace(np.inf, 0)
tran_blocks['paid_ratio'] = tran_blocks['paid_ratio'].replace(np.NaN, 0)

In [6]:
# membersデータから特徴量作成
#city == 1 かどうか
tran_blocks['is_city_1'] = (tran_blocks['city'] == 1) * 1


#gender == male かどうか
#gender == female かどうか
# ※NULLはともに0のレコード
tran_blocks['is_male'] = (tran_blocks['gender'] == 1) * 1
tran_blocks['is_gen_null'] = tran_blocks['gender'].isnull() * 1


#registered_via をダミー変数に置き換える
ctgl_registered_via = pd.get_dummies(tran_blocks['registered_via'], drop_first=True, prefix='is_registered_via')
tran_blocks = pd.concat([tran_blocks, ctgl_registered_via], axis=1)

#method_mode をダミー変数に置き換える
ctgl_registered_via = pd.get_dummies(tran_blocks['method_mode'], drop_first=True, prefix='is_method_mode')
tran_blocks = pd.concat([tran_blocks, ctgl_registered_via], axis=1)

In [7]:
# tran_blocks.dtypes

In [8]:
tran_blocks.describe(include='all')

Unnamed: 0,is_churn,is_cancel,cumul_purchase_num,cumul_valid_days,return_point,transaction_freq,valid_days,price_mean,paid_mean,plan_days_mean,...,is_method_mode_32,is_method_mode_33,is_method_mode_34,is_method_mode_35,is_method_mode_36,is_method_mode_37,is_method_mode_38,is_method_mode_39,is_method_mode_40,is_method_mode_41
count,185896.0,185896.0,185896.0,185896.0,185896.0,185896.0,185896.0,185896.0,185896.0,185896.0,...,185896.0,185896.0,185896.0,185896.0,185896.0,185896.0,185896.0,185896.0,185896.0,185896.0
unique,,,,,,,,,,,...,,,,,,,,,,
top,,,,,,,,,,,...,,,,,,,,,,
freq,,,,,,,,,,,...,,,,,,,,,,
first,,,,,,,,,,,...,,,,,,,,,,
last,,,,,,,,,,,...,,,,,,,,,,
mean,0.760377,0.019473,3.782066,158.248139,0.175778,1.52666,85.558081,244.053719,258.738506,56.453284,...,0.065623,0.016617,0.028489,0.215911,0.094273,0.03226,0.243539,0.071406,0.081244,0.052298
std,0.426855,0.138182,5.106224,238.94598,0.323684,2.2047,133.733644,418.531125,423.525001,98.021804,...,0.247622,0.127831,0.166366,0.411454,0.292209,0.17669,0.429219,0.257502,0.273211,0.222628
min,0.0,0.0,1.0,-85.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,1.0,0.0,0.0,1.0,30.0,0.0,74.5,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# 欠損値があるか確認
#tran_blocks.isnull().any() 

In [10]:
#tran_blocks.columns

In [11]:
# 乱数の種を設定
np.random.seed(10) # 訓練データとテストデータに分ける際に, データの再現性を得られるようにする

# 特徴量と目的変数に分ける
X_kk = tran_blocks[['is_cancel', 'cumul_purchase_num', 'transaction_freq',
       'valid_days', 'price_mean', 'paid_mean', 'plan_days_mean',
       'plan_days_mode', 'price_mode', 
       'total_secs_per_day', 'num_unq_per_day', 'on_ratio', 'off_days_before_expire',
        'paid_ratio', 'is_city_1',  'is_male', 'is_gen_null',
       'is_registered_via_4', 'is_registered_via_7',
       'is_registered_via_9', 'is_registered_via_13',
        'is_method_mode_20',
       'is_method_mode_22',
       'is_method_mode_28', 'is_method_mode_29',
       'is_method_mode_30', 'is_method_mode_31', 'is_method_mode_32',
       'is_method_mode_33', 'is_method_mode_34', 'is_method_mode_35',
       'is_method_mode_36', 'is_method_mode_37', 'is_method_mode_38',
       'is_method_mode_39', 'is_method_mode_40', 'is_method_mode_41']]
y_kk = tran_blocks['is_churn']

# 特徴量名を取り出しておく
feature_x_kk = X_kk.columns
feature_y_kk = y_kk.name

# 学習データと評価データに分ける
X_kk_train, X_kk_test, y_kk_train, y_kk_test = train_test_split(X_kk, y_kk, test_size=0.2, random_state=0)
print("shape of X and y for training: ", X_kk_train.shape, y_kk_train.shape)
print("shape of X and y for testing: ", X_kk_test.shape, y_kk_test.shape)

shape of X and y for training:  (148716, 37) (148716,)
shape of X and y for testing:  (37180, 37) (37180,)


In [12]:
# 離反、再購読の割合を1:1にする
kk_test = pd.concat([y_kk_test, X_kk_test], axis=1)
kk_test_return = kk_test.query('is_churn == 0')
kk_test_return_num = len(kk_test_return)
kk_test_churn_sam = kk_test.query('is_churn == 1').sample(n=kk_test_return_num, random_state=0)
kk_test_sam = pd.concat([kk_test_return, kk_test_churn_sam], axis=0)

X_kk_test = kk_test_sam.iloc[:,1:]
y_kk_test = kk_test_sam.iloc[:,:1]

y_kk_test.mean()

is_churn    0.5
dtype: float64

### 分類木

In [13]:
from sklearn.tree import DecisionTreeClassifier
# 以下, 決定木の可視化のための関数
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO
from IPython.display import Image
import pydotplus

ModuleNotFoundError: No module named 'pydotplus'

* DecisionTreeClassifier:
    * random_state: 乱数のシード
    * max_depth: 木の深さ

In [None]:
# 学習
clf_dt_kk = DecisionTreeClassifier(random_state = 0, max_depth = 3)
clf_dt_kk.fit(X=X_kk_train, y=y_kk_train)

In [None]:
# 予測
mat_dt_kk_prob = clf_dt_kk.predict_proba(X_kk_test) # 予測確率
y_dt_kk_pred = clf_dt_kk.predict(X_kk_test) # 予測クラス

In [None]:
# 分類木をプロット
dot_data = StringIO()
export_graphviz(decision_tree = clf_dt_kk, 
                out_file = dot_data, 
                feature_names = feature_x_kk,
                class_names = ["churn", "returned"])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

In [None]:
# 混同行列・正解率確認
cm_dt_kk = confusion_matrix(y_kk_test, y_dt_kk_pred)
acc_dt_kk = accuracy_score(y_kk_test, y_dt_kk_pred)
f1_dt_kk = f1_score(y_kk_test, y_dt_kk_pred)
rec_dt_kk = recall_score(y_kk_test, y_dt_kk_pred)
prec_dt_kk = precision_score(y_kk_test, y_dt_kk_pred)
print("confusion matrix: \n", cm_dt_kk)
print("accuracy: \n", acc_dt_kk)
print("f1: \n", f1_dt_kk)
print("recall: \n", rec_dt_kk)
print("precision: \n", prec_dt_kk)

### ロジスティック回帰

In [14]:
import statsmodels.api as sm
from scipy import stats
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)
# 学習
logit = sm.Logit(y_kk_train, X_kk_train)
result = logit.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.280579
         Iterations 9


0,1,2,3
Dep. Variable:,is_churn,No. Observations:,148716.0
Model:,Logit,Df Residuals:,148679.0
Method:,MLE,Df Model:,36.0
Date:,"Mon, 18 Jun 2018",Pseudo R-squ.:,0.4905
Time:,12:16:25,Log-Likelihood:,-41727.0
converged:,True,LL-Null:,-81902.0
,,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
is_cancel,0.9681,0.067,14.388,0.000,0.836,1.100
cumul_purchase_num,-0.1865,0.002,-80.956,0.000,-0.191,-0.182
transaction_freq,-0.1736,0.010,-17.844,0.000,-0.193,-0.155
valid_days,0.0025,0.000,9.042,0.000,0.002,0.003
price_mean,-0.0124,0.001,-15.335,0.000,-0.014,-0.011
paid_mean,0.0080,0.000,17.399,0.000,0.007,0.009
plan_days_mean,0.0266,0.003,8.849,0.000,0.021,0.033
plan_days_mode,-0.0059,0.002,-2.412,0.016,-0.011,-0.001
price_mode,0.0007,0.001,1.177,0.239,-0.000,0.002


In [15]:
from sklearn.linear_model import LogisticRegression

* LogisticRegression
    * random_state: 乱数のシード
    * C: 正則化項。C が小さいほど正則化が強くなる。今回は不要なため, Cを十分を大きくして正則化を無効にする。

In [16]:
# 学習
reg_logi_kk = LogisticRegression(random_state=0, C=100000000)
reg_logi_kk.fit(X=X_kk_train, y=y_kk_train)

LogisticRegression(C=100000000, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=0,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [17]:
# 学習結果
reg_logi_kk.coef_

array([[ 2.33145969e-01, -1.98411335e-01, -8.41798018e-02,
         1.09461899e-03, -1.30872782e-02,  1.00547789e-02,
         2.16903230e-02, -1.04689175e-03, -7.36666549e-04,
         3.52811477e-06, -4.56295530e-03, -2.43160028e-01,
         1.72479071e-02, -5.95514161e-01,  3.98523248e-01,
         1.84089479e-01, -5.04920796e-02, -3.40126957e-01,
         3.05898794e-01,  3.12812393e-01, -2.86081651e-01,
         1.64155323e-01,  4.92713974e-02,  3.60756895e-01,
        -5.35748621e-02, -8.77694651e-01,  5.78135070e-02,
        -1.66819638e-01,  7.40933476e-02,  1.49343107e-01,
         2.05858088e+00, -3.08621185e-01, -7.87319371e-01,
         9.35469244e-01,  1.39442309e+00, -1.65500967e+00,
         5.19295515e-01]])

In [18]:
# 予測
mat_logi_kk_prob = reg_logi_kk.predict_proba(X_kk_test) # 予測確率
y_logi_kk_pred = reg_logi_kk.predict(X_kk_test) # 予測クラス

In [19]:
# 混同行列・正解率確認
cm_logi_kk = confusion_matrix(y_kk_test, y_logi_kk_pred)
acc_logi_kk = accuracy_score(y_kk_test, y_logi_kk_pred)
f1_logi_kk = f1_score(y_kk_test, y_logi_kk_pred)
rec_logi_kk = recall_score(y_kk_test, y_logi_kk_pred)
prec_logi_kk = precision_score(y_kk_test, y_logi_kk_pred)
print("confusion matrix: \n", cm_logi_kk)
print("accuracy: \n", acc_logi_kk)
print("f1: \n", f1_logi_kk)
print("recall: \n", rec_logi_kk)
print("precision: \n", prec_logi_kk)

confusion matrix: 
 [[5689 3210]
 [ 421 8478]]
accuracy: 
 0.7959883132936285
f1: 
 0.8236265604507698
recall: 
 0.952691313630745
precision: 
 0.7253593429158111


## ランダムフォレスト

In [24]:
from sklearn.ensemble import RandomForestClassifier

* RandomForestRegressor: 
    * random_state: 乱数のシード
    * n_estimators: 木の数

In [25]:
reg_rf_kk = RandomForestClassifier(random_state=0, n_estimators=500)
reg_rf_kk.fit(X=X_kk_train, y=y_kk_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [26]:
# 予測
y_rf_kk_pred = reg_rf_kk.predict(X_kk_test)

In [27]:
# 混同行列・正解率確認
cm_rf_kk = confusion_matrix(y_kk_test, y_rf_kk_pred)
acc_rf_kk = accuracy_score(y_kk_test, y_rf_kk_pred)
f1_rf_kk = f1_score(y_kk_test, y_rf_kk_pred)
rec_rf_kk = recall_score(y_kk_test, y_rf_kk_pred)
prec_rf_kk = precision_score(y_kk_test, y_rf_kk_pred)
print("confusion matrix: \n", cm_rf_kk)
print("accuracy: \n", acc_rf_kk)
print("f1: \n", f1_rf_kk)
print("recall: \n", rec_rf_kk)
print("precision: \n", prec_rf_kk)

confusion matrix: 
 [[6573 2326]
 [ 420 8479]]
accuracy: 
 0.8457130014608383
f1: 
 0.8606374340235485
recall: 
 0.952803685807394
precision: 
 0.784729291994447


# LightGBM

In [28]:
import lightgbm as lgb

In [29]:
# 学習
clf_lgb_kk = lgb.LGBMClassifier()
clf_lgb_kk.fit(X_kk_train, y_kk_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        learning_rate=0.1, max_depth=-1, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
        n_jobs=-1, num_leaves=31, objective=None, random_state=None,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=1)

In [30]:
y_lgb_kk_pred = clf_lgb_kk.predict(X_kk_test)

  if diff:


In [31]:
# 混同行列・正解率確認
cm_lgb_kk = confusion_matrix(y_kk_test, y_lgb_kk_pred)
acc_lgb_kk = accuracy_score(y_kk_test, y_lgb_kk_pred)
f1_lgb_kk = f1_score(y_kk_test, y_lgb_kk_pred)
rec_lgb_kk = recall_score(y_kk_test, y_lgb_kk_pred)
prec_lgb_kk = precision_score(y_kk_test, y_lgb_kk_pred)
print("confusion matrix: \n", cm_lgb_kk)
print("accuracy: \n", acc_lgb_kk)
print("f1: \n", f1_lgb_kk)
print("recall: \n", rec_lgb_kk)
print("precision: \n", prec_lgb_kk)

confusion matrix: 
 [[6613 2286]
 [ 392 8507]]
accuracy: 
 0.8495336554669064
f1: 
 0.8640056875888686
recall: 
 0.9559501067535678
precision: 
 0.7881960529973131


# XGBoost

In [32]:
import xgboost as xgb

ModuleNotFoundError: No module named 'xgboost'

In [None]:
# 学習
clf_xgb_kk = xgb.XGBClassifier()
clf_xgb_kk.fit(X_kk_train, y_kk_train)

In [None]:
# 予測値
y_xgb_kk_pred = clf_xgb_kk.predict(X_kk_test)

In [None]:
# 混同行列・正解率確認
cm_xgb_kk = confusion_matrix(y_kk_test, y_xgb_kk_pred)
acc_xgb_kk = accuracy_score(y_kk_test, y_xgb_kk_pred)
f1_xgb_kk = f1_score(y_kk_test, y_xgb_kk_pred)
rec_xgb_kk = recall_score(y_kk_test, y_xgb_kk_pred)
prec_xgb_kk = precision_score(y_kk_test, y_xgb_kk_pred)
print("confusion matrix: \n", cm_xgb_kk)
print("accuracy: \n", acc_xgb_kk)
print("f1: \n", f1_xgb_kk)
print("recall: \n", rec_xgb_kk)
print("precision: \n", prec_xgb_kk)