In [1]:
# データ分析用のライブラリ
import pandas as pd
import numpy as np

#前処理用ライブラリ
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

# 機械学習モデル関連ライブラリ
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn import neighbors

# モデル評価関連ライブラリ
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, auc, roc_curve

In [2]:
# カラムのみが格納されたテーブルを読み込み

df = pd.read_excel('visit_access_log_190217.xlsx')
df

Unnamed: 0,visit_id,gv_jptop_flg,top_banner_click_flg,gv_bikelineup_flg,gv_bike_contents_flg,gv_bike_detail_flg,gv_welcab_flg,gv_gr_flg,gv_customize_flg,gv_webcatalog_flg,...,buy_used_bike_count,buy_new_bike_flg,buy_used_bike_flg,total_visit_count_before_buy,total_gv_bike_count_before_buy,landing_page_scroll_rate,landing_gv_time,is_buy_flg,is_before_buy_flg,is_after_buy_flg


In [3]:
'''カラム名末尾が「_flg」となっているカラムのみ抽出し、リストに格納する'''
flg_df = df.filter(regex='_flg$').drop(['is_before_buy_flg','is_after_buy_flg'], axis=1)
flg_col_list = flg_df.columns.tolist()

In [4]:
len(flg_col_list)

62

## 乱数を使用したデータフレームを作成

In [5]:
# 乱数を使用して10000×62のデータフレームを作成
np.random.seed(0)

log_df = pd.DataFrame(np.random.randint(low=0, high=2, size=(10000, 62)), columns=flg_col_list)
log_df['visit_id'] = range(len(log_df))

display(log_df.shape, log_df.head())

(10000, 63)

Unnamed: 0,gv_jptop_flg,top_banner_click_flg,gv_bikelineup_flg,gv_bike_contents_flg,gv_bike_detail_flg,gv_welcab_flg,gv_gr_flg,gv_customize_flg,gv_webcatalog_flg,gv_dop_childseat_flg,...,cv_tradein_simulation_flg,cv_maker_catalog_entry_flg,cv_dealer_catalog_entry_flg,cv_test_drive_flg,cv_talk_reservation_flg,cv_adv_talk_reservation_flg,buy_new_bike_flg,buy_used_bike_flg,is_buy_flg,visit_id
0,0,1,1,0,1,1,1,1,1,1,...,1,1,0,1,0,0,1,1,0,0
1,1,0,1,0,0,0,0,0,1,1,...,0,1,0,1,0,1,1,1,1,1
2,1,0,1,1,1,1,0,1,1,0,...,0,0,0,1,0,1,0,1,0,2
3,0,0,0,0,1,0,0,1,0,0,...,0,0,1,0,1,1,1,1,0,3
4,0,0,1,1,1,0,1,1,1,1,...,0,1,0,0,1,1,1,1,1,4


# k-meansを実行
参考：https://qiita.com/maskot1977/items/34158d044711231c4292

In [6]:
def execute_kmeans(df, num, s_col_num, e_col_num, new_col):
    '''
    関数内容
    ・k-meansを実行し、実行結果を新規作成カラムに格納する関数
    Input
    ・df：データフレーム
    ・num：クラスタ数
    ・s_col_num：データフレームの列の左端の列番号
    ・e_col_num：データフレームの列の右端の列番号
    ・new_col：k-meansの実行結果を格納するカラム名
    
    '''
    
    from sklearn.cluster import KMeans
    
    pred = KMeans(n_clusters=num).fit_predict(df.iloc[:, s_col_num:e_col_num])

    df[new_col] = pred
    
    return df

In [7]:
new_log_df = execute_kmeans(log_df, 6, 0, -1, 'cluster')

new_log_df.head(2)

Unnamed: 0,gv_jptop_flg,top_banner_click_flg,gv_bikelineup_flg,gv_bike_contents_flg,gv_bike_detail_flg,gv_welcab_flg,gv_gr_flg,gv_customize_flg,gv_webcatalog_flg,gv_dop_childseat_flg,...,cv_maker_catalog_entry_flg,cv_dealer_catalog_entry_flg,cv_test_drive_flg,cv_talk_reservation_flg,cv_adv_talk_reservation_flg,buy_new_bike_flg,buy_used_bike_flg,is_buy_flg,visit_id,cluster
0,0,1,1,0,1,1,1,1,1,1,...,1,0,1,0,0,1,1,0,0,4
1,1,0,1,0,0,0,0,0,1,1,...,1,0,1,0,1,1,1,1,1,2


In [8]:
new_log_df['cluster'].value_counts()

2    1715
5    1703
1    1663
4    1652
0    1641
3    1626
Name: cluster, dtype: int64

## 説明変数と目的変数を設定

In [9]:
X = new_log_df.drop(['cluster'], axis=1)
y = new_log_df['cluster']

## 学習データとテストデータに分割

In [10]:
# X[train_index]→X.loc[train_index]に変更すればエラーなく実行可能に
# https://stackoverflow.com/questions/51091132/pandas-and-scikit-learn-keyerror-not-in-index
stratifiedKFold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

for train_index, test_index in stratifiedKFold.split(X, y):
    train_X, test_X = X.loc[train_index], X.loc[test_index]
    train_y, test_y = y.loc[train_index], y.loc[test_index]

## 学習
ランダムフォレストにおける各パラメータの意味はhttps://data-science.gr.jp/implementation/iml_sklearn_random_forest.html

In [11]:
model = RandomForestClassifier(n_estimators=50, max_depth=10, random_state=0).fit(train_X, train_y)

## 予測

In [12]:
pred_y = model.predict(test_X)
pred_y[0:5]

array([5, 3, 2, 2, 4])

In [13]:
# 正解率
model.score(test_X, test_y)

0.6676676676676677

## 混同行列を算出

In [14]:
confusion_matrix(test_y, pred_y)

array([[226,  15,  24,  13,  27,  23],
       [ 22, 205,  42,  13,  31,  19],
       [ 12,  22, 262,  10,  15,  22],
       [ 18,  24,  26, 208,  24,  25],
       [ 22,  38,  35,  20, 201,  14],
       [ 22,  16,  27,  21,  22, 232]], dtype=int64)

## 正解率・適合率・再現率・F値を算出

In [15]:
# 適合率・検出率・F値をまとめて表示するsklearn.metrics.classification_report
# 出力部分の「0」～「5」はラベル、「support」は「各ラベルのデータ数」を意味する
print(classification_report(test_y, pred_y))

              precision    recall  f1-score   support

           0       0.70      0.69      0.70       328
           1       0.64      0.62      0.63       332
           2       0.63      0.76      0.69       343
           3       0.73      0.64      0.68       325
           4       0.63      0.61      0.62       330
           5       0.69      0.68      0.69       340

   micro avg       0.67      0.67      0.67      1998
   macro avg       0.67      0.67      0.67      1998
weighted avg       0.67      0.67      0.67      1998



##### 参考：https://github.com/hwpwk/Frequent-expression-in-Python/blob/680bd8cb5f818bb7bc756ed70529b32e85422340/%E7%96%91%E4%BC%BC%E3%82%A2%E3%82%AF%E3%82%BB%E3%82%B9%E3%83%AD%E3%82%B0%E3%81%8B%E3%82%89%E4%BA%88%E6%B8%AC%E3%83%A2%E3%83%87%E3%83%AB%E4%BD%9C%E6%88%90_190217.ipynb