In [1]:
import pandas as pd
import numpy as np

# 可視化用のライブラリ
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

#前処理用ライブラリ
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

# 機械学習モデル関連ライブラリ
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn import neighbors


# モデル評価関連ライブラリ
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, auc, roc_curve

In [2]:
df = pd.read_pickle('obento_int_type.pkl')
df.head()

Unnamed: 0,y,soldout,temperature,year,month,day
0,90,0,19.8,2013,11,18
1,101,1,17.0,2013,11,19
2,118,0,15.5,2013,11,20
3,120,1,15.2,2013,11,21
4,130,1,16.1,2013,11,22


## 学習データとテストデータに分割

In [3]:
# 順番注意：train_X, test_X, train_y, test_y
# [\](バックスラッシュ)はMacの場合[option(alt)]+[¥]を入力
train_X, test_X, train_y, test_y = \
train_test_split(df.drop(['soldout'], axis=1),\
                 df['soldout'], test_size=0.2, random_state=0)
print(train_X.shape)
print(test_X.shape)
print(train_y.shape)
print(test_y.shape)

(165, 5)
(42, 5)
(165,)
(42,)



## モデル作成
今回はランダムフォレストを使用し、パラメータ調整はしない



In [4]:
clf = RandomForestClassifier(n_jobs=-1, n_estimators=30)# 今回はmax_depth=30を除外

## 交差検証
学習データをさらに学習データと検証データに分割 検証データをパラメータ調整用にする(
今回はパラメータ調整しない)


http://www.procrasist.com/entry/10-cross-validation

scoringの引数に与えることができる評価尺度一覧

Valid options are ['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 
                   'completeness_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 
                   'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'mutual_info_score', 'neg_log_loss', 
                   'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 
                   'neg_median_absolute_error', 'normalized_mutual_info_score', 'precision', 'precision_macro', 'precision_micro', 
                   'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 
                   'recall_weighted', 'roc_auc', 'v_measure_score']

In [5]:
# 層化k分割交差検証
stratifiedkfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

scores = cross_validate(clf, train_X, train_y, cv=stratifiedkfold, scoring=['precision', 'recall', 'accuracy', 'f1'])
scores_df = pd.DataFrame(scores)
scores_df.head()



Unnamed: 0,fit_time,score_time,test_accuracy,test_f1,test_precision,test_recall,train_accuracy,train_f1,train_precision,train_recall
0,0.14129,0.455303,0.5,0.451613,0.4375,0.466667,0.984733,0.982456,1.0,0.965517
1,0.150057,0.49996,0.647059,0.647059,0.578947,0.733333,0.992366,0.991304,1.0,0.982759
2,0.159358,0.475732,0.424242,0.24,0.3,0.2,0.992424,0.991304,1.0,0.982759
3,0.189585,0.490374,0.5625,0.5,0.5,0.5,1.0,1.0,1.0,1.0
4,0.142177,0.472961,0.53125,0.347826,0.444444,0.285714,1.0,1.0,1.0,1.0


### 本来はおそらくここでグリッドサーチ
モデルのパラメータ調整

## 学習

In [6]:
clf.fit(train_X, train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

## 予測

In [7]:
pred_y = clf.predict(test_X)
pred_y

array([1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], dtype=int64)

## 正解率・適合率・再現率・F値を算出


In [8]:
# 混同行列を出力

confusion_matrix(test_y, pred_y)
tn, fp, fn, tp = confusion_matrix(test_y, pred_y).ravel()
# (tp, fn, fp, tn)に順番を変更、それを行列にする
print(np.array([[tp, fn], [fp, tn]]))
print('--------------------------------')
# 二次元配列の場合はnumpy.matrix関数でも表現可能
print(np.matrix([[tp, fn], [fp, tn]]))

[[ 7 13]
 [10 12]]
--------------------------------
[[ 7 13]
 [10 12]]


In [9]:
# 小数第2位まで出力
print('正解率は%.2fです。' % accuracy_score(test_y, pred_y) )
print('適合率は%.2fです。' % precision_score(test_y, pred_y))
print('再現率は%.2fです。' %recall_score(test_y, pred_y))
print('F値は: %.2fです。' % f1_score(test_y, pred_y))

正解率は0.45です。
適合率は0.41です。
再現率は0.35です。
F値は: 0.38です。


In [10]:
# 適合率・検出率・F値をまとめて表示するsklearn.metrics.classification_report
# 出力部分の「0」「1」は今回の場合「ネガティブ」「ポジティブ」という意味、「support」は「正解ラベルのデータの数」を意味する
print(classification_report(test_y, pred_y))

             precision    recall  f1-score   support

          0       0.48      0.55      0.51        22
          1       0.41      0.35      0.38        20

avg / total       0.45      0.45      0.45        42

