# ランダムフォレスト調整
- 特徴量の重要度に基づく特徴量選択
- ハイパーパラメータの調整

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load the Digit Recognizer training CSV into a DataFrame.
train_csv = '~/kaggle_data/02digit_recognizer/train.csv'
data = pd.read_csv(train_csv)

In [6]:
# Column 0 is used as the target; every remaining column is a feature.
feature_frame = data.iloc[:, 1:]
target_series = data.iloc[:, 0]
X = feature_frame.values
y = target_series.values

- 特徴量の重要度にアクセス

In [29]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=100,random_state=0,n_jobs=-1)
feat_labels = data.columns[1:]
%time forest.fit(X,y)
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]

CPU times: user 23.4 s, sys: 84.4 ms, total: 23.5 s
Wall time: 24.1 s


In [16]:
# Print every feature as "rank,column name,importance", most important first.
# The %-formatting is applied inside the print() call: the original
# `print('...') % (...)` only works as a Python 2 print statement and raises
# TypeError on Python 3, where print() returns None.
feat_labels = data.columns[1:]
for rank, idx in enumerate(indices, start=1):
    print('%2d,%s,%.30f' % (rank, feat_labels[idx], importances[idx]))

 1,pixel350,0.010150603137186043137263347091
 2,pixel409,0.009815821391601902562440429278
 3,pixel433,0.008984217911958372604352973667
 4,pixel461,0.008838468596647474848082559618
 5,pixel378,0.008756761048077382411647384686
 6,pixel406,0.008339446877201939917201301000
 7,pixel210,0.007386499870016638089054161753
 8,pixel488,0.007317019841156389460545828030
 9,pixel377,0.007187484074922760764192464222
10,pixel155,0.007090265351055360237841895810
11,pixel375,0.007071027788463274399433089457
12,pixel346,0.006952469737110884666353438632
13,pixel291,0.006367131307605130435822715640
14,pixel347,0.006270633664443161364721035511
15,pixel515,0.006241156642774648051485986144
16,pixel318,0.006108155631369205709746417199
17,pixel489,0.005932823858791483502550523355
18,pixel437,0.005894932667892831615330528905
19,pixel405,0.005884119105418829202835961922
20,pixel430,0.005651768389112303732357833752
21,pixel152,0.005643998079489422753374228137
22,pixel569,0.005623490302464261188231109401
23,pixel51

- 659番目以降の特徴量は重要度が0。
- 重要度が0だったカラムは、全データ値が同じカラムなので、これらは外す。

In [26]:
# Keep only the columns whose importance is strictly positive.
nonzero_importance = importances > 0.
X_new = X[:, nonzero_importance]
X_new.shape

(42000, 658)

- 特徴量を約130列削減して、658列にした。

# gridsearch
- GridSearchしたいけど、あんまりパラメータを複雑にすると時間が掛かり過ぎるでしょう。
- n_estimators=100で23秒でした。
- 5分以内に終わらせたかったら、CV=5のときパラメータは3つくらいしか探索できない。
- 数字ごとのデータ個数に偏りがなかったら、1万件くらいに減らす

In [50]:
# Number of samples per digit, divided by 4 to approximate the per-digit
# counts in a quarter-size subsample.
label_counts = data['label'].value_counts().sort_index()
label_counts / 4.

0    1033.00
1    1171.00
2    1044.25
3    1087.75
4    1018.00
5     948.75
6    1034.25
7    1100.25
8    1015.75
9    1047.00
Name: label, dtype: float64

- ランダムに抽出して、各数字が1000個前後あればよしとしてしまいましょう

In [43]:
# Not the intended use of train_test_split, but it is a quick way to draw a
# random 25% subsample (test_size=0.75 leaves a quarter of the rows in *_train).
try:
    # scikit-learn >= 0.18
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases, where the helper lived in cross_validation.
    from sklearn.cross_validation import train_test_split
X_train,X_test,y_train,ytest = train_test_split(X_new,y,test_size=0.75,random_state=0)

In [45]:
# Number of subsample rows for each label value.
digit_counts = np.bincount(y_train)
digit_counts

array([1040, 1167, 1058, 1060,  994,  940, 1087, 1103,  997, 1054])

- 良い感じ！

In [58]:
from sklearn.grid_search import GridSearchCV
forest = RandomForestClassifier(random_state=0)
param_grid = {'n_estimators':[50,100,150],
             'max_depth': [5,10,15,20,30]
              }
gs = GridSearchCV(forest,
                 param_grid=param_grid,
                 cv=5,n_jobs=-1)

%time gs.fit(X_train,y_train)

print('Best score: %.3f') % gs.best_score_
print('Best parameters:%s') % gs.best_params_

CPU times: user 4min 6s, sys: 1.23 s, total: 4min 7s
Wall time: 4min 55s
Best score: 0.948
Best parameters:{'n_estimators': 150, 'max_depth': 20}


- 予想通りおおよそ5分で終えられているね。
- もういっちょ

In [59]:
from sklearn.grid_search import GridSearchCV
forest = RandomForestClassifier(random_state=0)
param_grid = {'n_estimators':[150,200],
             'max_depth': [18,19,20,21,22]
              }
gs = GridSearchCV(forest,
                 param_grid=param_grid,
                 cv=5,n_jobs=-1)

%time gs.fit(X_train,y_train)

print('Best score: %.3f') % gs.best_score_
print('Best parameters:%s') % gs.best_params_

CPU times: user 5min 35s, sys: 1.41 s, total: 5min 37s
Wall time: 6min 11s
Best score: 0.948
Best parameters:{'n_estimators': 200, 'max_depth': 20}


- n_estimators=200, max_depth=20で提出してみましょう！

In [60]:
# Train the model used for the submission on all rows of the reduced feature matrix.
final_params = dict(n_estimators=200, max_depth=20, n_jobs=-1, random_state=0)
forest = RandomForestClassifier(**final_params)
forest.fit(X_new, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [64]:
# Load the test CSV and keep the same columns that were kept for training
# (those with strictly positive importance).
test = pd.read_csv('~/kaggle_data/02digit_recognizer/test.csv')
kept_columns = importances > 0.
X_test = test.values[:, kept_columns]
# Row ids for the submission: 1, 2, ..., number of test rows.
ImageId = np.arange(1, len(test) + 1)

In [65]:
# Predict a label for every row of the column-filtered test matrix.
predictions = forest.predict(X_test)

In [66]:
# Assemble the ImageId/Label table and write it to CSV without the index column.
submission_columns = {'ImageId': ImageId, 'Label': predictions}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)

スコア0.96629で722位まであがりました！うしっ！

ただ、絶対伸び悩むね。うん。

あと3〜4回ぐらい、別の分類器でわちゃわちゃやって、すげー人の記事でも読むか。