In [36]:
import numpy as np
import pandas as pd
from typing import List, Any, Union, Dict
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
import pprint
from sklearn.ensemble import ExtraTreesClassifier
import joblib

# CSVファイルからデータを読み込む

In [2]:
CSV_PATH = "../out/csv/"

In [3]:
def CsvToDf(csv_list: List) -> pd.DataFrame:
    df = pd.concat([pd.read_csv(CSV_PATH + filename) for filename in csv_list])
    return df

In [4]:
# 読み込むCSVファイルのリスト
csv_list = ["features_ch0.csv"]
df = CsvToDf(csv_list)

In [5]:
# session1のデータの抽出(式を評価するengineとしてnumexprを使用することで、処理の高速化を狙う。)
train_set = df.query('session_id == "trial1"', engine='numexpr')
test_set = df.query('session_id == "trial2"', engine='numexpr')

# データのクリーニング

In [6]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    '''
    DataFrameから、属性を選択し、Numpy Arrayへ変換する変換器。
    '''

    def __init__(self, attribute_names: List) -> None:
        self.attribute_names = attribute_names

    def fit(self, X: pd.DataFrame, y=None):
        return self

    def transform(self, X: pd.DataFrame) -> np.ndarray:
        return X[self.attribute_names].to_numpy()

In [7]:
class ValueTransducer(BaseEstimator, TransformerMixin):
    '''
    値を他の値に変換する。
    '''

    def __init__(self, vals) -> None:
        self.vals = vals

    def fit(self, X: np.ndarray, y=None):
        return self

    def transform(self, X: np.ndarray) -> np.ndarray:
        X_ = []
        # Trueになる値->1, それ以外->0
        for val in self.vals:
            x_ = np.where(X == val, 1, 0)
            X_.append(x_)
        # ndarrayに変換
        X_ = np.array(X_)

        # 1次元配列にまとめる
        label_list = []
        for i in range(len(X)):
            bool = X_[:, i].any()
            label_list.append(int(bool))
        # ndarrayに変換
        label_np = np.array(label_list)

        return label_np

## 特徴量抽出パイプライン

In [8]:
# 特徴量に該当する列の抽出
feature_attribs = ['low_power', 'high_power', 'hlbr', 'coe1[0]',	'coe1[1]', 'coe3[0]', 'coe3[1]', 'coe3[2]',	'coe3[3]', 'ratio_max_to_10ms_ave_peaks', 'ratio_max_to_9th_ave_peaks', 'ac_std', 'ac_auc', 'diff_std', 'diff_auc',	'srmr', 'gp_max_val_std', 'gp_max_val_range',
                       'gp_max_val_min',	'gp_max_val_max',	'gp_max_val_mean', 'gp_max_ix_std',	'gp_max_ix_range', 'gp_max_ix_min',	'gp_max_ix_max', 'gp_max_ix_mean', 'gp_auc_std', 'gp_auc_range', 'gp_auc_min', 'gp_auc_max', 'gp_auc_mean',	'tdoa_std', 'tdoa_range', 'tdoa_min', 'tdoa_max', 'tdoa_mean']

In [9]:
# pipeline
features_pipeline = Pipeline([
    ('selector', DataFrameSelector(feature_attribs)),
    ('imputer', SimpleImputer(strategy="median")),
#     ('min_max_scaler', MinMaxScaler())
])

In [10]:
X_train = features_pipeline.fit_transform(train_set)

In [11]:
X_test = features_pipeline.fit_transform(test_set)

## ラベル抽出パイプライン

In [12]:
# 正解値labelを生成するpipeline
label_attrib = ['dov_angle']
facing_dov_angles = [0, 45, 315]

In [13]:
label_pipeline = Pipeline([
    ('selector', DataFrameSelector(label_attrib)),
    ('transducer', ValueTransducer(facing_dov_angles))
])

In [14]:
y_train = label_pipeline.fit_transform(train_set)
y_test = label_pipeline.fit_transform(test_set)

In [15]:
type(y_test)

numpy.ndarray

# 訓練

## モデルの微調整

In [16]:
X_train.shape[0]

5760

### Grid Search

In [17]:
from sklearn.model_selection import GridSearchCV

In [18]:
param_grid = [
]

### Random Search

In [19]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

#### ペースティング

In [24]:
# ハイパーパラメータの設定
param_distribs_paceting = {
    'n_estimators': randint(low=1, high=800),
    'min_samples_split': randint(low=2, high=10),
    'min_samples_leaf': randint(low=1, high=10),
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [False],
    'n_jobs': [-1],
    'random_state': [42],
    'max_samples': uniform()
}

In [22]:
# バギングでの探索
ext_clf_bagging = ExtraTreesClassifier()
ext_rnd_search_bagging = RandomizedSearchCV(ext_clf_bagging, param_distributions=param_distribs_bagging, n_iter=100, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)

In [23]:
ext_rnd_search_bagging.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=ExtraTreesClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [False],
                                        'max_features': ['sqrt', 'log2', None],
                                        'max_samples': <scipy.stats._distn_infrastructure.rv_frozen object at 0x13f699d90>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x13f6f70d0>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10b35b7c0>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x13f6ef0a0>,
                                        'n_jobs': [-1], 'random_state': [42]},
                   random_state=42, scoring='accuracy')

In [32]:
# ハイパーパラメータの組み合わせを確認する
cvres_paceting = ext_rnd_search_bagging.cv_results_
pd.DataFrame(cvres_paceting)[:10]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bootstrap,param_max_features,param_max_samples,param_min_samples_leaf,param_min_samples_split,param_n_estimators,...,param_random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.135326,0.019287,0.016046,0.003662,False,,0.796543,8,6,21,...,42,"{'bootstrap': False, 'max_features': None, 'ma...",0.600694,0.729167,0.726562,0.727431,0.717014,0.700174,0.049919,3
1,5.59687,0.126102,0.185945,0.03801,False,,0.445833,7,4,459,...,42,"{'bootstrap': False, 'max_features': None, 'ma...",0.578993,0.727431,0.726562,0.742188,0.713542,0.697743,0.060064,22
2,4.502934,0.124129,0.296484,0.04356,False,sqrt,0.601115,8,4,662,...,42,"{'bootstrap': False, 'max_features': 'sqrt', '...",0.639757,0.71875,0.734375,0.71875,0.678819,0.69809,0.03448,14
3,0.536934,0.055558,0.071699,0.005301,False,sqrt,0.96991,6,3,192,...,42,"{'bootstrap': False, 'max_features': 'sqrt', '...",0.622396,0.719618,0.730035,0.717882,0.679688,0.693924,0.039641,89
4,6.717449,0.320695,0.414396,0.060093,False,sqrt,0.617482,6,6,748,...,42,"{'bootstrap': False, 'max_features': 'sqrt', '...",0.632812,0.71875,0.731771,0.717014,0.680556,0.696181,0.035971,57
5,5.474162,0.204726,0.210238,0.020409,False,sqrt,0.291229,3,5,567,...,42,"{'bootstrap': False, 'max_features': 'sqrt', '...",0.618056,0.726562,0.730035,0.725694,0.684028,0.696875,0.04287,45
6,0.257993,0.023321,0.049923,0.029551,False,sqrt,0.04645,3,8,21,...,42,"{'bootstrap': False, 'max_features': 'sqrt', '...",0.624132,0.731771,0.723958,0.732639,0.699653,0.702431,0.040927,1
7,2.877515,0.083578,0.116862,0.002419,False,sqrt,0.065052,4,2,316,...,42,"{'bootstrap': False, 'max_features': 'sqrt', '...",0.624132,0.720486,0.730903,0.72309,0.681424,0.696007,0.039822,58
8,4.637125,0.195115,0.218628,0.017667,False,log2,0.808397,9,3,565,...,42,"{'bootstrap': False, 'max_features': 'log2', '...",0.652778,0.71875,0.728299,0.716146,0.675347,0.698264,0.029106,13
9,3.871631,0.204509,0.182038,0.013626,False,log2,0.684233,7,5,509,...,42,"{'bootstrap': False, 'max_features': 'log2', '...",0.646701,0.717882,0.730035,0.717882,0.674479,0.697396,0.031622,34


In [33]:
# 最良の分類器のハイパーパラメータを確認する
ext_rnd_search_bagging.best_params_

{'bootstrap': False,
 'max_features': 'sqrt',
 'max_samples': 0.046450412719997725,
 'min_samples_leaf': 3,
 'min_samples_split': 8,
 'n_estimators': 21,
 'n_jobs': -1,
 'random_state': 42}

In [37]:
ext_rnd_search_bagging.best_score_

0.7024305555555556

In [39]:
# モデルの保存
ext_best_estimator_paceting = ext_rnd_search_bagging.best_estimator_
joblib.dump(ext_best_estimator_paceting, './pkl/ext_best_estimator_paceting_211002_1.pkl')

['./pkl/ext_best_estimator_paceting_211002_1.pkl']

#### バギング

In [46]:
# ハイパーパラメータの設定
param_distribs_bagging = {
    'n_estimators': randint(low=100, high=800),
    'min_samples_split': randint(low=2, high=10),
    'min_samples_leaf': randint(low=1, high=10),
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True],
    'oob_score': [True, False],
    'n_jobs': [-1],
    'random_state': [42],
    'max_samples': uniform()
}

In [47]:
# バギングでの探索
ext_clf_bagging = ExtraTreesClassifier()
ext_rnd_search_bagging = RandomizedSearchCV(ext_clf_bagging, param_distributions=param_distribs_bagging, n_iter=100, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)
ext_rnd_search_bagging.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=ExtraTreesClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True],
                                        'max_features': ['sqrt', 'log2', None],
                                        'max_samples': <scipy.stats._distn_infrastructure.rv_frozen object at 0x141b90af0>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x141b90160>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1419549d0>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x13f73aa90>,
                                        'n_jobs': [-1],
                                        'oob_score': [True, False],
                                        'random_state': [42]},
                   random_state=42, scoring='accuracy')

In [48]:
ext_rnd_search_bagging.best_params_

{'bootstrap': True,
 'max_features': 'log2',
 'max_samples': 0.02535074341545751,
 'min_samples_leaf': 5,
 'min_samples_split': 7,
 'n_estimators': 226,
 'n_jobs': -1,
 'oob_score': True,
 'random_state': 42}

In [49]:
ext_rnd_search_bagging.best_score_

0.7064236111111112

## モデルの訓練

In [None]:
ext_clf = ExtraTreesClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)

In [None]:
ext_clf.fit(X_train, y_train)

## 特徴量の重要度

In [None]:
feature_score = {}
for name, score in zip(feature_attribs, ext_clf.feature_importances_):
    feature_score[name] = score
    
# sort
feature_score_sorted = sorted(feature_score.items(), key=lambda x:x[1], reverse=True)
pprint.pprint(feature_score_sorted)

## Out-Of-Bag検証

Extra-tree

In [None]:
ext_clf.oob_score_

# 予測

In [None]:
y_pred_ext = ext_clf.predict(X_test)

# 検証

## 訓練セットでの検証

### 平均正解率

In [None]:
ext_clf.score(X_train, y_train)

### 交差検証

分類器の評価においては、歪んだデータセット、すなわち一部のクラスが他のクラスよりも出現度の高いデータセットの正解率は当てにならない。（今回のデータセットはそうでもない気はする。）

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
cross_val_score(ext_clf, X_train, y_train, cv=3, scoring='accuracy')

### 混同行列

分類器の性能評価は混同行列を用いる。

In [None]:
from sklearn.model_selection import cross_val_predict

In [None]:
# K分割交差検証を行い、個々のテストフォールドに対する予測結果を返す。（評価のスコアではない）
y_train_pred = cross_val_predict(ext_clf, X_train, y_train, cv=3)
y_train_pred

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# 混同行列
confusion_matrix(y_train, y_train_pred)

In [None]:
from sklearn.metrics import f1_score

In [None]:
# F1スコアの計算
f1_score(y_train, y_train_pred)

## テストセットでの検証

In [None]:
ext_clf.score(X_test, y_test)