In [70]:
import numpy as np
import pandas as pd
from typing import List, Any, Union, Dict
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
import pprint
from sklearn.ensemble import ExtraTreesClassifier
import joblib
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.ensemble import RandomForestClassifier
import os, re

# CSVファイルからデータを読み込む

In [14]:
CSV_PATH = "../out/csv/"

In [15]:
def CsvToDf(csv_list: List) -> pd.DataFrame:
    df = pd.concat([pd.read_csv(CSV_PATH + filename) for filename in csv_list])
    return df

In [16]:
# 読み込むCSVファイルのリスト
csv_list = ["fearures_mono_ch_complete_211007.csv"]
df = CsvToDf(csv_list)

In [17]:
# session1のデータの抽出(式を評価するengineとしてnumexprを使用することで、処理の高速化を狙う。)
train_set = df.query('session_id == "trial1"', engine='numexpr')
test_set = df.query('session_id == "trial2"', engine='numexpr')

# データのクリーニング

In [18]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    '''
    DataFrameから、属性を選択し、Numpy Arrayへ変換する変換器。
    '''

    def __init__(self, attribute_names: List) -> None:
        self.attribute_names = attribute_names

    def fit(self, X: pd.DataFrame, y=None):
        return self

    def transform(self, X: pd.DataFrame) -> np.ndarray:
        return X[self.attribute_names].to_numpy()

In [19]:
class ValueTransducer(BaseEstimator, TransformerMixin):
    '''
    値を他の値に変換する。
    '''

    def __init__(self, vals) -> None:
        self.vals = vals

    def fit(self, X: np.ndarray, y=None):
        return self

    def transform(self, X: np.ndarray) -> np.ndarray:
        X_ = []
        # Trueになる値->1, それ以外->0
        for val in self.vals:
            x_ = np.where(X == val, 1, 0)
            X_.append(x_)
        # ndarrayに変換
        X_ = np.array(X_)

        # 1次元配列にまとめる
        label_list = []
        for i in range(len(X)):
            bool = X_[:, i].any()
            label_list.append(int(bool))
        # ndarrayに変換
        label_np = np.array(label_list)

        return label_np

## 特徴量抽出パイプライン

In [20]:
# 特徴量に該当する列の抽出
feature_attribs = ['low_power', 'high_power', 'hlbr', 'coe1[0]',	'coe1[1]', 'coe3[0]', 'coe3[1]', 'coe3[2]',	'coe3[3]', 'ratio_max_to_10ms_ave_peaks', 'ratio_max_to_9th_ave_peaks', 'ac_std', 'ac_auc', 'diff_std', 'diff_auc',	'srmr', 'gp_max_val_std', 'gp_max_val_range',
                       'gp_max_val_min', 'gp_max_val_max', 'gp_max_val_mean', 'gp_max_ix_std', 'gp_max_ix_range', 'gp_max_ix_min', 'gp_max_ix_max', 'gp_max_ix_mean', 'gp_auc_std', 'gp_auc_range', 'gp_auc_min', 'gp_auc_max', 'gp_auc_mean',	'tdoa_std', 'tdoa_range', 'tdoa_min', 'tdoa_max', 'tdoa_mean']

In [21]:
# pipeline
features_pipeline = Pipeline([
    ('selector', DataFrameSelector(feature_attribs)),
    ('imputer', SimpleImputer(strategy="median")),
#     ('min_max_scaler', MinMaxScaler())
])

In [22]:
X_train = features_pipeline.fit_transform(train_set)

In [23]:
X_test = features_pipeline.fit_transform(test_set)

## ラベル抽出パイプライン

In [24]:
# 正解値labelを生成するpipeline
label_attrib = ['dov_angle']
facing_dov_angles = [0, 45, 315]

In [25]:
label_pipeline = Pipeline([
    ('selector', DataFrameSelector(label_attrib)),
    ('transducer', ValueTransducer(facing_dov_angles))
])

In [26]:
y_train = label_pipeline.fit_transform(train_set)
y_test = label_pipeline.fit_transform(test_set)

# 訓練

In [27]:
# 交差検証の結果を表示する
def display_scores(scores):
    print("Scores: ", scores)
    print("Mean: ", scores.mean())
    print("Standard deviation: ", scores.std())

## Extra-Tree

### Trial 1

#### 訓練

In [28]:
from sklearn.model_selection import GridSearchCV

In [19]:
param_grid = [
    {'n_estimators': [2, 10, 100, 1000], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 5, 10], 'max_features': ['sqrt', 'log2', None], 'bootstrap': [False], 'n_jobs': [-1], 'random_state': [42], 'max_samples': [0.01, 0.5, 0.09]},
    {'n_estimators': [2, 10, 100, 1000], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 5, 10], 'max_features': ['sqrt', 'log2', None], 'bootstrap': [True], 'oob_score': [True, False], 'n_jobs': [-1], 'random_state': [42], 'max_samples': [0.01, 0.5, 0.09]}
]

In [31]:
param_grid = [
    {'n_estimators': [10], 'min_samples_split': [2], 'min_samples_leaf': [5], 'max_features': [None], 'bootstrap': [False], 'n_jobs': [-1], 'random_state': [42], 'max_samples': [0.5]}
]

In [32]:
ext_clf = ExtraTreesClassifier()
ext_grid_search = GridSearchCV(estimator=ext_clf, param_grid=param_grid, scoring=['accuracy', 'balanced_accuracy', 'f1'], n_jobs=-1, refit='accuracy', cv=5, return_train_score=True)

In [33]:
ext_grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=ExtraTreesClassifier(), n_jobs=-1,
             param_grid=[{'bootstrap': [False], 'max_features': [None],
                          'max_samples': [0.5], 'min_samples_leaf': [5],
                          'min_samples_split': [2], 'n_estimators': [10],
                          'n_jobs': [-1], 'random_state': [42]}],
             refit='accuracy', return_train_score=True,
             scoring=['accuracy', 'balanced_accuracy', 'f1'])

In [23]:
cvres_grid_search = ext_grid_search.cv_results_
pd.DataFrame(cvres_grid_search)[:10]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bootstrap,param_max_features,param_max_samples,param_min_samples_leaf,param_min_samples_split,param_n_estimators,...,mean_test_f1,std_test_f1,rank_test_f1,split0_train_f1,split1_train_f1,split2_train_f1,split3_train_f1,split4_train_f1,mean_train_f1,std_train_f1
0,0.035842,0.007772,0.022638,0.005836,False,sqrt,0.01,1,2,2,...,0.428745,0.098021,864,1.0,1.0,1.0,1.0,1.0,1.0,0.0
1,0.058388,0.009,0.016288,0.003961,False,sqrt,0.01,1,2,10,...,0.543205,0.073517,343,1.0,1.0,1.0,1.0,1.0,1.0,0.0
2,0.437997,0.017923,0.042817,0.002756,False,sqrt,0.01,1,2,100,...,0.563058,0.06349,217,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3,7.528751,0.240522,0.89058,0.034091,False,sqrt,0.01,1,2,1000,...,0.565558,0.061481,191,1.0,1.0,1.0,1.0,1.0,1.0,0.0
4,0.146725,0.07778,0.007455,0.00038,False,sqrt,0.01,1,5,2,...,0.491077,0.057995,747,0.966392,0.963116,0.968842,0.967988,0.964948,0.966257,0.002063
5,0.129029,0.090469,0.012284,0.001006,False,sqrt,0.01,1,5,10,...,0.561852,0.064248,227,0.997681,0.998841,0.999132,0.998842,0.998843,0.998668,0.000506
6,0.387958,0.065177,0.054027,0.011887,False,sqrt,0.01,1,5,100,...,0.565363,0.06567,194,1.0,0.999711,0.999711,1.0,0.999711,0.999826,0.000142
7,8.06367,0.460066,0.884824,0.077372,False,sqrt,0.01,1,5,1000,...,0.55958,0.064639,240,0.999711,0.999711,1.0,1.0,1.0,0.999884,0.000142
8,0.040952,0.036384,0.014958,0.006641,False,sqrt,0.01,1,10,2,...,0.521351,0.056619,530,0.869697,0.877707,0.886267,0.876704,0.876289,0.877333,0.005287
9,0.039043,0.007607,0.01409,0.005064,False,sqrt,0.01,1,10,10,...,0.551261,0.072347,286,0.948153,0.950477,0.948901,0.951132,0.954789,0.95069,0.00231


In [24]:
ext_grid_search.best_params_

{'bootstrap': True,
 'max_features': None,
 'max_samples': 0.09,
 'min_samples_leaf': 5,
 'min_samples_split': 2,
 'n_estimators': 10,
 'n_jobs': -1,
 'oob_score': True,
 'random_state': 42}

In [25]:
ext_grid_search.best_score_

0.7211805555555555

In [27]:
# モデルの保存
joblib.dump(ext_grid_search.best_estimator_, './pkl/ext_best_estimator_gridsearch_211008_1.pkl')

['./pkl/ext_best_estimator_gridsearch_211008_1.pkl']

#### 検証

In [40]:
best_ext_clf = ext_grid_search.best_estimator_

In [41]:
best_ext_clf.score(X_test, y_test)

0.7451388888888889

In [34]:
# K分割交差検証
y_test_pred = cross_val_predict(best_ext_clf, X_test, y_test, cv=3)
res = confusion_matrix(y_test, y_test_pred)

In [31]:
f1_score(y_test, y_test_pred)

0.5886255924170616

In [42]:
# cross_validation
scoring = ['accuracy', 'balanced_accuracy', 'f1']
scores = cross_validate(best_ext_clf, X_test, y_test, scoring = scoring, cv=10, n_jobs=-1)

In [47]:
scores

{'fit_time': array([0.14454913, 0.15962386, 0.16170597, 0.14397693, 0.14302015,
        0.14372611, 0.15390587, 0.16117883, 0.15734506, 0.14432096]),
 'score_time': array([0.01057172, 0.010252  , 0.00922513, 0.01040578, 0.0111208 ,
        0.01080179, 0.01005793, 0.01041412, 0.00999999, 0.0106678 ]),
 'test_accuracy': array([0.59375   , 0.71180556, 0.76736111, 0.72395833, 0.76041667,
        0.69618056, 0.71701389, 0.73263889, 0.72916667, 0.68402778]),
 'test_balanced_accuracy': array([0.65833333, 0.71759259, 0.72222222, 0.64305556, 0.71018519,
        0.63472222, 0.71712963, 0.66759259, 0.69722222, 0.58703704]),
 'test_f1': array([0.62857143, 0.65843621, 0.63586957, 0.46464646, 0.61452514,
        0.48979592, 0.65539112, 0.53333333, 0.6119403 , 0.32089552])}

In [46]:
np.mean(scores['test_accuracy'])

0.7116319444444444

### Trial 2

In [16]:
# for ext_best_estimator_gridsearch_211012_1.pkl
param_grid = [
    {'n_estimators': [5, 10, 20, 30, 40, 50, 60], 'min_samples_split': [2, 3, 4], 'min_samples_leaf': [3, 5, 8], 'max_features': [None], 'bootstrap': [True], 'oob_score': [True],'n_jobs': [-1], 'random_state': [42], 'max_samples': [None, 0.09]}
]

# best params
# {'bootstrap': True,
#  'max_features': None,
#  'max_samples': 0.09,
#  'min_samples_leaf': 5,
#  'min_samples_split': 2,
#  'n_estimators': 10,
#  'n_jobs': -1,
#  'oob_score': True,
#  'random_state': 42}

In [25]:
# for ext_best_estimator_gridsearch_211012_2.pkl
param_grid = [
    {'n_estimators': [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], 'min_samples_split': [2], 'min_samples_leaf': [4, 5, 6, 7], 'max_features': [None], 'bootstrap': [True], 'oob_score': [True],'n_jobs': [-1], 'random_state': [42], 'max_samples': [0.09]}
]

# best params
# {'bootstrap': True,
#  'max_features': None,
#  'max_samples': 0.09,
#  'min_samples_leaf': 5,
#  'min_samples_split': 2,
#  'n_estimators': 7,
#  'n_jobs': -1,
#  'oob_score': True,
#  'random_state': 42}

# result
# 0.7395833333333334
# array([[2707,  893],
#       [ 905, 1255]])
# f1: 0.5826369545032497

In [26]:
ext_clf = ExtraTreesClassifier()
ext_grid_search = GridSearchCV(estimator=ext_clf, param_grid=param_grid, scoring=['accuracy', 'balanced_accuracy', 'f1'], n_jobs=-1, refit='accuracy', cv=5)
ext_grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=ExtraTreesClassifier(), n_jobs=-1,
             param_grid=[{'bootstrap': [True], 'max_features': [None],
                          'max_samples': [0.09],
                          'min_samples_leaf': [4, 5, 6, 7],
                          'min_samples_split': [2],
                          'n_estimators': [6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                           16],
                          'n_jobs': [-1], 'oob_score': [True],
                          'random_state': [42]}],
             refit='accuracy', scoring=['accuracy', 'balanced_accuracy', 'f1'])

In [27]:
ext_grid_search.best_params_

{'bootstrap': True,
 'max_features': None,
 'max_samples': 0.09,
 'min_samples_leaf': 5,
 'min_samples_split': 2,
 'n_estimators': 7,
 'n_jobs': -1,
 'oob_score': True,
 'random_state': 42}

In [28]:
joblib.dump(ext_grid_search.best_estimator_, './pkl/ext_best_estimator_gridsearch_211012_2.pkl')

['./pkl/ext_best_estimator_gridsearch_211012_2.pkl']

#### 検証

In [29]:
ext_clf = ext_grid_search.best_estimator_

In [30]:
ext_clf.score(X_test, y_test)

0.7395833333333334

In [31]:
y_test_pred = cross_val_predict(ext_clf, X_test, y_test, cv=3)
confusion_matrix(y_test, y_test_pred)

array([[2707,  893],
       [ 905, 1255]])

In [46]:
# 交差検証
scores = cross_val_score(ext_clf, X_test, y_test, scoring="accuracy", cv=10)
display_scores(scores)

Scores:  [0.59375    0.71875    0.75868056 0.72743056 0.75520833 0.70138889
 0.74131944 0.72048611 0.76909722 0.68923611]
Mean:  0.7175347222222224
Standard deviation:  0.04777965768669139


In [47]:
# 交差検証
scores = cross_val_score(ext_clf, X_test, y_test, scoring="accuracy", cv=3)
display_scores(scores)

Scores:  [0.63177083 0.71927083 0.7265625 ]
Mean:  0.6925347222222222
Standard deviation:  0.043069554175932466


In [32]:
f1_score(y_test, y_test_pred)

0.5826369545032497

## Random Forest

### 訓練

In [38]:
param_grid = [
    {'n_estimators': [2, 10, 100, 1000], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 5, 10], 'max_features': ['sqrt', 'log2', None], 'bootstrap': [False], 'n_jobs': [-1], 'random_state': [42], 'max_samples': [0.01, 0.5, 0.09]},
    {'n_estimators': [2, 10, 100, 1000], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 5, 10], 'max_features': ['sqrt', 'log2', None], 'bootstrap': [True], 'oob_score': [True, False], 'n_jobs': [-1], 'random_state': [42], 'max_samples': [0.01, 0.5, 0.09]}
]

In [39]:
rnd_clf = RandomForestClassifier()
rnd_grid_search = GridSearchCV(estimator=rnd_clf, param_grid=param_grid, scoring=['accuracy', 'balanced_accuracy', 'f1'], n_jobs=-1, refit='accuracy', cv=5, return_train_score=True)

In [None]:
rnd_grid_search.fit(X_train, y_train)

### 検証

## 特徴量の重要度

In [49]:
def RecFeaturesImportance(model, feature_attrbs):
  feature_score = {}
  for name, score in zip(feature_attrbs, model.feature_importances_):
    feature_score[name] = score
  
  # sort
  feature_score_sorted = sorted(feature_score.items(), key=lambda x:x[1], reverse=True)
  return feature_score_sorted

In [50]:
RecFeaturesImportance(best_ext_clf, feature_attribs)

[('high_power', 0.16054591945518026),
 ('diff_std', 0.1544123883534813),
 ('hlbr', 0.094466471580454),
 ('diff_auc', 0.06991451431884008),
 ('srmr', 0.04334347839668752),
 ('coe1[1]', 0.04144773282016976),
 ('coe1[0]', 0.03570185457401731),
 ('ac_auc', 0.027006805517937625),
 ('gp_auc_max', 0.02648309034749241),
 ('coe3[3]', 0.026413032759559318),
 ('gp_max_val_mean', 0.023999624715158117),
 ('coe3[2]', 0.02264709285304872),
 ('low_power', 0.019940151871501312),
 ('ratio_max_to_10ms_ave_peaks', 0.019565034637140297),
 ('gp_auc_std', 0.019050401149591854),
 ('gp_max_val_max', 0.01820756521598376),
 ('ratio_max_to_9th_ave_peaks', 0.018204867732130113),
 ('gp_auc_min', 0.016490738354835267),
 ('gp_max_val_std', 0.016380211187951313),
 ('gp_auc_mean', 0.016242079244455566),
 ('gp_max_val_min', 0.016188220486631856),
 ('gp_max_val_range', 0.015330202484472139),
 ('ac_std', 0.01512506717508883),
 ('gp_auc_range', 0.014512044407477928),
 ('tdoa_max', 0.0092535279515857),
 ('gp_max_ix_mean', 0

In [59]:
import datetime

d_today = datetime.date.today()
d_today.isoweekday()

2

In [62]:
d_today.isocalendar()

datetime.IsoCalendarDate(year=2021, week=43, weekday=2)

In [64]:
d_today.isoformat()

'2021-10-26'

In [65]:
d_today.timetuple()

time.struct_time(tm_year=2021, tm_mon=10, tm_mday=26, tm_hour=0, tm_min=0, tm_sec=0, tm_wday=1, tm_yday=299, tm_isdst=-1)

In [67]:
type(datetime.date.today().isoformat())

str

In [71]:
dirs_list = os.listdir('../out/')
dirs_list

['features_s6.numbers',
 'features_s1_s4.numbers',
 'features_s7.numbers',
 '.DS_Store',
 'features_ch0_2.csv',
 'features_s5.numbers',
 'test.csv',
 'features_mono_ch_only_gccphat_211007.csv',
 'features_s4.numbers',
 'features_s10.numbers',
 'features_s8.numbers',
 'csv',
 'matlab',
 'features_s9.numbers',
 'features_mono_ch_only_gccphat_211021.csv',
 'features_s2.numbers',
 'features_mono_ch_211004.csv',
 'fearures_mono_ch_complete_211007.numbers',
 'features_s3.numbers',
 'features_ch0.numbers']

In [82]:
no = 0
for dir_name in dirs_list:
    is_match = re.match(r'^features.*', dir_name)
    #is_match = re.match(r'features.*', dir_name)
    if is_match != None:
        no += 1
        print(is_match)
        
no

<re.Match object; span=(0, 19), match='features_s6.numbers'>
<re.Match object; span=(0, 22), match='features_s1_s4.numbers'>
<re.Match object; span=(0, 19), match='features_s7.numbers'>
<re.Match object; span=(0, 18), match='features_ch0_2.csv'>
<re.Match object; span=(0, 19), match='features_s5.numbers'>
<re.Match object; span=(0, 40), match='features_mono_ch_only_gccphat_211007.csv'>
<re.Match object; span=(0, 19), match='features_s4.numbers'>
<re.Match object; span=(0, 20), match='features_s10.numbers'>
<re.Match object; span=(0, 19), match='features_s8.numbers'>
<re.Match object; span=(0, 19), match='features_s9.numbers'>
<re.Match object; span=(0, 40), match='features_mono_ch_only_gccphat_211021.csv'>
<re.Match object; span=(0, 19), match='features_s2.numbers'>
<re.Match object; span=(0, 27), match='features_mono_ch_211004.csv'>
<re.Match object; span=(0, 19), match='features_s3.numbers'>
<re.Match object; span=(0, 20), match='features_ch0.numbers'>


15

In [76]:
len(dirs_list)

20