# Preprocessing

In [2]:
import subprocess
subprocess.run(['jupyter', 'nbconvert', '--to', 'python', '1_preprocessing.ipynb'])

CompletedProcess(args=['jupyter', 'nbconvert', '--to', 'python', '1_preprocessing.ipynb'], returncode=0)

In [4]:
# '''
# データの処理に必要なツールをインポートする
# ''
from glob import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
from sklearn.model_selection import train_test_split
import japanize_matplotlib

# DataFrameを表示する際の折り返す横幅の設定
pd.set_option('display.width', 200)

# DataFrameを表示する際、全てのDataFrame を表示させる設定
# デフォルトは一番最後のDataFrame一つのみ
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# DataFrameを表示するとき、表示する行数
# とりあえず最大５０００行表示するようにする
pd.set_option('display.max_rows', 5000)

# DataFrameを省略せず全部表示（多分ね）
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
pd.set_option('display.max_columns', 100)

In [108]:
right_arm_file_list = glob("train/right_arm/*")
df = pd.read_csv(right_arm_file_list[3])
df['timestamp'] = pd.to_datetime(df['timestamp'])
# df.dtypes
# df.sort_values('timestamp')


X                   float64
Y                   float64
Z                   float64
timestamp    datetime64[ns]
dtype: object

Unnamed: 0,X,Y,Z,timestamp
0,-0.0645,-0.0551,0.0036,1970-01-01 00:00:00.000000014
1,0.002,-0.035,-0.0057,1970-01-01 00:00:00.000000033
2,-0.0292,-0.0247,-0.0155,1970-01-01 00:00:00.000000054
3,-0.0471,-0.0541,0.0307,1970-01-01 00:00:00.000000073
4,-0.0172,-0.0542,-0.0279,1970-01-01 00:00:00.000000093
5,0.0018,-0.0451,-0.063,1970-01-01 00:00:00.000000113
6,-0.037,-0.0452,-0.0333,1970-01-01 00:00:00.000000133
7,0.0211,0.0035,-0.0311,1970-01-01 00:00:00.000000154
8,0.0208,-0.0652,-0.0006,1970-01-01 00:00:00.000000173
9,-0.0175,-0.0654,0.0091,1970-01-01 00:00:00.000000193


In [101]:
f = open('train/labels.txt')
label_file = f.read()  # ファイル終端まで全て読んだデータを返す
f.close()
# labels_list = [s.split(',') for s in label_file.split('\n')] # 改行で区切る(改行文字そのものは戻り値のデータには含まれない)
# labels = np.array(labels_list)
LABELS = pd.DataFrame([s.split(',', 2) for s in label_file.split('\n')], columns=['file_name', 'food', 'activity'])
LABELS = LABELS.dropna()

In [156]:
features_list = []
for i, file in enumerate(LABELS['file_name']):
    df = pd.read_csv('./train/right_arm/{0}.csv'.format(file))
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    feat = [LABELS.loc[i, 'activity'], df['X'].min(), df['X'].max(), df['X'].mean(), df['X'].max()-df['X'].min(),]
    features_list.append(feat)

FEATURES = pd.DataFrame(features_list)

## Takeにラベルつけ

In [254]:
take_file_list = LABELS[LABELS['activity'] == 'Take,']
# FEATURES['label'] = FEATURES[0]
FEATURES['label'] = '0'
# FEATURES.loc[FEATURES[0].str.contains('Take,'), 'label']='1'
FEATURES.loc[FEATURES[0] == 'Take,', 'label']='2'
FEATURES['label'].value_counts()

0    226
2     62
Name: label, dtype: int64

## classification

In [255]:
FEATURES.columns

Index([0, 1, 2, 3, 4, 'label'], dtype='object')

In [256]:
X_train,X_test,Y_train,Y_test = train_test_split(FEATURES[[1, 2, 3, 4]], FEATURES['label'], test_size=0.3, random_state=1)


In [260]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

model_ml = RandomForestClassifier(n_estimators=4,n_jobs=-1, criterion= 'gini', max_depth=2, random_state=15)

model_ml.fit(X_train,Y_train)
Y_predict = model_ml.predict(X_test)

print(classification_report(Y_test,Y_predict))

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=2, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=4, n_jobs=-1,
                       oob_score=False, random_state=15, verbose=0,
                       warm_start=False)

              precision    recall  f1-score   support

           0       0.91      0.83      0.87        76
           2       0.28      0.45      0.34        11

    accuracy                           0.78        87
   macro avg       0.60      0.64      0.61        87
weighted avg       0.83      0.78      0.80        87



In [None]:
%%time
from tqdm import tqdm
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

max_score = 0
SearchMethod = 0
RFC_grid = {RandomForestClassifier(): {"n_estimators": [i for i in range(1, 500)],
                                       "criterion": ["gini", "entropy"],
                                       "max_depth":[i for i in range(1, 500)],
                                       "random_state": [15],
                                      }}

#ランダムフォレストの実行
for model, param in tqdm(RFC_grid.items()):
    clf = GridSearchCV(model, param)
    clf.fit(X_train, Y_train)
    Y_predict = clf.predict(X_test)
    score = f1_score(Y_test, Y_predict, average="micro")

    if max_score < score:
        max_score = score
        best_param = clf.best_params_
        best_model = model.__class__.__name__

print("ベストスコア:{}".format(max_score))
print("モデル:{}".format(best_model))
print("パラメーター:{}".format(best_param))


  0%|          | 0/1 [00:00<?, ?it/s][A

In [None]:
LABELS['activity'].value_counts()