ここではxyzのデータを使って学習、評価  
個人的には0~180000のデータをランダムに180000~190000を時系列でテストデータにするといいと思う。

In [1]:
import os
from glob import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn import decomposition
from sklearn import preprocessing
from sklearn import manifold
from sklearn import metrics
from tqdm.auto import tqdm
from joblib import Parallel, delayed
import urllib.request
import shutil
import zipfile
from scipy.stats import mode

# 訓練データとテストデータの確保

In [4]:
fold = "../train_raw_npy/"
xyz = np.load(f"{fold}acc_xyz.npy")
label = np.load(f"{fold}sampled_label.npy")
print(xyz.shape)
print(label.shape)


(196072, 3, 500)
(196072,)


In [11]:
# train/test split
pin = 170000
train_xyz = xyz[0:pin, :, :]
train_label = label[0:pin]
test_xyz = xyz[pin:190000, :, :]
test_label = label[pin:190000]
print(train_xyz.shape)
print(test_xyz.shape)

(170000, 3, 500)
(20000, 3, 500)


In [13]:
print(pd.Series(train_label).value_counts())
print(pd.Series(test_label).value_counts())

7.0    27641
5.0    26056
6.0    25689
8.0    22016
2.0    21185
1.0    20712
4.0    19603
3.0     7098
Name: count, dtype: int64
7.0    3612
1.0    3315
4.0    2766
6.0    2637
2.0    2489
5.0    2419
8.0    1818
3.0     944
Name: count, dtype: int64


In [15]:
# trainをランダムに20000個サンプリングする。
sample_size = 20000
indices = np.random.choice(train_xyz.shape[0], sample_size, replace=False)
train_xyz_random = train_xyz[indices]
train_label_random = train_label[indices]


In [16]:
print(train_xyz_random.shape)
print(train_label_random.shape)
print(test_xyz.shape)
print(test_label.shape)

(20000, 3, 500)
(20000,)
(20000, 3, 500)
(20000,)


In [19]:
# trainとtestのxyzを２次元配列にする
train_xyz_random_reshape = train_xyz_random.reshape(train_xyz_random.shape[0], -1)
test_xyz_reshape = test_xyz.reshape(test_xyz.shape[0], -1)
print(train_xyz_random_reshape.shape)
print(train_label_random.shape)
print(test_xyz_reshape.shape)
print(test_label.shape)
print(np.unique(train_label_random))
print(np.unique(test_label))

(20000, 1500)
(20000,)
(20000, 1500)
(20000,)
[1. 2. 3. 4. 5. 6. 7. 8.]
[1. 2. 3. 4. 5. 6. 7. 8.]


# 学習  
アンサンブル学習

In [20]:
clf = BalancedRandomForestClassifier(
    n_estimators=2000,
    replacement=True,
    sampling_strategy='not minority',
    oob_score=True,
    n_jobs=6,
    random_state=42,
    verbose=1
)
clf.fit(train_xyz_random_reshape, train_label_random)
#clf.fit(sampled_x_random,sampled_label_random) # 訓練データを使ってまずランダムフォレストを学習している。

  warn(
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    4.3s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   21.9s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:   54.1s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:  1.7min
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:  2.6min
[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:  3.8min
[Parallel(n_jobs=8)]: Done 2000 out of 2000 | elapsed:  4.2min finished


In [21]:
Y_test_pred = clf.predict(test_xyz_reshape) # 学習器の性能をみる テストデータで
print('\nClassifier performance')
print('Out of sample:\n', metrics.classification_report(test_label, Y_test_pred, zero_division=0))

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    1.4s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    2.3s
[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:    3.3s



Classifier performance
Out of sample:
               precision    recall  f1-score   support

         1.0       0.40      0.27      0.32      3315
         2.0       0.73      0.64      0.68      2489
         3.0       0.62      0.99      0.76       944
         4.0       0.72      0.77      0.74      2766
         5.0       0.46      0.43      0.45      2419
         6.0       0.23      0.33      0.27      2637
         7.0       0.27      0.20      0.23      3612
         8.0       0.12      0.17      0.14      1818

    accuracy                           0.42     20000
   macro avg       0.44      0.47      0.45     20000
weighted avg       0.43      0.42      0.42     20000



[Parallel(n_jobs=8)]: Done 2000 out of 2000 | elapsed:    3.7s finished


# スムージングも試してみる

In [22]:
import scipy.stats as stats

def mode(alist):
    m,c = stats.mode(alist) # mは最頻値、cは最頻値の出現回数
    m,c = m.item(), c.item()
    if c == 1:
        return alist[len(alist)//2] # つまり最頻値が決められない場合真ん中の値をとるということ
    return m

def rolling_mode(y, window_size=1500): # ラベルy
    y_dtype_origin = y.dtype
    y = pd.Series(y, dtype='category')

    y_code_smooth = y.cat.codes.rolling(window_size).apply(mode,raw=True)

    #NaNを元の値で埋める
    y_code_smooth = y_code_smooth.fillna(y.cat.codes)

    # intに変換
    y_code_smooth = y_code_smooth.astype('int')
    
    y_smooth = pd.Categorical.from_codes(y_code_smooth, dtype=y.dtype)
    y_smooth = np.asarray(y_smooth, dtype=y_dtype_origin)
    return y_smooth

In [35]:
Y_test_pred_smooth = []
Y_test_pred_smooth.append(rolling_mode(Y_test_pred,window_size=20))
Y_test_pred_smooth = np.concatenate(Y_test_pred_smooth)

以下は最頻値を使ったスムージングを行った例

In [36]:
print('\nClassifier performance -- mode smoothing')
print('Out of sample:\n', metrics.classification_report(test_label, Y_test_pred))
print('Out of sample:\n', metrics.classification_report(test_label, Y_test_pred_smooth))


Classifier performance -- mode smoothing
Out of sample:
               precision    recall  f1-score   support

         1.0       0.40      0.27      0.32      3315
         2.0       0.73      0.64      0.68      2489
         3.0       0.62      0.99      0.76       944
         4.0       0.72      0.77      0.74      2766
         5.0       0.46      0.43      0.45      2419
         6.0       0.23      0.33      0.27      2637
         7.0       0.27      0.20      0.23      3612
         8.0       0.12      0.17      0.14      1818

    accuracy                           0.42     20000
   macro avg       0.44      0.47      0.45     20000
weighted avg       0.43      0.42      0.42     20000

Out of sample:
               precision    recall  f1-score   support

         1.0       0.48      0.28      0.35      3315
         2.0       0.85      0.72      0.78      2489
         3.0       0.69      0.95      0.80       944
         4.0       0.79      0.86      0.82      2766
    

# 以下は訓練データの量を２倍にして学習

In [38]:
# trainをランダムに40000個サンプリングする
sample_size = 40000
indices = np.random.choice(train_xyz.shape[0], sample_size, replace=False)
train_xyz_random = train_xyz[indices]
train_label_random = train_label[indices]

In [39]:
print(train_xyz_random.shape)
print(train_label_random.shape)
print(test_xyz.shape)
print(test_label.shape)

(40000, 3, 500)
(40000,)
(20000, 3, 500)
(20000,)


In [40]:
# trainとtestのxyzを２次元配列にする
train_xyz_random_reshape = train_xyz_random.reshape(train_xyz_random.shape[0], -1)
test_xyz_reshape = test_xyz.reshape(test_xyz.shape[0], -1)
print(train_xyz_random_reshape.shape)
print(train_label_random.shape)
print(test_xyz_reshape.shape)
print(test_label.shape)
print(np.unique(train_label_random))
print(np.unique(test_label))

(40000, 1500)
(40000,)
(20000, 1500)
(20000,)
[1. 2. 3. 4. 5. 6. 7. 8.]
[1. 2. 3. 4. 5. 6. 7. 8.]


# 学習
アンサンブル学習訓練データ40000データで学習

In [41]:
clf = BalancedRandomForestClassifier(
    n_estimators=2000,
    replacement=True,
    sampling_strategy='not minority',
    oob_score=True,
    n_jobs=6,
    random_state=42,
    verbose=1
)
clf.fit(train_xyz_random_reshape, train_label_random)
#clf.fit(sampled_x_random,sampled_label_random) # 訓練データを使ってまずランダムフォレストを学習している。

  warn(
[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:   12.7s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:   57.2s
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:  2.2min
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed:  3.9min
[Parallel(n_jobs=6)]: Done 1238 tasks      | elapsed:  6.2min
[Parallel(n_jobs=6)]: Done 1788 tasks      | elapsed:  9.0min
[Parallel(n_jobs=6)]: Done 2000 out of 2000 | elapsed: 10.1min finished


In [42]:
Y_test_pred = clf.predict(test_xyz_reshape) # 学習器の性能をみる テストデータで
print('\nClassifier performance')
print('Out of sample:\n', metrics.classification_report(test_label, Y_test_pred, zero_division=0))

[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.0s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:    0.4s
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:    1.0s
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed:    1.8s
[Parallel(n_jobs=6)]: Done 1238 tasks      | elapsed:    2.9s
[Parallel(n_jobs=6)]: Done 1788 tasks      | elapsed:    4.2s



Classifier performance
Out of sample:
               precision    recall  f1-score   support

         1.0       0.34      0.23      0.28      3315
         2.0       0.76      0.69      0.72      2489
         3.0       0.73      0.99      0.84       944
         4.0       0.76      0.83      0.79      2766
         5.0       0.50      0.45      0.47      2419
         6.0       0.26      0.39      0.31      2637
         7.0       0.28      0.20      0.23      3612
         8.0       0.13      0.18      0.15      1818

    accuracy                           0.44     20000
   macro avg       0.47      0.49      0.47     20000
weighted avg       0.45      0.44      0.44     20000



[Parallel(n_jobs=6)]: Done 2000 out of 2000 | elapsed:    4.7s finished


In [43]:
Y_test_pred_smooth = []
Y_test_pred_smooth.append(rolling_mode(Y_test_pred,window_size=20))
Y_test_pred_smooth = np.concatenate(Y_test_pred_smooth)

同じくスムージング

In [44]:
print('\nClassifier performance -- mode smoothing')
print('Out of sample:\n', metrics.classification_report(test_label, Y_test_pred))
print('Out of sample:\n', metrics.classification_report(test_label, Y_test_pred_smooth))


Classifier performance -- mode smoothing
Out of sample:
               precision    recall  f1-score   support

         1.0       0.34      0.23      0.28      3315
         2.0       0.76      0.69      0.72      2489
         3.0       0.73      0.99      0.84       944
         4.0       0.76      0.83      0.79      2766
         5.0       0.50      0.45      0.47      2419
         6.0       0.26      0.39      0.31      2637
         7.0       0.28      0.20      0.23      3612
         8.0       0.13      0.18      0.15      1818

    accuracy                           0.44     20000
   macro avg       0.47      0.49      0.47     20000
weighted avg       0.45      0.44      0.44     20000

Out of sample:
               precision    recall  f1-score   support

         1.0       0.40      0.26      0.31      3315
         2.0       0.85      0.75      0.80      2489
         3.0       0.85      0.95      0.90       944
         4.0       0.83      0.92      0.87      2766
    