ここではtrainデータ全体からランダムにデータを取る。  
データ数を増やすことで性能向上することを示すには全体の訓練データからランダムにデータを取ってくる必要があると考えた。

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.backends.cudnn as cudnn
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertModel, BertConfig
import seaborn as sns
from transformers import TrainingArguments, Trainer
from sklearn.model_selection import train_test_split

#import utils

# For reproducibility
np.random.seed(42)
torch.manual_seed(42) # 乱数生成シード
cudnn.benchmark = True

# Grab a GPU if there is one
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using {} device: {}".format(device, torch.cuda.current_device()))
else:
    device = torch.device("cpu")
    print("Using {}".format(device))

Using cuda device: 0


In [2]:
fold = "../../train_raw_npy/"
acc_x = np.loadtxt("../../train_raw/Acc_x.txt")
label = np.load(f"{fold}sampled_label.npy")

print(acc_x.shape)
print(label.shape)

(196072, 500)
(196072,)


In [4]:
# ビンの数を設定
num_bins = 30000  

# ビンの境界を等頻度で設定
bins = np.percentile(acc_x, np.linspace(0, 100, num_bins + 1))

# データをビンに分割
discrete_data = np.digitize(acc_x, bins) - 1  # ビンのインデックスを取得し、0から始まるように調整

# 予約トークンを避けるためにシフト
discrete_data += 104

# 最大インデックスを確認
max_index = discrete_data.max()
print(f"Max index after shifting: {max_index}")

# vocab_sizeを確認（シフト後の最大値を考慮）
vocab_size = max_index + 1
print(f"Vocab size: {vocab_size}")

# 確認のための一部データ
print("Sample discrete data:", discrete_data[:10])
print("Discrete data range:", np.min(discrete_data), "to", np.max(discrete_data))

"""# 分布の視覚化
plt.figure(figsize=(12, 6))
sns.histplot(discrete_data.flatten(), bins=num_bins, kde=False)
plt.title(f"Distribution of Discretized Accelerometer X-axis Data ({num_bins} bins, equal-frequency)")
plt.xlabel("Discrete Value")
plt.ylabel("Frequency")
plt.show()"""

# 離散化されたデータを保存
#np.save("train_token_ids_rebinned.npy", discrete_data)

Max index after shifting: 30104
Vocab size: 30105
Sample discrete data: [[24029 24034 24075 ...  6255  7731  8392]
 [ 8839  8946  7772 ...  9095  9134  8994]
 [ 8474  7847  7012 ... 25182 25115 25278]
 ...
 [16043 14565 19002 ... 26111 21273  9221]
 [ 3849 12370 16115 ... 23046 24751 23873]
 [12795 12362 21437 ...  7500 12527 17853]]
Discrete data range: 104 to 30104


'# 分布の視覚化\nplt.figure(figsize=(12, 6))\nsns.histplot(discrete_data.flatten(), bins=num_bins, kde=False)\nplt.title(f"Distribution of Discretized Accelerometer X-axis Data ({num_bins} bins, equal-frequency)")\nplt.xlabel("Discrete Value")\nplt.ylabel("Frequency")\nplt.show()'

In [6]:
discrete_data.shape

(196072, 500)

In [7]:
#説明変数
discrete_data
#目的変数
label

# ランダム３万データを使う  
size = 30000
sample_indices = np.random.choice(discrete_data.shape[0], size=size, replace=False)
X_30000 = discrete_data[sample_indices, :]
label_30000 = label[sample_indices]
print(X_30000.shape)
print(label_30000.shape)

(30000, 500)
(30000,)


In [8]:
print(X_30000.dtype)
print(label_30000.dtype)
X_30000 =X_30000.astype(np.int32)
label_30000 = label_30000.astype(np.int32)
print(X_30000.dtype)
print(label_30000.dtype)

int64
float64
int32
int32


In [9]:
# ランダムサンプリング
train_x, test_x, train_label, test_label = train_test_split(X_30000, label_30000, test_size=0.2,random_state=40)

# 結果の確認
print(f"Train data shape: {train_x.shape}")
print(f"Test data shape: {test_x.shape}")
print(f"Train label shape: {train_label.shape}")
print(f"Test label shape: {test_label.shape}")
print(pd.Series(train_label).value_counts())
print(pd.Series(test_label).value_counts())

Train data shape: (24000, 500)
Test data shape: (6000, 500)
Train label shape: (24000,)
Test label shape: (6000,)
5    3907
7    3907
6    3490
1    3023
2    2969
8    2935
4    2804
3     965
Name: count, dtype: int64
5    1026
7     972
6     853
2     765
4     725
1     708
8     706
3     245
Name: count, dtype: int64


In [10]:
import torch
from torch.utils.data import Dataset, DataLoader

class SensorDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_ids = torch.tensor(self.data[idx], dtype=torch.long)
        labels = torch.tensor(self.labels[idx], dtype=torch.long)

        # [CLS] と [SEP] トークンを追加
        input_ids = torch.cat([torch.tensor([101]), input_ids, torch.tensor([102])])

        attention_mask = torch.ones(input_ids.shape, dtype=torch.long)

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

In [11]:
# ラベルを0~7クラス分類とする
print(train_label)
train_label -=1
print(train_label)
print(test_label)
test_label -=1
print(test_label)

[6 5 6 ... 2 5 5]
[5 4 5 ... 1 4 4]
[5 5 5 ... 2 7 5]
[4 4 4 ... 1 6 4]


In [12]:
# データセットとデータローダの作成
train_dataset = SensorDataset(train_x, train_label)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
eval_dataset = SensorDataset(test_x, test_label)
eval_dataloader = DataLoader(eval_dataset)

In [13]:
#DistilBERT
from transformers import DistilBertForSequenceClassification, DistilBertConfig

config = DistilBertConfig(
    vocab_size=vocab_size,
    num_labels=8,
)
model = DistilBertForSequenceClassification(config)

print(model)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30105, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [14]:
# トレーニング設定
training_args = TrainingArguments(
    output_dir="./results_distilBERT",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    save_steps=0,
    save_total_limit=None,
    #logging_dir = "./logs_distilBERT",
    evaluation_strategy="steps",
    eval_steps=1_000,
    logging_steps=100,
)

# トレーナーの定義
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# トレーニングの実行
trainer.train()



  0%|          | 0/4500 [00:00<?, ?it/s]

{'loss': 2.0721, 'grad_norm': 3.646036148071289, 'learning_rate': 4.888888888888889e-05, 'epoch': 0.07}
{'loss': 2.0535, 'grad_norm': 4.366201400756836, 'learning_rate': 4.7777777777777784e-05, 'epoch': 0.13}
{'loss': 2.0222, 'grad_norm': 2.8405861854553223, 'learning_rate': 4.666666666666667e-05, 'epoch': 0.2}
{'loss': 1.9047, 'grad_norm': 4.834967613220215, 'learning_rate': 4.555555555555556e-05, 'epoch': 0.27}
{'loss': 1.7901, 'grad_norm': 5.8292036056518555, 'learning_rate': 4.4444444444444447e-05, 'epoch': 0.33}
{'loss': 1.7307, 'grad_norm': 6.301307678222656, 'learning_rate': 4.3333333333333334e-05, 'epoch': 0.4}
{'loss': 1.6618, 'grad_norm': 6.74536657333374, 'learning_rate': 4.222222222222222e-05, 'epoch': 0.47}
{'loss': 1.6556, 'grad_norm': 4.097332954406738, 'learning_rate': 4.111111111111111e-05, 'epoch': 0.53}
{'loss': 1.5643, 'grad_norm': 5.949358940124512, 'learning_rate': 4e-05, 'epoch': 0.6}
{'loss': 1.6082, 'grad_norm': 6.374051094055176, 'learning_rate': 3.88888888888

  0%|          | 0/750 [00:00<?, ?it/s]

{'eval_loss': 1.5308200120925903, 'eval_runtime': 27.2628, 'eval_samples_per_second': 220.08, 'eval_steps_per_second': 27.51, 'epoch': 0.67}
{'loss': 1.5946, 'grad_norm': 8.300867080688477, 'learning_rate': 3.777777777777778e-05, 'epoch': 0.73}
{'loss': 1.5509, 'grad_norm': 6.908094882965088, 'learning_rate': 3.6666666666666666e-05, 'epoch': 0.8}
{'loss': 1.5764, 'grad_norm': 5.093883514404297, 'learning_rate': 3.555555555555556e-05, 'epoch': 0.87}
{'loss': 1.5281, 'grad_norm': 4.997161865234375, 'learning_rate': 3.444444444444445e-05, 'epoch': 0.93}
{'loss': 1.5518, 'grad_norm': 5.65903377532959, 'learning_rate': 3.3333333333333335e-05, 'epoch': 1.0}
{'loss': 1.3325, 'grad_norm': 6.216193199157715, 'learning_rate': 3.222222222222223e-05, 'epoch': 1.07}
{'loss': 1.4049, 'grad_norm': 4.97233247756958, 'learning_rate': 3.111111111111111e-05, 'epoch': 1.13}
{'loss': 1.3537, 'grad_norm': 9.118179321289062, 'learning_rate': 3e-05, 'epoch': 1.2}
{'loss': 1.4196, 'grad_norm': 6.85159015655517

  0%|          | 0/750 [00:00<?, ?it/s]

{'eval_loss': 1.495347023010254, 'eval_runtime': 26.6821, 'eval_samples_per_second': 224.87, 'eval_steps_per_second': 28.109, 'epoch': 1.33}


In [None]:
del model
del train_dataset
del trainer
del eval_dataset

import gc
gc.collect()
torch.cuda.empty_cache()

# 評価  
学習のデータ数だけ変更して性能比較

### 語彙数30105  バッチサイズ16 あとデフォルト

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

final_evaluation = trainer.evaluate()

# ロスの履歴を取得
train_loss = []
train_steps = []
eval_loss = []
eval_steps = []
for log in trainer.state.log_history:
    if "loss" in log.keys():
        train_loss.append(log["loss"])
        train_steps.append(log["step"])
    if "eval_loss" in log.keys():
        eval_loss.append(log["eval_loss"])
        eval_steps.append(log["step"])

# 最後の評価結果を追加
eval_loss.append(final_evaluation["eval_loss"])
eval_steps.append(trainer.state.global_step)

# ロスのプロット
plt.figure(figsize=(10, 5))
plt.plot(train_steps, train_loss, label='Train Loss', color='C0')
plt.plot(eval_steps, eval_loss, label='Eval Loss', color='C1')
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.legend()
plt.title('Training and Evaluation Loss')
plt.grid(True)
plt.show()

predictions = trainer.predict(eval_dataset)
preds = np.argmax(predictions.predictions, axis=1)
#true_labels = eval_dataset["labels"]
# 分類レポートの生成
report = classification_report(test_label, preds, target_names=[str(i) for i in range(1, 9)])
print(report)

# 混同行列の計算
cm = confusion_matrix(test_label, preds)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=[str(i) for i in range(1,9)], yticklabels=[str(i) for i in range(1,9)])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()