Transformerベースのモデルを定義してみる

In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.backends.cudnn as cudnn
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertModel, BertConfig

import utils

# For reproducibility: fix the NumPy and PyTorch random seeds.
np.random.seed(42)
torch.manual_seed(42)  # random number generator seed
# cuDNN autotuning (benchmark=True) may pick a different convolution algorithm
# from run to run, which undermines the fixed seeds above. Disable it and ask
# cuDNN for deterministic kernels instead (this can cost some speed).
cudnn.benchmark = False
cudnn.deterministic = True

# Grab a GPU if there is one
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using {} device: {}".format(device, torch.cuda.current_device()))
else:
    device = torch.device("cpu")
    print("Using {}".format(device))

Using cuda device: 0


In [None]:
class SensorDataTransformer(nn.Module):
    """Classify sensor sequences with a BERT-style encoder.

    The configuration of ``transformer_model`` is loaded and its hidden size
    is overridden with ``input_dim``. The encoder is then built from that
    configuration (``BertModel(config)``), not from a checkpoint's weights.

    Args:
        input_dim: Number of features per time step. It is used directly as
            the encoder hidden size, so it must be a multiple of the
            configuration's number of attention heads.
        num_classes: Number of output logits.
        transformer_model: Name or path of the model whose configuration is
            loaded.

    Raises:
        ValueError: If ``input_dim`` is not a multiple of the number of
            attention heads in the loaded configuration.
    """

    def __init__(self, input_dim, num_classes, transformer_model='bert-base-uncased'):
        super().__init__()
        self.config = BertConfig.from_pretrained(transformer_model)
        num_heads = self.config.num_attention_heads
        # Fail early with a clear message: the attention layers cannot split
        # a hidden size that is not divisible by the head count.
        if input_dim % num_heads != 0:
            raise ValueError(
                f"input_dim ({input_dim}) must be a multiple of the number "
                f"of attention heads ({num_heads})"
            )
        self.config.hidden_size = input_dim
        self.transformer = BertModel(self.config)
        self.fc = nn.Linear(self.config.hidden_size, num_classes)

    def forward(self, x):
        """Return class logits for a batch of sensor sequences.

        Args:
            x: Tensor of shape (batch_size, seq_length, input_dim).

        Returns:
            Tensor of shape (batch_size, num_classes).
        """
        # BertModel takes `inputs_embeds` batch-first, i.e.
        # (batch_size, seq_length, hidden_size), so x is passed through as is.
        # Swapping the first two axes would make the encoder treat time steps
        # as batch entries.
        outputs = self.transformer(inputs_embeds=x)
        pooled_output = outputs.pooler_output  # shape: (batch_size, hidden_size)
        return self.fc(pooled_output)

In [3]:
# Load the "bert-base-uncased" BertModel via from_pretrained.
model = BertModel.from_pretrained("bert-base-uncased")

In [4]:
# Print the model's module hierarchy.
print(model)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [7]:
import torch
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to a sequence-first tensor.

    Even feature columns hold ``sin(pos * w_k)`` and odd columns hold
    ``cos(pos * w_k)``, with ``w_k = exp(-2k * ln(10000) / d_model)``.
    The table is stored as a buffer, not as a trainable parameter.

    Args:
        d_model: Feature dimension of the tensors to encode (even or odd).
        max_len: Number of positions to precompute. Inputs to ``forward``
            must not be longer than this.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even column,
        # so drop the surplus frequency to keep the shapes aligned.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Stored as (max_len, 1, d_model) so it broadcasts over the batch axis.
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        """Return ``x`` plus the encoding of its first ``x.size(0)`` positions.

        Args:
            x: Tensor of shape (seq_len, batch_size, d_model).

        Returns:
            Tensor of the same shape as ``x``.
        """
        return x + self.pe[:x.size(0), :]

class SensorTransformer(nn.Module):
    """Transformer encoder that maps a sensor sequence to one output vector.

    Each time step is projected to ``model_dim``, positional encodings are
    added, the sequence passes through a stack of encoder layers, and the
    representation of the last time step is fed to a linear head.

    Args:
        input_dim: Number of features per time step.
        model_dim: Width of the encoder.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        output_dim: Size of the output vector.
    """

    def __init__(self, input_dim, model_dim, num_heads, num_layers, output_dim):
        super().__init__()
        # Lift the raw features to the encoder width.
        self.input_linear = nn.Linear(in_features=input_dim, out_features=model_dim)
        self.pos_encoder = PositionalEncoding(model_dim)
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=model_dim, nhead=num_heads),
            num_layers=num_layers,
        )
        self.fc_out = nn.Linear(in_features=model_dim, out_features=output_dim)

    def forward(self, x):
        """Return the head output for the last time step of each sequence.

        Args:
            x: Tensor of shape (batch_size, seq_len, input_dim).

        Returns:
            Tensor of shape (batch_size, output_dim).
        """
        embedded = self.input_linear(x)  # (batch_size, seq_len, model_dim)
        # The positional encoder and the encoder stack work sequence-first.
        seq_first = embedded.transpose(0, 1)  # (seq_len, batch_size, model_dim)
        encoded = self.transformer_encoder(self.pos_encoder(seq_first))
        # In the sequence-first layout the last time step is the last row.
        last_step = encoded[-1]  # (batch_size, model_dim)
        return self.fc_out(last_step)  # (batch_size, output_dim)

# Hyper-parameters for the demo model
input_dim = 10
model_dim = 64
num_heads = 8
num_layers = 4
output_dim = 1  # e.g. a single target for a regression task

# Instantiate the model
model = SensorTransformer(
    input_dim=input_dim,
    model_dim=model_dim,
    num_heads=num_heads,
    num_layers=num_layers,
    output_dim=output_dim,
)

# Create dummy sensor data (batch size = 32, sequence length = 50, features = 10)
sensor_data = torch.randn(32, 50, 10)

# Run a forward pass and report the shape of the result
output = model(sensor_data)
print(f'Output shape: {output.shape}')


Output shape: torch.Size([32, 1])




In [8]:
# Print the model's module hierarchy.
print(model)

SensorTransformer(
  (input_linear): Linear(in_features=10, out_features=64, bias=True)
  (pos_encoder): PositionalEncoding()
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
        )
        (linear1): Linear(in_features=64, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=64, bias=True)
        (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (fc_out): Linear(in_features=64, out_features=1, bias=True)
)
