In [27]:
import torch
from torch import nn
from datasets import load_dataset

dataset = load_dataset("mteb/tweet_sentiment_extraction")

In [28]:
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, Trainer, TrainingArguments
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

In [29]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [30]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

In [125]:
from torchinfo import summary

summary(custom_model, depth=6)

Layer (type:depth-idx)                             Param #
CustomGPT2ForSequenceClassification                --
├─CustomGPT2Model: 1-1                             --
│    └─Embedding: 2-1                              38,597,376
│    └─Embedding: 2-2                              786,432
│    └─Dropout: 2-3                                --
│    └─ModuleList: 2-4                             --
│    │    └─GPT2Block: 3-1                         --
│    │    │    └─LayerNorm: 4-1                    1,536
│    │    │    └─CustomAttention: 4-2              --
│    │    │    │    └─GPT2SdpaAttention: 5-1       --
│    │    │    │    │    └─Conv1D: 6-1             1,771,776
│    │    │    │    │    └─Conv1D: 6-2             590,592
│    │    │    │    │    └─Dropout: 6-3            --
│    │    │    │    │    └─Dropout: 6-4            --
│    │    │    │    └─Linear: 5-2                  590,592
│    │    │    └─LayerNorm: 4-3                    1,536
│    │    │    └─GPT2MLP: 4-4            

In [138]:
from transformers import GPT2Model, GPT2Config, GPT2LMHeadModel

class CustomAttention(nn.Module):
    def __init__(self, original_attention):
        super().__init__()
        self.original_attention = original_attention
        
        # Linear層を追加
        self.linear = nn.Linear(original_attention.embed_dim, original_attention.embed_dim)

    def forward(self, hidden_states, attention_mask=None, layer_past=None, head_mask=None, use_cache=None, output_attentions=None, ):
        # Linear層を適用
        linear_output = self.linear(hidden_states)

        # 元のAttentionの処理を呼び出す
        return self.original_attention(linear_output, attention_mask=attention_mask, layer_past=layer_past)

class CustomGPT2Model(GPT2Model):
    def __init__(self, config):
        super().__init__(config)
        
        # 各ブロックのAttentionをカスタマイズ
        for i, block in enumerate(self.h):  # self.hに直接アクセス
            block.attn = CustomAttention(block.attn)
    

class CustomGPT2ForSequenceClassification(GPT2LMHeadModel):
    def __init__(self, config, num_labels):
        super().__init__(config)
        self.num_labels = num_labels
        self.transformer = CustomGPT2Model(config)  # CustomGPT2Modelを初期化
        self.lm_head = nn.Linear(config.n_embd, num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None):
        outputs = self.transformer(input_ids, attention_mask=attention_mask)
        
        # 最後の隠れ層の出力を取得
        hidden_states = outputs.last_hidden_state[:, -1, :]
        
        
        # ロジットを計算
        logits = self.lm_head(hidden_states)
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        else:
            print("label is not exist")
        return (loss, logits) if loss is not None else logits

# モデルの初期化
config = GPT2Config.from_pretrained("gpt2", num_labels=3)
custom_model = CustomGPT2ForSequenceClassification(config, num_labels=3)

In [139]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
   output_dir="test_trainer3",
   evaluation_strategy="epoch",
   per_device_train_batch_size=1,
   per_device_eval_batch_size=1,
   gradient_accumulation_steps=10
   )


trainer = Trainer(
   model=custom_model,
   args=training_args,
   train_dataset=small_train_dataset,
   eval_dataset=small_eval_dataset,
)



In [140]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,1.09294
2,No log,1.072426
3,No log,1.136135


TrainOutput(global_step=300, training_loss=1.05141357421875, metrics={'train_runtime': 354.246, 'train_samples_per_second': 8.469, 'train_steps_per_second': 0.847, 'total_flos': 1698424215552000.0, 'train_loss': 1.05141357421875, 'epoch': 3.0})

In [None]:
trainer.train()

In [141]:
import evaluate

trainer.evaluate()

{'eval_loss': 1.1361347436904907,
 'eval_runtime': 28.3736,
 'eval_samples_per_second': 35.244,
 'eval_steps_per_second': 35.244,
 'epoch': 3.0}