In [10]:
import torch
from torch import nn
from datasets import load_dataset

dataset = load_dataset("mteb/tweet_sentiment_extraction")

In [11]:
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, Trainer, TrainingArguments
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

In [12]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [13]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

In [14]:
class CombinedModel(torch.nn.Module):
    def __init__(self, gpt2_model):
        super(CombinedModel, self).__init__()
        self.gpt2_model = gpt2_model
        
    def forward(self, input_ids, attention_mask=None, labels=None):
        # GPT-2の出力を取得
        outputs = self.gpt2_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return outputs

In [27]:
from torchinfo import summary

summary(model, depth=6)

Layer (type:depth-idx)                             Param #
GPT2ForSequenceClassification                      --
├─GPT2Model: 1-1                                   --
│    └─Embedding: 2-1                              38,597,376
│    └─Embedding: 2-2                              786,432
│    └─Dropout: 2-3                                --
│    └─ModuleList: 2-4                             --
│    │    └─GPT2Block: 3-1                         --
│    │    │    └─LayerNorm: 4-1                    1,536
│    │    │    └─GPT2SdpaAttention: 4-2            --
│    │    │    │    └─Conv1D: 5-1                  1,771,776
│    │    │    │    └─Conv1D: 5-2                  590,592
│    │    │    │    └─Dropout: 5-3                 --
│    │    │    │    └─Dropout: 5-4                 --
│    │    │    └─LayerNorm: 4-3                    1,536
│    │    │    └─GPT2MLP: 4-4                      --
│    │    │    │    └─Conv1D: 5-5                  2,362,368
│    │    │    │    └─Conv1D: 5-6      

In [15]:
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=3)
combined_model = CombinedModel(model)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
print(combined_model)

CombinedModel(
  (gpt2_model): GPT2ForSequenceClassification(
    (transformer): GPT2Model(
      (wte): Embedding(50257, 768)
      (wpe): Embedding(1024, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-11): 12 x GPT2Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2SdpaAttention(
            (c_attn): Conv1D(nf=2304, nx=768)
            (c_proj): Conv1D(nf=768, nx=768)
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D(nf=3072, nx=768)
            (c_proj): Conv1D(nf=768, nx=3072)
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
    (score): Linear(in_feat

In [20]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
   output_dir="test_trainer3",
   evaluation_strategy="epoch",
   per_device_train_batch_size=1,
   per_device_eval_batch_size=1,
   gradient_accumulation_steps=10
   )


trainer = Trainer(
   model=combined_model,
   args=training_args,
   train_dataset=small_train_dataset,
   eval_dataset=small_eval_dataset,
)



In [21]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.930751
2,No log,0.738174
3,No log,0.684408


TrainOutput(global_step=300, training_loss=0.7629056803385417, metrics={'train_runtime': 307.6741, 'train_samples_per_second': 9.751, 'train_steps_per_second': 0.975, 'total_flos': 0.0, 'train_loss': 0.7629056803385417, 'epoch': 3.0})

In [22]:
import evaluate

trainer.evaluate()

{'eval_loss': 0.6844080686569214,
 'eval_runtime': 24.6387,
 'eval_samples_per_second': 40.587,
 'eval_steps_per_second': 40.587,
 'epoch': 3.0}