In [14]:
from transformers import GPT2Tokenizer
# トークナイザーをロード
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [15]:
from datasets import load_dataset

# OpenWebTextデータセットのロード
dataset = load_dataset('openwebtext')

# 'train'セットから5000サンプルを取得
limited_dataset = dataset['train'].select(range(5000))

print(limited_dataset)

Dataset({
    features: ['text'],
    num_rows: 5000
})


In [16]:
context_length = 128

In [17]:
import torch
from transformers import GPT2LMHeadModel, AutoConfig


# GPT-2の設定を作成（小型モデル用）
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
) # "gpt2"を指定して小型モデルを取得
# モデルを初期化
model = GPT2LMHeadModel(config)

In [19]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [21]:
tokenizer.pad_token = tokenizer.eos_token

In [25]:
from tqdm import tqdm
import torch.optim as optim
import torch.nn.functional as F

# オプティマイザーの設定
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
model.train()  

num_epochs = 3  # エポック数
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    epoch_progress = tqdm(limited_dataset, desc="Batch Progress", leave=False)
    
    for example in epoch_progress:
        input_text = example['text']
        inputs = tokenizer(input_text, return_tensors='pt', truncation=True, padding=True).to('cuda')

        output = model(**inputs)
        logits = output.logits

        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), inputs["input_ids"].view(-1).to('cuda'))
        
        # 勾配計算とオプティマイザーのステップ
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # 各バッチの損失を進捗バーに表示
        epoch_progress.set_postfix({"Loss": loss.item()})
    
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}")

Epoch 1/3


                                                                                  

Epoch 1/3, Loss: 0.0005087573081254959
Epoch 2/3


                                                                                 

KeyboardInterrupt: 

In [26]:
# 評価関数の定義
def evaluate_model(model, tokenizer, input_texts, device):
    model.eval()  # 評価モード
    total_loss = 0
    total_perplexity = 0
    count = 0
    with torch.no_grad():
        for text in input_texts:
            inputs = tokenizer(text, return_tensors='pt').to(device)
            outputs = model(**inputs, labels=inputs["input_ids"])  # 入力をモデルに渡す
            loss = outputs.loss.item()
            total_loss += loss
            perplexity = torch.exp(torch.tensor(loss))
            total_perplexity += perplexity.item()
            count += 1

    # 平均損失とパープレキシティを計算
    avg_loss = total_loss / count if count > 0 else float('inf')
    avg_perplexity = total_perplexity / count if count > 0 else float('inf')
    return avg_loss, avg_perplexity

# 使用例
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 評価データを定義
input_texts = ["This is a test sentence.", "How are you today?"]

# 教師モデルの評価
loss, perplexity = evaluate_model(model, tokenizer, input_texts, device)
print(f"Model - Loss: {loss:.4f}, Perplexity: {perplexity:.4f}")

Model - Loss: 22.4715, Perplexity: 7314218112.0000


In [30]:
import os

file_path = './distill_GPTlm_model/model.safetensors'
file_size = os.path.getsize(file_path)  # ファイルサイズをバイトで取得
print(f"File size: {file_size / (1024 * 1024):.2f} MB")  # MB単位で表示

File size: 474.71 MB
