In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoModelForCausalLM, AutoTokenizer
import torch.nn.functional as F

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from datasets import load_dataset

# OpenWebTextデータセットのロード
dataset = load_dataset('openwebtext')


Downloading data: 100%|██████████| 21/21 [02:34<00:00,  7.34s/files]
Generating train split: 100%|██████████| 8013769/8013769 [18:24<00:00, 7255.80 examples/s]


In [11]:
from datasets import load_dataset

# OpenWebTextデータセットを読み込む
dataset = load_dataset('openwebtext')

# 'train'セットから5000サンプルを取得
limited_dataset = dataset['train'].select(range(5000))

# データの確認
print(limited_dataset)


Dataset({
    features: ['text'],
    num_rows: 5000
})


In [3]:
from torch.utils.data import DataLoader

# バッチサイズを指定
batch_size = 32
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [4]:

from transformers import GPT2Tokenizer, GPT2LMHeadModel

# デバイスの設定
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# トークナイザーと教師モデルをロード
teacher_model = GPT2LMHeadModel.from_pretrained('gpt2')
student_model = GPT2LMHeadModel.from_pretrained('gpt2')
teacher_model.to(device)  # GPUを使用する場合
student_model.to(device)

# トークナイザーをロード
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')


In [None]:
tokenizer.pad_token = tokenizer.eos_token

In [8]:
def distillation_loss(student_logits, teacher_logits, temperature=2.0):
    teacher_probs = F.softmax(teacher_logits / temperature, dim=-1)
    student_probs = F.log_softmax(student_logits / temperature, dim=-1)
    return F.kl_div(student_probs, teacher_probs) * (temperature ** 2)

In [None]:
from tqdm import tqdm
import torch.optim as optim

# オプティマイザーの設定
optimizer = optim.AdamW(student_model.parameters(), lr=5e-5)

num_epochs = 3  # エポック数
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    epoch_progress = tqdm(limited_dataset, desc="Batch Progress", leave=False)
    
    for example in epoch_progress:
        input_text = example['text']
        inputs = tokenizer(input_text, return_tensors='pt', truncation=True, padding=True).to('cuda')

        # 教師モデルの出力を取得
        with torch.no_grad():
            teacher_output = teacher_model(**inputs)
            teacher_logits = teacher_output.logits

        # 生徒モデルの出力を取得
        student_output = student_model(**inputs)
        student_logits = student_output.logits

        # 蒸留損失の計算
        loss = distillation_loss(student_logits, teacher_logits)
        
        # 勾配計算とオプティマイザーのステップ
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # 各バッチの損失を進捗バーに表示
        epoch_progress.set_postfix({"Loss": loss.item()})
    
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}")

Epoch 1/3


                                                                                 

Epoch 1/3, Loss: 5.995044460860299e-08
Epoch 2/3


                                                                                 

Epoch 2/3, Loss: 1.1594470095133147e-07
Epoch 3/3


                                                                                 

Epoch 3/3, Loss: 1.8289404124516295e-07




In [None]:
# トレーニング後のモデルを保存
student_model.save_pretrained('mistral_distilled_model')
student_tokenizer.save_pretrained('mistral_distilled_model')