In [6]:
from datasets import load_dataset
ds = load_dataset("Salesforce/wikitext", "wikitext-103-raw-v1")

In [7]:
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, AutoTokenizer
device='cuda'
# モデルの準備
teacher_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

In [8]:
train_dataset=ds["train"].shuffle(seed=42).select(range(20000))
validation_dataset=ds["validation"].shuffle(seed=42).select(range(3500))
train_dataset = train_dataset["text"]
train_dataset = [item for item in train_dataset if item != '' and len(item) >= 50 and '@' not in item]
validation_dataset=validation_dataset["text"]
validation_dataset = [item for item in validation_dataset if item != '' and len(item) >= 50 and '@' not in item]

import re

train_dataset = [re.sub(r'[^a-zA-Z ]', '', item) for item in train_dataset]
train_dataset = [re.sub(r'\s+', ' ', item) for item in train_dataset]
validation_dataset = [re.sub(r'[^a-zA-Z ]', '', item) for item in validation_dataset]
validation_dataset = [re.sub(r'\s+', ' ', item) for item in validation_dataset]
train_dataset[0]

' The Mesozoic era is represented in the park by the model rock exposure showing a succession of beds namely the Jurassic and Cretaceous by models of dinosaurs and other animals known from mesozoic fossils and by suitable vegetation both living plants and models '

In [26]:
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader


# 入力とラベルを設定
train_data = []
for text in tqdm(train_dataset, desc="Tokenizing dataset"):
    tokenized = tokenizer(text, padding="max_length", max_length=32, truncation=True, return_tensors="pt")
    input_ids = tokenized['input_ids'].squeeze().tolist()
    attention_mask = tokenized['attention_mask'].squeeze().tolist()
    # 次の単語のインデックスをラベルとして追加
    labels = input_ids[1:] + [tokenizer.pad_token_id]  # 最初の単語を除いて次の単語をラベルにする
    labels[-1]=-100
    train_data.append({"input_ids": input_ids, "labels": labels, "attention_mask":attention_mask})


Tokenizing dataset: 100%|██████████| 4063/4063 [00:02<00:00, 2020.84it/s]


In [27]:
import torch
input_ids = [item["input_ids"] for item in train_data]
labels = [item["labels"] for item in train_data]
attention_mask = [item["attention_mask"] for item in train_data]

In [29]:
batch_train=[]
for i in range(250):
    batch_input=[input_ids[i+0], input_ids[i+1], input_ids[i+2], input_ids[i+3]]
    batch_train.append(batch_input)
input_ids=batch_train

batch_train=[]
for i in range(250):
    batch_input=[labels[i+0], labels[i+1], labels[i+2], labels[i+3]]
    batch_train.append(batch_input)
labels=batch_train

batch_train=[]
for i in range(250):
    batch_input=[attention_mask[i+0], attention_mask[i+1], attention_mask[i+2], attention_mask[i+3]]
    batch_train.append(batch_input)
attention_mask=batch_train

In [30]:
input_ids_tensor = torch.tensor(batch_train, dtype=torch.long)
labels_tensor = torch.tensor(labels, dtype=torch.long)
attention_mask_tensor = torch.tensor(attention_mask, dtype=torch.long)

In [31]:
from torch.optim import AdamW

# オプティマイザの定義
optimizer = AdamW(teacher_model.parameters(), lr=5e-5)


In [32]:
input_ids_tensor=input_ids_tensor.to(device)
labels_tensor=labels_tensor.to(device)
attention_mask_tensor = attention_mask_tensor.to(device)
teacher_model.to(device)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm):

In [35]:
print(input_ids[0])
print(labels[0])
print(attention_mask[0])

[[128000, 578, 36684, 96614, 292, 11639, 374, 15609, 304, 279, 6246, 555, 279, 1646, 7091, 14675, 9204, 264, 50787, 315, 28036, 32125, 279, 84474, 323, 356, 2171, 77140, 555, 4211, 315, 65375], [128000, 5929, 14588, 374, 264, 49770, 7701, 65765, 1370, 481, 922, 7102, 4398, 430, 4860, 3508, 27052, 374, 264, 2917, 481, 10723, 17563, 477, 9687, 7865, 477, 422, 433, 374, 459, 13365], [128000, 59895, 2214, 10554, 4562, 29607, 56872, 6267, 279, 8857, 315, 279, 5597, 304, 279, 71085, 1162, 56872, 29786, 304, 459, 7274, 449, 578, 1561, 4356, 8691, 1102, 574, 8196, 430, 420], [128000, 16450, 10357, 321, 5676, 3468, 46965, 386, 8512, 8384, 648, 274, 6691, 374, 264, 15779, 889, 374, 439, 459, 9191, 289, 12329, 810, 6940, 349, 2373, 315, 8384, 648, 3005, 574]]
[[578, 36684, 96614, 292, 11639, 374, 15609, 304, 279, 6246, 555, 279, 1646, 7091, 14675, 9204, 264, 50787, 315, 28036, 32125, 279, 84474, 323, 356, 2171, 77140, 555, 4211, 315, 65375, -100], [5929, 14588, 374, 264, 49770, 7701, 65765, 1370,

In [37]:
for i in tqdm(range(250)):
 
    input_ids=input_ids_tensor[i]
    labels=labels_tensor[i]
    attention_mask=attention_mask_tensor[i]
    optimizer.zero_grad()
    outputs = teacher_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)


100%|██████████| 250/250 [00:06<00:00, 36.20it/s]
