In [1]:
from datasets import load_dataset, Dataset
ds = load_dataset("Salesforce/wikitext", "wikitext-103-raw-v1")

In [2]:
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, AutoTokenizer
device='cuda'
# モデルの準備
teacher_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

In [3]:
train_dataset=ds["train"].shuffle(seed=42).select(range(1000))
validation_dataset=ds["validation"].shuffle(seed=42).select(range(300))
train_dataset = train_dataset["text"]
train_dataset = [item for item in train_dataset if item != '']
validation_dataset=validation_dataset["text"]
validation_dataset = [item for item in validation_dataset if item != '']


In [4]:
# 入力とラベルを設定
validation_data = []
for text in validation_dataset:
    tokenized = tokenizer(text, padding="max_length", max_length=256, truncation=True, return_tensors="pt")
    input_ids = tokenized['input_ids'].squeeze().tolist()
    # 次の単語のインデックスをラベルとして追加
    labels = input_ids[1:] + [tokenizer.pad_token_id]  # 最初の単語を除いて次の単語をラベルにする
    validation_data.append({"input_ids": input_ids, "labels": labels})

# Datasetの作成
validation_dataset = Dataset.from_list(validation_data)

In [5]:
from tqdm import tqdm
from datasets import Dataset

# 入力とラベルを設定
train_data = []
for text in tqdm(train_dataset, desc="Tokenizing dataset"):
    tokenized = tokenizer(text, padding="max_length", max_length=256, truncation=True, return_tensors="pt")
    input_ids = tokenized['input_ids'].squeeze().tolist()
    # 次の単語のインデックスをラベルとして追加
    labels = input_ids[1:] + [tokenizer.pad_token_id]  # 最初の単語を除いて次の単語をラベルにする
    train_data.append({"input_ids": input_ids, "labels": labels})

# Datasetの作成
train_dataset = Dataset.from_list(train_data)


Tokenizing dataset: 100%|██████████| 617/617 [00:00<00:00, 2829.06it/s]


In [6]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

teacher_model.to(device)
# トレーニング設定
teacher_model.train()

training_args = TrainingArguments(
    output_dir="./teacher_llama",
    eval_strategy="epoch",
    learning_rate=5e-4,
    per_device_train_batch_size=4,
    num_train_epochs=1,
)

# Trainerのセットアップ
trainer = Trainer(
    model=teacher_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
)



In [None]:
# 教師モデルのトレーニング
trainer.train()
teacher_model.save_pretrained("./teacher")

Epoch,Training Loss,Validation Loss


In [None]:
teacher_model.eval()
eval_results=trainer.evaluate()

In [8]:
print(eval_results)

{'eval_loss': 4.476005554199219, 'eval_model_preparation_time': 0.0023, 'eval_runtime': 7.9403, 'eval_samples_per_second': 24.18, 'eval_steps_per_second': 3.023}
