In [1]:
from datasets import load_dataset, Dataset
ds = load_dataset("Salesforce/wikitext", "wikitext-103-raw-v1")

In [2]:
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, AutoTokenizer
device='cuda'
# モデルの準備
teacher_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

In [3]:
train_dataset=ds["train"].shuffle(seed=42).select(range(1000))
validation_dataset=ds["validation"].shuffle(seed=42).select(range(300))
train_dataset = train_dataset["text"]
train_dataset = [item for item in train_dataset if item != '']
validation_dataset=validation_dataset["text"]
validation_dataset = [item for item in validation_dataset if item != '']


In [4]:
# 入力とラベルを設定
validation_data = []
for text in validation_dataset:
    tokenized = tokenizer(text, padding="max_length", max_length=128, truncation=True, return_tensors="pt")
    input_ids = tokenized['input_ids'].squeeze().tolist()
    # 次の単語のインデックスをラベルとして追加
    labels = input_ids[1:] + [tokenizer.pad_token_id]  # 最初の単語を除いて次の単語をラベルにする
    validation_data.append({"input_ids": input_ids, "labels": labels})

# Datasetの作成
validation_dataset = Dataset.from_list(validation_data)

In [5]:
from tqdm import tqdm
from datasets import Dataset

# 入力とラベルを設定
train_data = []
for text in tqdm(train_dataset, desc="Tokenizing dataset"):
    tokenized = tokenizer(text, padding="max_length", max_length=128, truncation=True, return_tensors="pt")
    input_ids = tokenized['input_ids'].squeeze().tolist()
    # 次の単語のインデックスをラベルとして追加
    labels = input_ids[1:] + [tokenizer.pad_token_id]  # 最初の単語を除いて次の単語をラベルにする
    train_data.append({"input_ids": input_ids, "labels": labels})

# Datasetの作成
train_dataset = Dataset.from_list(train_data)


Tokenizing dataset: 100%|██████████| 617/617 [00:00<00:00, 3289.93it/s]


In [6]:
print(train_dataset['input_ids'][10])
print(train_dataset['labels'][10])

[128000, 578, 2547, 430, 7437, 7353, 19092, 9021, 369, 30334, 374, 279, 30334, 87415, 12000, 320, 30334, 571, 12, 31, 13174, 883, 902, 20628, 520, 279, 25992, 9343, 1174, 323, 832, 1023, 2391, 279, 1060, 1174, 323, 374, 279, 2942, 430, 6089, 84261, 449, 279, 44250, 14726, 315, 279, 5426, 17047, 315, 279, 4562, 5415, 902, 9732, 57316, 17384, 1174, 477, 96662, 311, 30334, 662, 578, 30334, 87415, 12000, 374, 1903, 709, 315, 48962, 505, 279, 4562, 5961, 315, 279, 4892, 23179, 23590, 439, 1664, 439, 61759, 22712, 3697, 662, 35131, 362, 13, 33794, 388, 1174, 6063, 32724, 27390, 315, 279, 40007, 10554, 315, 279, 30708, 478, 351, 323, 264, 4562, 315, 279, 9052, 11650, 9323, 1174, 6244, 4872, 315, 279, 14956, 304, 220, 679, 15, 662, 1102]
[578, 2547, 430, 7437, 7353, 19092, 9021, 369, 30334, 374, 279, 30334, 87415, 12000, 320, 30334, 571, 12, 31, 13174, 883, 902, 20628, 520, 279, 25992, 9343, 1174, 323, 832, 1023, 2391, 279, 1060, 1174, 323, 374, 279, 2942, 430, 6089, 84261, 449, 279, 44250, 14

In [7]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

teacher_model.to(device)
# トレーニング設定
teacher_model.train()

training_args = TrainingArguments(
    output_dir="./trainteacher",
    eval_strategy="no",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
)

# Trainerのセットアップ
trainer = Trainer(
    model=teacher_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
)



In [None]:
# 教師モデルのトレーニング
trainer.train()


Step,Training Loss


In [None]:
teacher_model.save_pretrained("./teacherlg")

In [26]:
teacher_model.eval()
eval_results=trainer.evaluate()
print(eval_results)

{'eval_loss': 4.124519348144531, 'eval_model_preparation_time': 0.0019, 'eval_runtime': 4.0364, 'eval_samples_per_second': 47.568, 'eval_steps_per_second': 5.946}


In [None]:
teacher_model.push_to_hub("llama-3epochs", private=True)