In [14]:
from datasets import load_dataset, Dataset
ds = load_dataset("Salesforce/wikitext", "wikitext-103-raw-v1")

In [15]:
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, AutoTokenizer
device='cuda'
# モデルの準備
teacher_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

In [16]:
train_dataset=ds["train"].shuffle(seed=42).select(range(20000))
validation_dataset=ds["validation"].shuffle(seed=42).select(range(3500))
train_dataset = train_dataset["text"]
train_dataset = [item for item in train_dataset if item != '' and len(item) >= 50 and '@' not in item]
validation_dataset=validation_dataset["text"]
validation_dataset = [item for item in validation_dataset if item != '' and len(item) >= 50 and '@' not in item]


In [17]:
import re

train_dataset = [re.sub(r'[^a-zA-Z ]', '', item) for item in train_dataset]
train_dataset = [re.sub(r'\s+', ' ', item) for item in train_dataset]
validation_dataset = [re.sub(r'[^a-zA-Z ]', '', item) for item in validation_dataset]
validation_dataset = [re.sub(r'\s+', ' ', item) for item in validation_dataset]

In [24]:
train_dataset[665]

{'input_ids': [128000,
  9220,
  4106,
  47497,
  430,
  34234,
  25024,
  374,
  11033,
  10434,
  304,
  279,
  6693,
  323,
  6957,
  279,
  1938,
  323,
  15600,
  902,
  374,
  3629,
  274,
  6586,
  304,
  279,
  11714,
  31125,
  374,
  6118,
  88340,
  449,
  4415,
  41390,
  29437,
  477,
  296,
  640,
  72,
  4912,
  2701,
  39361,
  578,
  11033,
  315,
  5873,
  374,
  6118,
  24666,
  477,
  35217,
  11033,
  35217,
  11033,
  374,
  4528,
  311,
  24666,
  11033,
  719,
  279,
  4846,
  374,
  993,
  7725,
  449,
  3786,
  309,
  316,
  323,
  374,
  6118,
  7120,
  4589,
  6901,
  220,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  

In [19]:
# 入力とラベルを設定
validation_data = []
for text in validation_dataset:
    tokenized = tokenizer(text, padding="max_length", max_length=128, truncation=True, return_tensors="pt")
    input_ids = tokenized['input_ids'].squeeze().tolist()
    # 次の単語のインデックスをラベルとして追加
    labels = input_ids[1:] + [tokenizer.pad_token_id]  # 最初の単語を除いて次の単語をラベルにする
    validation_data.append({"input_ids": input_ids, "labels": labels})

# Datasetの作成
validation_dataset = Dataset.from_list(validation_data)

In [20]:
from tqdm import tqdm
from datasets import Dataset

# 入力とラベルを設定
train_data = []
for text in tqdm(train_dataset, desc="Tokenizing dataset"):
    tokenized = tokenizer(text, padding="max_length", max_length=128, truncation=True, return_tensors="pt")
    input_ids = tokenized['input_ids'].squeeze().tolist()
    # 次の単語のインデックスをラベルとして追加
    labels = input_ids[1:] + [tokenizer.pad_token_id]  # 最初の単語を除いて次の単語をラベルにする
    train_data.append({"input_ids": input_ids, "labels": labels})

# Datasetの作成
train_dataset = Dataset.from_list(train_data)


Tokenizing dataset: 100%|██████████| 4063/4063 [00:01<00:00, 3688.89it/s]


In [21]:
print(train_dataset['input_ids'][10])
print(train_dataset['labels'][10])

[128000, 10771, 311, 279, 5165, 6017, 14821, 315, 25431, 1369, 809, 285, 1101, 11224, 430, 40424, 462, 88, 77383, 3235, 449, 38577, 3714, 32743, 2836, 309, 38966, 374, 832, 315, 279, 1403, 12474, 36467, 315, 6617, 18341, 61801, 220, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009]
[10771, 311, 279, 5165, 6017, 14821, 315, 25431, 1369, 

In [22]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

teacher_model.to(device)
# トレーニング設定
teacher_model.train()

training_args = TrainingArguments(
    output_dir="./trainteacher",
    eval_strategy="no",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    num_train_epochs=5,
)

# Trainerのセットアップ
trainer = Trainer(
    model=teacher_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
)



In [23]:
# 教師モデルのトレーニング
trainer.train()
teacher_model.save_pretrained("./teacherlg")

Step,Training Loss


KeyboardInterrupt: 

In [26]:
teacher_model.eval()
eval_results=trainer.evaluate()
print(eval_results)

{'eval_loss': 4.124519348144531, 'eval_model_preparation_time': 0.0019, 'eval_runtime': 4.0364, 'eval_samples_per_second': 47.568, 'eval_steps_per_second': 5.946}


In [None]:
teacher_model.push_to_hub("llama-3epochs", private=True)