In [1]:
from datasets import load_dataset, Dataset
ds = load_dataset("Salesforce/wikitext", "wikitext-103-raw-v1")

In [2]:
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, AutoTokenizer
device='cuda'
# モデルの準備
teacher_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

In [None]:
train_dataset=ds["train"].shuffle(seed=42).select(range(30000))
validation_dataset=ds["validation"].shuffle(seed=42).select(range(5000))
train_dataset = train_dataset["text"]
train_dataset = [item for item in train_dataset if item != '' and len(item) >= 50 and '@' not in item]
validation_dataset=validation_dataset["text"]
validation_dataset = [item for item in validation_dataset if item != '']


In [4]:
import re

train_dataset = [re.sub(r'[^a-zA-Z0-9 ]', '', item) for item in train_dataset]
train_dataset = [re.sub(r'\s+', ' ', item) for item in train_dataset]

In [5]:
train_dataset

[' The Mesozoic era is represented in the park by the model rock exposure showing a succession of beds namely the Jurassic and Cretaceous by models of dinosaurs and other animals known from mesozoic fossils and by suitable vegetation both living plants and models ',
 ' White Dog is a blunt highly cinematic parable about race relations that questions whether racism is a curable mental illness or learned behavior or if it is an untreatable disease The unnamed white German Shepherd is the metaphor of racism with his radically contrasting moments of innocent typical dog behavior when not around black persons and his snarling viciousness when he sees a target Paul Winfield s character Keys who believes he can help the dog unlearn this behavior represents the view that racism can be unlearned Keys attempts to reprogram the dog become a bold literalization of the race war and as the film progresses Keys becomes obsessed with the idea that he can cure the dog Much like Captain Ahab he declares

In [6]:
# 入力とラベルを設定
validation_data = []
for text in validation_dataset:
    tokenized = tokenizer(text, padding="max_length", max_length=128, truncation=True, return_tensors="pt")
    input_ids = tokenized['input_ids'].squeeze().tolist()
    # 次の単語のインデックスをラベルとして追加
    labels = input_ids[1:] + [tokenizer.pad_token_id]  # 最初の単語を除いて次の単語をラベルにする
    validation_data.append({"input_ids": input_ids, "labels": labels})

# Datasetの作成
validation_dataset = Dataset.from_list(validation_data)

In [7]:
from tqdm import tqdm
from datasets import Dataset

# 入力とラベルを設定
train_data = []
for text in tqdm(train_dataset, desc="Tokenizing dataset"):
    tokenized = tokenizer(text, padding="max_length", max_length=128, truncation=True, return_tensors="pt")
    input_ids = tokenized['input_ids'].squeeze().tolist()
    # 次の単語のインデックスをラベルとして追加
    labels = input_ids[1:] + [tokenizer.pad_token_id]  # 最初の単語を除いて次の単語をラベルにする
    train_data.append({"input_ids": input_ids, "labels": labels})

# Datasetの作成
train_dataset = Dataset.from_list(train_data)


Tokenizing dataset: 100%|██████████| 2020/2020 [00:00<00:00, 3598.60it/s]


In [8]:
print(train_dataset['input_ids'][10])
print(train_dataset['labels'][10])

[128000, 10771, 311, 279, 5165, 6017, 14821, 315, 25431, 1369, 809, 285, 1101, 11224, 430, 40424, 462, 88, 77383, 3235, 449, 38577, 3714, 32743, 2836, 309, 38966, 374, 832, 315, 279, 1403, 12474, 36467, 315, 6617, 18341, 61801, 220, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009]
[10771, 311, 279, 5165, 6017, 14821, 315, 25431, 1369, 

In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

teacher_model.to(device)
# トレーニング設定
teacher_model.train()

training_args = TrainingArguments(
    output_dir="./trainteacher",
    eval_strategy="no",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    num_train_epochs=5,
)

# Trainerのセットアップ
trainer = Trainer(
    model=teacher_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
)



In [10]:
# 教師モデルのトレーニング
trainer.train()


Step,Training Loss


KeyboardInterrupt: 

In [None]:
teacher_model.save_pretrained("./teacherlg")

In [26]:
teacher_model.eval()
eval_results=trainer.evaluate()
print(eval_results)

{'eval_loss': 4.124519348144531, 'eval_model_preparation_time': 0.0019, 'eval_runtime': 4.0364, 'eval_samples_per_second': 47.568, 'eval_steps_per_second': 5.946}


In [None]:
teacher_model.push_to_hub("llama-3epochs", private=True)