In [5]:
from datasets import load_dataset, Dataset
ds = load_dataset("Salesforce/wikitext", "wikitext-103-raw-v1")

In [2]:
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, AutoTokenizer
device='cuda'
# モデルの準備
teacher_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

In [6]:
train_dataset=ds["train"].shuffle(seed=42).select(range(30000))
validation_dataset=ds["validation"].shuffle(seed=42).select(range(3500))
train_dataset = train_dataset["text"]
train_dataset = [item for item in train_dataset if item != '' and len(item) >= 50 and '@' not in item]
validation_dataset=validation_dataset["text"]
validation_dataset = [item for item in validation_dataset if item != '' and len(item) >= 50 and '@' not in item]


In [None]:
import re

train_dataset = [re.sub(r'[^a-zA-Z0-9 ]', '', item) for item in train_dataset]
train_dataset = [re.sub(r'\s+', ' ', item) for item in train_dataset]
validation_dataset = [re.sub(r'[^a-zA-Z0-9 ]', '', item) for item in validation_dataset]
validation_dataset = [re.sub(r'\s+', ' ', item) for item in validation_dataset]

In [8]:
validation_dataset

[' Level IV and V : Direct connection ramps ( two levels ) , eliminating the left exits of the modified cloverleaf \n',
 ' At 21 : 30 on 12 September , the Japanese cruiser Sendai and three destroyers shelled the Lunga perimeter for 20 minutes and illuminated the ridge with a searchlight . Japanese artillery began shelling the Marine lines , but did little damage . At the same time , scattered groups of Kawaguchi \'s troops began skirmishing with Marines around the ridge . Kawaguchi \'s 1st Battalion — led by Major Yukichi Kokusho — attacked the Raider \'s " C " company between the lagoon and the Lunga River , overrunning at least one platoon and forcing the Marine company to fall back to the ridge . Kokusho \'s unit became entangled with troops from Kawaguchi \'s 3rd Battalion under Lieutenant Colonel Kusukichi Watanabe , who were still struggling to reach their attack positions , and the resulting confusion effectively stopped the Japanese attack on the ridge that night . Kawaguchi ,

In [9]:
# 入力とラベルを設定
validation_data = []
for text in validation_dataset:
    tokenized = tokenizer(text, padding="max_length", max_length=128, truncation=True, return_tensors="pt")
    input_ids = tokenized['input_ids'].squeeze().tolist()
    # 次の単語のインデックスをラベルとして追加
    labels = input_ids[1:] + [tokenizer.pad_token_id]  # 最初の単語を除いて次の単語をラベルにする
    validation_data.append({"input_ids": input_ids, "labels": labels})

# Datasetの作成
validation_dataset = Dataset.from_list(validation_data)

In [10]:
from tqdm import tqdm
from datasets import Dataset

# 入力とラベルを設定
train_data = []
for text in tqdm(train_dataset, desc="Tokenizing dataset"):
    tokenized = tokenizer(text, padding="max_length", max_length=128, truncation=True, return_tensors="pt")
    input_ids = tokenized['input_ids'].squeeze().tolist()
    # 次の単語のインデックスをラベルとして追加
    labels = input_ids[1:] + [tokenizer.pad_token_id]  # 最初の単語を除いて次の単語をラベルにする
    train_data.append({"input_ids": input_ids, "labels": labels})

# Datasetの作成
train_dataset = Dataset.from_list(train_data)


Tokenizing dataset: 100%|██████████| 6067/6067 [00:01<00:00, 3628.84it/s]


In [11]:
print(train_dataset['input_ids'][10])
print(train_dataset['labels'][10])

[128000, 10771, 311, 279, 5165, 6017, 14821, 315, 25431, 1369, 809, 285, 1101, 11224, 430, 40424, 462, 88, 77383, 3235, 449, 38577, 3714, 32743, 2836, 309, 38966, 374, 832, 315, 279, 1403, 12474, 36467, 315, 6617, 18341, 61801, 220, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009]
[10771, 311, 279, 5165, 6017, 14821, 315, 25431, 1369, 

In [12]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

teacher_model.to(device)
# トレーニング設定
teacher_model.train()

training_args = TrainingArguments(
    output_dir="./trainteacher",
    eval_strategy="no",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    num_train_epochs=5,
)

# Trainerのセットアップ
trainer = Trainer(
    model=teacher_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
)



In [10]:
# 教師モデルのトレーニング
trainer.train()


Step,Training Loss


KeyboardInterrupt: 

In [None]:
teacher_model.save_pretrained("./teacherlg")

In [26]:
teacher_model.eval()
eval_results=trainer.evaluate()
print(eval_results)

{'eval_loss': 4.124519348144531, 'eval_model_preparation_time': 0.0019, 'eval_runtime': 4.0364, 'eval_samples_per_second': 47.568, 'eval_steps_per_second': 5.946}


In [None]:
teacher_model.push_to_hub("llama-3epochs", private=True)