In [106]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, AutoTokenizer
import re
from tqdm import tqdm
import torch
from torch.optim import AdamW
import matplotlib.pyplot as plt
from torch import nn

ds = load_dataset("mandarjoshi/trivia_qa", "rc.nocontext")
device='cuda'
# モデルの準備
student_model = AutoModelForCausalLM.from_pretrained("../newTrains/finetuned_model5")
teacher_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

train_dataset=ds["train"].shuffle(seed=42).select(range(1000))
validation_dataset=ds["validation"].shuffle(seed=42)


Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

In [107]:
train_dataset

Dataset({
    features: ['question', 'question_id', 'question_source', 'entity_pages', 'search_results', 'answer'],
    num_rows: 1000
})

In [108]:
train_datas = []

for i in tqdm(range(1000)):
    # 質問と答えを辞書形式で保存
    train_data = {
        'question': train_dataset['question'][i],
        'answer': train_dataset['answer'][i]['aliases'][0]
    }
    train_datas.append(train_data)

100%|██████████| 1000/1000 [00:38<00:00, 26.13it/s]


In [109]:
# アルファベット以外の文字を除去する関数
def clean_text(text):
    return re.sub(r'[^a-zA-Z ?]', '', text)

# 質問と回答を清掃
for data in train_datas:
    data['question'] = clean_text(data['question'])
    data['answer'] = clean_text(data['answer'])

In [110]:
len(train_datas)

1000

In [None]:
train_data = []
for text in tqdm(train_datas, desc="Tokenizing dataset"):
    tokenized = tokenizer(text['question'] + " A" + text['answer'], padding="max_length", max_length=64, truncation=True, return_tensors="pt")
    input_ids = tokenized['input_ids'].squeeze().tolist()
    attention_mask = tokenized['attention_mask'].squeeze().tolist()
    quest_pos = len(tokenizer(text['question'])['input_ids'])
    attention_mask[:quest_pos] = [0] * quest_pos
    labels = input_ids[1:] + [tokenizer.pad_token_id]
    for i in range(len(attention_mask)):
        if attention_mask[i]==0:
            labels[i]=-100


    train_data.append({"input_ids": input_ids, "labels": labels, "attention_mask":attention_mask})

Tokenizing dataset:  10%|█         | 101/1000 [00:00<00:00, 4654.60it/s]


IndexError: list assignment index out of range

In [112]:
train_data

[{'input_ids': [128000,
   23956,
   10334,
   5415,
   430,
   1274,
   8541,
   311,
   10205,
   311,
   872,
   1866,
   2237,
   315,
   99748,
   30,
   48388,
   1291,
   1132,
   31226,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009,
   128009],
  'labels': [0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   1291,
   1132,
   31226,
   128009,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
  

In [113]:
text={'question':"What is the color of an apple?", 'answer': 'Red'}
train_data = []
tokenized = tokenizer(text['question'] + " A" + text['answer'], padding="max_length", max_length=32, truncation=True, return_tensors="pt")
input_ids = tokenized['input_ids'].squeeze().tolist()
attention_mask = tokenized['attention_mask'].squeeze().tolist()
quest_pos = len(tokenizer(text['question'])['input_ids'])
attention_mask[:quest_pos] = [0] * quest_pos
labels = input_ids[1:] + [tokenizer.pad_token_id]
labels[-1]=-100
labels[attention_mask == 0] = -100
train_data.append({"input_ids": input_ids, "labels": labels, "attention_mask":attention_mask})

In [116]:
train_data[0]['attention_mask']

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [53]:
quest=tokenizer(text['question'])
quest

{'input_ids': [128000, 3923, 374, 279, 1933, 315, 459, 24149, 30], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}