In [1]:
import os

def get_all_directories(folder_path):
    """Return the set of directories under ``folder_path`` that directly contain files.

    The tree is traversed with ``os.walk``. A directory is included only when
    its own file list is non-empty, so directories that hold nothing but
    subdirectories (or nothing at all) are left out.
    """
    return {
        current_dir
        for current_dir, _subdirs, filenames in os.walk(folder_path)
        if filenames
    }

# Example usage: list the directories under ./model/ that contain files.
folder_path = "./model/"
directories = get_all_directories(folder_path)
print("ディレクトリ名一覧:")
# A set has no defined iteration order; sort so the listing is stable across runs.
for directory in sorted(directories):
    print(directory)

ディレクトリ名一覧:
./model/teacher_model
./model/teacher_model_finetuned
./model/normal_model_distill
./model/distill_model_distill
./model/normal_model
./model/distill_model_QLoRA
./model/normal_model_QLoRA
./model/normal_model_train


In [4]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, AutoTokenizer
import optuna
import re
from tqdm import tqdm
import torch
from torch.nn import functional as F
from torch.optim import AdamW
import matplotlib.pyplot as plt
from torch import nn
import gc
import sys


# Load the SQuAD dataset through `datasets.load_dataset`.
ds = load_dataset("rajpurkar/squad")
# Target device name; hard-coded to CUDA.
device='cuda'
# Llama-3.2-1B tokenizer; the EOS token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

# NOTE(review): "./model/teacher_model2" is not among the directories printed by
# the first cell's output (teacher_model, teacher_model_finetuned, ...) — confirm the path.
teacher_model = AutoModelForCausalLM.from_pretrained("./model/teacher_model2")

# data_size is passed to make_data() below as the cap on cleaned training texts.
data_size = 1000
# Not referenced in this cell.
data_size_v = 600

# size is the number of 4-example batches requested from make_tensor() below.
size=100
# Not referenced in this cell.
size_v=100

# Display tensors in full instead of the summarised "..." form.
torch.set_printoptions(profile="full")

def reshape(dataset, d_size):
    """Flatten SQuAD-style rows into cleaned ``"C: ... Q: ... A: ..."`` strings.

    Each row is rendered as ``"C: <context> Q: <question> A: <first answer>"``.
    A row is dropped when the rendered text is shorter than 50 characters or
    contains ``@``. Surviving texts have every character outside
    ``[a-zA-Z0-9 .:?]`` removed and runs of whitespace collapsed to one space.

    Args:
        dataset: Indexable, sized collection of rows; each row must provide
            ``row["context"]``, ``row["question"]`` and
            ``row["answers"]["text"][0]``.
        d_size: Bound used for the final ``[:d_size]`` slice.

    Returns:
        list[str]: Cleaned texts in dataset order, sliced with ``[:d_size]``.
    """
    # Rows are filtered and cleaned independently of each other, so once d_size
    # texts have been kept no later row can reach the output. Early exit is only
    # valid for a non-negative integer; None or negative values need the full
    # list for the slice to keep its usual meaning.
    limit = d_size if isinstance(d_size, int) and d_size >= 0 else None

    kept = []
    for i in range(len(dataset)):
        if limit is not None and len(kept) >= limit:
            break
        row = dataset[i]  # fetch the row once instead of once per field
        text = "C: " + row["context"] + " Q: " + row["question"] + " A: " + row["answers"]["text"][0]
        if len(text) < 50 or '@' in text:
            continue
        text = re.sub(r'[^a-zA-Z0-9 .:?]', '', text)
        # Tabs/newlines were already removed by the substitution above, so this
        # only squeezes repeated spaces.
        kept.append(re.sub(r'\s+', ' ', text))
    return kept[:d_size]

def max_length(dataset):
    """Print the length of the longest element of ``dataset`` (0 when it is empty).

    Nothing is returned; the value is only written to stdout.
    """
    longest = max((len(entry) for entry in dataset), default=0)
    print(longest)

def batch(input, size, batch_size=4):
    """Group the leading items of ``input`` into ``size`` consecutive batches.

    Args:
        input: Indexable sequence of items.
        size: Number of batches to build.
        batch_size: Items per batch. Defaults to 4, the width that was
            previously hard-coded.

    Returns:
        list[list]: ``size`` lists; batch ``b`` holds items
        ``batch_size * b`` through ``batch_size * b + batch_size - 1``.
        Items past ``size * batch_size`` are ignored.

    Raises:
        IndexError: For list input holding fewer than ``size * batch_size`` items.
    """
    # Index element by element (no slicing) so a short input still fails loudly
    # instead of silently producing a ragged final batch.
    return [
        [input[batch_size * b + k] for k in range(batch_size)]
        for b in range(size)
    ]

def make_data(data, d_size, max_len=256, ignore_index=128001):
    """Tokenize cleaned SQuAD texts into fixed-length next-token training samples.

    Texts come from ``reshape(data, d_size)``. A text whose unpadded
    tokenization is longer than ``max_len`` is skipped. For each remaining text
    the labels are the input ids shifted left by one (with the pad id appended),
    and the leading label positions covering the context/question prefix are
    overwritten with ``ignore_index``.

    Args:
        data: Rows accepted by ``reshape``.
        d_size: Cap on the number of cleaned texts, forwarded to ``reshape``.
        max_len: Padded sequence length; defaults to the previous constant 256.
        ignore_index: Label value written over the prefix; defaults to the
            previous constant 128001.

    Returns:
        list[dict]: One dict per kept text with ``"input_ids"``, ``"labels"``
        and ``"attention_mask"``, each a list of ``max_len`` ints.
    """
    texts = reshape(data, d_size)
    samples = []
    for text in tqdm(texts, desc="Tokenizing dataset"):
        # Keep only texts that fit without truncation.
        if len(tokenizer(text)['input_ids']) > max_len:
            continue

        # reshape() joins the answer on with " A: ". Searching for the first
        # "A:" would also match text inside the context (e.g. "USA:") and put
        # the mask boundary far too early, so locate the last separator instead.
        # Fall back to the old lookup if the separator is somehow missing.
        sep = text.rfind(" A: ")
        answer_marker = sep + 1 if sep != -1 else text.find("A:")
        cq_len = len(tokenizer(text[:answer_marker])['input_ids'])

        tokenized = tokenizer(text, padding="max_length", max_length=max_len, truncation=True, return_tensors="pt")
        # squeeze(0) removes only the batch axis.
        input_ids = tokenized['input_ids'].squeeze(0).tolist()
        attention_mask = tokenized['attention_mask'].squeeze(0).tolist()

        # Next-token targets: shift left by one and close with the pad id.
        labels = input_ids[1:] + [tokenizer.pad_token_id]

        # NOTE(review): the -2 offset presumably leaves the " A" and ":" tokens
        # as training targets (prefix ids include BOS and a trailing space
        # token) — confirm against the tokenizer.
        masked = max(0, min(cq_len - 2, max_len))
        labels[:masked] = [ignore_index] * masked

        samples.append({"input_ids": input_ids, "labels": labels, "attention_mask": attention_mask})

    return samples

def make_tensor(data, type, size):
    """Build a ``torch.long`` tensor from one field of every sample.

    The values ``sample[type]`` are collected in order, grouped by ``batch``
    into ``size`` batches, and converted to a tensor.
    Presumably the result is (size, 4, seq_len) when each field value is an
    equal-length list of ints — TODO confirm.
    """
    return torch.tensor(
        batch([sample[type] for sample in data], size),
        dtype=torch.long,
    )

# Shuffle the training split with a fixed seed so the selected samples are repeatable.
train_dataset=ds["train"].shuffle(seed=42)

# Tokenize up to data_size cleaned texts; texts longer than 256 tokens are skipped inside make_data().
data = make_data(train_dataset, data_size)


# Group each field into `size` batches of 4 samples.
# NOTE(review): `size` is a fixed constant while the number of samples surviving
# make_data() is not; fewer than 4 * size samples makes batch() raise IndexError,
# and any extra samples are left unused.
input_ids_tensor = make_tensor(data, "input_ids", size)
labels_tensor = make_tensor(data, "labels", size)
attention_mask_tensor = make_tensor(data, "attention_mask", size)




Tokenizing dataset: 100%|██████████| 1000/1000 [00:00<00:00, 1086.48it/s]


In [6]:
# Inspect the labels of the first training batch; 128001 is the value make_data() writes over the prefix positions.
labels_tensor[0]

tensor([[128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128

In [13]:
# Inspect the token ids of the first training batch.
input_ids_tensor[0]

tensor([[128000,     34,     25,    578,  57717,  17997,    389,  44193,   3142,
           9601,  21467,  15212,    439,    279,  18172,  12047,   3224,    304,
            279,   1917,    369,  10597,  11542,     13,    578,   3723,   4273,
           9849,    389,   7327,  53176,  25320,    264,  52008,   9678,   9266,
            315,    279,   2326,   3109,    706,   9277,  15212,    389,   1202,
           3821,   1160,    315,   5961,    430,   1397,   3345,  16967,   4245,
            311,    279,   7138,    323,  13112,    315,  27655,    315,  10597,
          11542,  17045,    304,    477,  66441,    555,    279,   3109,     13,
          10771,    311,    264,    220,    679,     15,  57717,   8121,   7867,
          21237,  10795,    220,   5833,    315,  82604,  84721,   7396,    279,
           4648,  16750,    369,   1884,    889,   5387,  15256,    220,   2813,
           7396,    421,   2877,    826,    323,  14713,   1022,    315,   6206,
            369,  28483,    

In [7]:
# Check which string token id 362 decodes to.
tokenizer.decode(362)

' A'

In [8]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, AutoTokenizer
import re
from tqdm import tqdm
import torch
from torch.nn import functional as F
from torch.optim import AdamW
import matplotlib.pyplot as plt
from torch import nn



def reshape(dataset, d_size=None):
    """Flatten SQuAD-style rows into cleaned ``"C: ... Q: ... A: ..."`` strings.

    Each row is rendered as ``"C: <context> Q: <question> A: <first answer>"``.
    A row is dropped when the rendered text is shorter than 50 characters or
    contains ``@``. Surviving texts have every character outside
    ``[a-zA-Z0-9 .:?]`` removed and runs of whitespace collapsed to one space.

    Args:
        dataset: Indexable, sized collection of rows; each row must provide
            ``row["context"]``, ``row["question"]`` and
            ``row["answers"]["text"][0]``.
        d_size: Bound used for the final ``[:d_size]`` slice. When omitted
            the module-level ``data_size_v`` is read at call time, which is
            what the one-argument form has always done.

    Returns:
        list[str]: Cleaned texts in dataset order, sliced with ``[:d_size]``.
    """
    if d_size is None:
        d_size = data_size_v

    # Rows are handled independently, so once d_size texts are kept nothing
    # later can reach the output. Only a non-negative integer allows that
    # shortcut; other values need the full list for the slice.
    limit = d_size if isinstance(d_size, int) and d_size >= 0 else None

    kept = []
    for i in range(len(dataset)):
        if limit is not None and len(kept) >= limit:
            break
        row = dataset[i]  # fetch the row once instead of once per field
        text = "C: " + row["context"] + " Q: " + row["question"] + " A: " + row["answers"]["text"][0]
        if len(text) < 50 or '@' in text:
            continue
        text = re.sub(r'[^a-zA-Z0-9 .:?]', '', text)
        # Tabs/newlines are already gone at this point; this squeezes repeated spaces.
        kept.append(re.sub(r'\s+', ' ', text))
    return kept[:d_size]

def max_length(dataset):
    """Print the largest ``len()`` found among the elements of ``dataset``.

    Prints 0 for an empty ``dataset`` and always returns ``None``.
    """
    longest = 0
    for sample in dataset:
        longest = max(longest, len(sample))
    print(longest)
    return None

def batch(input, size, batch_size=4):
    """Group the leading items of ``input`` into ``size`` consecutive batches.

    Args:
        input: Indexable sequence of items.
        size: Number of batches to build.
        batch_size: Items per batch. Defaults to 4, the width that was
            previously hard-coded.

    Returns:
        list[list]: ``size`` lists of ``batch_size`` items each, in input
        order. Items past ``size * batch_size`` are ignored.

    Raises:
        IndexError: For list input holding fewer than ``size * batch_size`` items.
    """
    batches = []
    for b in range(size):
        start = batch_size * b
        # Element-wise indexing keeps the loud IndexError on short input;
        # a slice would quietly return a short batch instead.
        batches.append([input[start + k] for k in range(batch_size)])
    return batches

def devide(text):
    """Split a ``"C: ... Q: ... A: ..."`` text into its prompt and answer parts.

    Args:
        text: A string in which the answer is introduced by ``" A: "``.

    Returns:
        list[str]: ``[cq, ans]`` where ``cq`` runs through the ``"A: "`` marker
        (trailing space included) and ``ans`` is everything after it.
    """
    # The answer separator is the last " A: " in the text. Looking for the
    # first "A:" would split inside the context whenever it contains something
    # like "USA:", handing most of the context to the "answer" half.
    sep = text.rfind(" A: ")
    if sep != -1:
        split_at = sep + len(" A: ")
    else:
        # No full separator present: keep the original first-"A:" behaviour.
        split_at = text.find("A:") + 3
    return [text[:split_at], text[split_at:]]

def make_data(data, max_len=256):
    """Tokenize cleaned SQuAD texts into padded prompts plus reference answers.

    Texts come from ``reshape(data)`` and are split by ``devide`` into a prompt
    (context, question and the ``"A: "`` marker) and the answer text. The prompt
    is padded/truncated to ``max_len`` tokens; the answer is tokenized on its own.

    Args:
        data: Rows accepted by ``reshape``.
        max_len: Padded prompt length; defaults to the previous constant 256.

    Returns:
        list[dict]: One dict per text with ``"input_ids"``, ``"labels"``
        (input ids shifted left by one, pad id appended), ``"attention_mask"``
        and ``"ans"`` (the answer's token ids as a list).
    """
    texts = reshape(data)
    samples = []
    for text in tqdm(texts, desc="Tokenizing dataset"):
        cq, answer_text = devide(text)

        # NOTE(review): prompts longer than max_len are truncated here rather
        # than skipped; with the tokenizer's default right-side truncation that
        # would cut off the question and "A: " marker — confirm this is intended.
        tokenized = tokenizer(cq, padding="max_length", max_length=max_len, truncation=True, return_tensors="pt")
        input_ids = tokenized['input_ids'].squeeze(0).tolist()
        attention_mask = tokenized['attention_mask'].squeeze(0).tolist()
        labels = input_ids[1:] + [tokenizer.pad_token_id]

        # squeeze(0) removes only the batch axis. A bare squeeze() also removes
        # the token axis when the answer tokenizes to a single id, turning
        # tolist() into a plain int that accuracy()/rouge() cannot handle.
        answer_ids = tokenizer(answer_text, truncation=True, return_tensors="pt")['input_ids'].squeeze(0).tolist()

        samples.append({"input_ids": input_ids, "labels": labels, "attention_mask": attention_mask, "ans": answer_ids})

    return samples

def make_tensor(data, type, size):
    """Collect one field from every sample and return it as a batched tensor.

    ``sample[type]`` is gathered for each sample in order, the values are
    grouped by ``batch`` into ``size`` batches, and the nested list is
    converted to a ``torch.long`` tensor.
    """
    field_values = list(map(lambda sample: sample[type], data))
    grouped = batch(field_values, size)
    return torch.tensor(grouped, dtype=torch.long)

def accuracy(output_ids, ans, ignore_len):
    """Return the fraction of answer tokens reproduced at the expected position.

    ``ans[0]` is skipped (the answers built in this notebook start with the BOS
    id). Answer token ``ans[i + 1]`` is compared with
    ``output_ids[ignore_len + i]``.

    Args:
        output_ids: Sized, indexable sequence of generated token ids.
        ans: Reference answer token ids, leading token included.
        ignore_len: Offset in ``output_ids`` where the answer should begin.

    Returns:
        float: Matches divided by the number of compared answer tokens.
        ``0.0`` when the answer has no tokens after its leading one.
    """
    n_targets = len(ans) - 1
    if n_targets <= 0:
        # Nothing to compare; avoid dividing by zero.
        return 0.0

    available = len(output_ids)
    hits = 0
    for i in range(n_targets):
        pos = ignore_len + i
        # A position past the end of the output counts as a miss, not a crash.
        if pos < available and output_ids[pos] == ans[i + 1]:
            hits += 1
    return hits / n_targets

def rouge(output_ids, ans, ignore_len):
    """Count how many distinct answer tokens appear in the generated tail.

    This is a set overlap on token ids: duplicates and order are ignored.

    Args:
        output_ids: Generated token ids; a tensor/array exposing ``tolist()``
            or a plain sequence.
        ans: Reference answer token ids; the BOS id 128000 is excluded from
            the reference set when present.
        ignore_len: Number of leading output positions to skip.

    Returns:
        list[int]: ``[matched, total]`` where ``total`` is the number of
        distinct reference ids and ``matched`` how many of them occur in
        ``output_ids[ignore_len:]``.
    """
    reference = set(ans)
    # discard() instead of remove(): an answer without BOS must not raise KeyError.
    reference.discard(128000)

    generated = output_ids[ignore_len:]
    if hasattr(generated, "tolist"):
        # Tensors/arrays: convert to plain Python ints so set membership works.
        generated = generated.tolist()

    matched = reference.intersection(generated)
    return [len(matched), len(reference)]

# Load SQuAD plus the model and tokenizer used for validation.
ds = load_dataset("rajpurkar/squad")
# Target device name; hard-coded to CUDA.
device='cuda'
# NOTE(review): "./model/test1" is not among the directories printed by the
# first cell's output — confirm the checkpoint path.
model_normal = AutoModelForCausalLM.from_pretrained("./model/test1")
# Tokenizer configured with left-side padding; EOS doubles as the pad token.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B", padding_side="left")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

# Cap on validation texts; reshape() reads this global when it is called.
data_size_v = 400

# Shuffle the validation split with a fixed seed so the selection is repeatable.
validation_dataset=ds["validation"].shuffle(seed=42)

data_v = make_data(validation_dataset)
# Number of complete 4-sample batches; leftover samples are not batched.
size_v = int(len(data_v)/4)

input_ids_tensor_v = make_tensor(data_v, "input_ids", size_v)
labels_tensor_v = make_tensor(data_v, "labels", size_v)
attention_mask_tensor_v = make_tensor(data_v, "attention_mask", size_v)
# Tokenized reference answers, one list per validation sample.
ans = [data["ans"] for data in data_v]


vocab_size = model_normal.config.vocab_size
# Targets equal to 128001 are excluded from the loss.
criterion = torch.nn.CrossEntropyLoss(ignore_index=128001)

criterion.to(device)

# Move the validation tensors to the target device.
input_ids_tensor_v=input_ids_tensor_v.to(device)
labels_tensor_v=labels_tensor_v.to(device)
attention_mask_tensor_v=attention_mask_tensor_v.to(device)

Tokenizing dataset: 100%|██████████| 400/400 [00:00<00:00, 1794.27it/s]


In [11]:
# Inspect the labels of the second validation batch.
labels_tensor_v[1]

tensor([[128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128000,     34,     25,   4740,    264,  32949,    505,   2225,   7411,
          13030,   2751,    389,   3839,    449,    264,    220,     24,   1387,
            220,   5958,  17884,  20413,   6678,     13,  21324,   8308,    220,
             19,    315,    220,     19,  16609,    369,    220,   3971,  14006,
            323,  33341,  11157,    369,    220,    914,  14006,   1418,  24150,
          29868,   8220,    279,   6678,    449,    264,    220,     16,  17884,
          26079,   1629,  14

In [12]:
# Display the tokenized reference answers.
# NOTE(review): this shows the whole list; a slice such as ans[:10] would keep the output short.
ans

[[128000, 9741, 17],
 [128000, 5162, 17],
 [128000, 40701, 580, 14916, 69172],
 [128000, 325, 17998, 15481, 31252, 323, 23978],
 [128000, 66091, 29868],
 [128000, 966],
 [128000, 9028, 285, 3623, 13],
 [128000, 60003],
 [128000, 63626, 279, 2077, 311, 279, 48702],
 [128000, 5966, 8444, 33211, 27697],
 [128000, 17020, 3225, 22395, 1219, 7006, 39036, 45635],
 [128000, 82389, 10418, 77407],
 [128000, 19422, 15992, 31362],
 [128000, 69, 333, 15247],
 [128000, 9423, 15, 17569, 315, 279, 52248, 304, 29890],
 [128000, 33, 3145, 82008, 6164],
 [128000, 11719, 12248],
 [128000, 26375, 356, 349, 331, 2191],
 [128000, 677, 1216, 1077, 42563, 4856, 1109, 3823, 2383],
 [128000, 77405, 279, 73187, 783],
 [128000, 30115, 3116, 1667],
 [128000, 359, 1908, 66669],
 [128000, 84707, 12624, 323, 13042],
 [128000, 58524],
 [128000, 53770, 3610],
 [128000, 105516, 315, 79157],
 [128000, 15145, 4435, 5111, 358],
 [128000, 1885, 4776, 264, 5190, 19602],
 [128000, 35124],
 [128000, 47690, 66270],
 [128000, 366