In [28]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, AutoTokenizer
import re
from tqdm import tqdm
import torch
from torch.optim import AdamW
import matplotlib.pyplot as plt
from torch import nn

ds = load_dataset("Salesforce/wikitext", "wikitext-103-raw-v1")
device='cuda'
# モデルの準備
teacher_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")
student_model = AutoModelForCausalLM.from_pretrained("./model/initialized_distill_model")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

In [29]:
print(tokenizer.pad_token_id)
    

128001


In [30]:
from torchinfo import summary
summary(teacher_model, depth=3)


Layer (type:depth-idx)                                  Param #
LlamaForCausalLM                                        --
├─LlamaModel: 1-1                                       --
│    └─Embedding: 2-1                                   262,668,288
│    └─ModuleList: 2-2                                  --
│    │    └─LlamaDecoderLayer: 3-1                      60,821,504
│    │    └─LlamaDecoderLayer: 3-2                      60,821,504
│    │    └─LlamaDecoderLayer: 3-3                      60,821,504
│    │    └─LlamaDecoderLayer: 3-4                      60,821,504
│    │    └─LlamaDecoderLayer: 3-5                      60,821,504
│    │    └─LlamaDecoderLayer: 3-6                      60,821,504
│    │    └─LlamaDecoderLayer: 3-7                      60,821,504
│    │    └─LlamaDecoderLayer: 3-8                      60,821,504
│    │    └─LlamaDecoderLayer: 3-9                      60,821,504
│    │    └─LlamaDecoderLayer: 3-10                     60,821,504
│    │    └─LlamaDeco

In [31]:
summary(student_model, depth=3)

Layer (type:depth-idx)                                  Param #
LlamaForCausalLM                                        --
├─LlamaModel: 1-1                                       --
│    └─Embedding: 2-1                                   262,668,288
│    └─ModuleList: 2-2                                  --
│    │    └─LlamaDecoderLayer: 3-1                      60,821,504
│    │    └─LlamaDecoderLayer: 3-2                      60,821,504
│    │    └─LlamaDecoderLayer: 3-3                      60,821,504
│    │    └─LlamaDecoderLayer: 3-4                      60,821,504
│    │    └─LlamaDecoderLayer: 3-5                      60,821,504
│    │    └─LlamaDecoderLayer: 3-6                      60,821,504
│    │    └─LlamaDecoderLayer: 3-7                      60,821,504
│    │    └─LlamaDecoderLayer: 3-8                      60,821,504
│    └─LlamaRMSNorm: 2-3                                2,048
│    └─LlamaRotaryEmbedding: 2-4                        --
├─Linear: 1-2                     

In [4]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, AutoTokenizer
import re
from tqdm import tqdm
import torch
from torch.nn import functional as F
from torch.optim import AdamW
import matplotlib.pyplot as plt
from torch import nn

ds = load_dataset("Salesforce/wikitext", "wikitext-103-raw-v1")
device='cuda'
# モデルの準備
teacher_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")
model = AutoModelForCausalLM.from_pretrained("../model/initialized_distill_model2")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

data_size = 100
size = int(data_size/4)

train_dataset=ds["train"].shuffle(seed=42).select(range(1000))

def reshape(dataset):
    dataset=dataset["text"]
    dataset = [item for item in dataset if item != '' and len(item) >= 50 and '@' not in item]
    dataset = [re.sub(r'[^a-zA-Z0-9 ?]', '', item) for item in dataset]
    dataset = [re.sub(r'\s+', ' ', item) for item in dataset]
    print(len(dataset))
    return dataset[:data_size]

def max_length(dataset):
    max_eval=0
    for i in dataset:
        max_eval = len(i) if len(i) > max_eval else max_eval
    print(max_eval)
    return


dataset=reshape(train_dataset)
max_length(dataset)

def batch(input):
    batch_train=[]
    for i in range(size):
        batch_input=[input[4*i+0], input[4*i+1], input[4*i+2], input[4*i+3]]
        batch_train.append(batch_input)

    return batch_train

# 入力とラベルを設定
data = []
for text in tqdm(dataset, desc="Tokenizing dataset"):
    tokenized = tokenizer(text, padding="max_length", max_length=256, truncation=True, return_tensors="pt")
    input_ids = tokenized['input_ids'].squeeze().tolist()
    attention_mask = tokenized['attention_mask'].squeeze().tolist()
    labels = input_ids[1:] + [tokenizer.pad_token_id]
    data.append({"input_ids": input_ids, "labels": labels, "attention_mask":attention_mask})


input_ids = [item["input_ids"] for item in data]
labels = [item["labels"] for item in data]
attention_mask = [item["attention_mask"] for item in data]

input_ids = batch(input_ids)
labels = batch(labels)
attention_mask = batch(attention_mask)

input_ids_tensor = torch.tensor(input_ids, dtype=torch.long)
labels_tensor = torch.tensor(labels, dtype=torch.long)
attention_mask_tensor = torch.tensor(attention_mask, dtype=torch.long)


# 仮定: ボキャブラリサイズと頻出語のトークンIDを定義
vocab_size = model.config.vocab_size

# クロスエントロピー損失関数の設定
criterion = torch.nn.CrossEntropyLoss(ignore_index=128001)
criterion.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)
input_ids_tensor=input_ids_tensor.to(device)
labels_tensor=labels_tensor.to(device)
attention_mask_tensor = attention_mask_tensor.to(device)
model.to(device)
teacher_model.to(device)
model.eval()
criterion.to(device)
alpha=0.5
temperature=1.0
epochs = 1

i=0

input_ids=input_ids_tensor[i]
labels=labels_tensor[i]
attention_mask=attention_mask_tensor[i]
optimizer.zero_grad()
outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
logits=outputs.logits
with torch.no_grad():
    student_prob=F.log_softmax(logits, dim=-1)
    teacher_outputs_logits=teacher_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).logits
    teacher_prob=F.softmax(teacher_outputs_logits, dim=-1)

kldiv_loss=F.kl_div(student_prob, teacher_prob, reduction="none")
kl_div_per_token = kldiv_loss.sum(dim=-1)
kl_loss=(kl_div_per_token * attention_mask).sum()/attention_mask.sum()

print(kl_loss)

191
1456


Tokenizing dataset: 100%|██████████| 100/100 [00:00<00:00, 2665.33it/s]


tensor(8.6776, device='cuda:0')


In [5]:
kldiv_loss.size()

torch.Size([4, 256, 128256])

In [6]:
kl_div_per_token.size()

torch.Size([4, 256])

In [7]:
labels.view(-1).size(0)

512

In [7]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, AutoTokenizer
import re
from tqdm import tqdm
import torch
from torch.nn import functional as F
from torch.optim import AdamW
import matplotlib.pyplot as plt
from torch import nn

ds = load_dataset("Salesforce/wikitext", "wikitext-103-raw-v1")
device='cuda'
# モデルの準備
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

data_size = 100
size = int(data_size/4)

train_dataset=ds["train"].select(range(1000))

In [8]:
train_dataset[0]

{'text': ''}

In [10]:
train_dataset[1]

{'text': ' = Valkyria Chronicles III = \n'}

In [11]:
train_dataset[2]

{'text': ''}

In [17]:
tokenizer.decode(tokenizer(train_dataset[3]["text"])['input_ids'])

'<|begin_of_text|> Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3, lit. Valkyria of the Battlefield 3 ), commonly referred to as Valkyria Chronicles III outside Japan, is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable. Released in January 2011 in Japan, it is the third game in the Valkyria series. Employing the same fusion of tactical and real @-@ time gameplay as its predecessors, the story runs parallel to the first game and follows the " Nameless ", a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven ". \n'

In [12]:
attention_mask.size()

torch.Size([4, 128])

In [10]:
kldiv_loss

tensor(7.4231e-05, device='cuda:0')

In [13]:
kldiv_loss * attention_mask

RuntimeError: The size of tensor a (128256) must match the size of tensor b (256) at non-singleton dimension 2

In [14]:
kl_div_per_token = kldiv_loss.sum(dim=-1)

In [17]:
kl_div_per_token

tensor([[ 7.4868,  3.5298,  8.3883,  ..., 12.7773, 12.8699, 13.3185],
        [ 7.4868,  5.3422,  5.7633,  ..., 12.2131, 12.1325, 12.3083],
        [ 7.4868,  9.4952,  6.7543,  ..., 11.2674, 11.4796, 12.0311],
        [ 7.4868, 10.5451,  8.2415,  ..., 11.1294, 11.3222, 11.2411]],
       device='cuda:0')

In [19]:
(kl_div_per_token * attention_mask).sum()/attention_mask.sum()

tensor(8.6776, device='cuda:0')

In [2]:
kl_div_per_token.sum()/attention_mask.sum()

tensor(23.5073, device='cuda:0')

In [4]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, AutoTokenizer
import re
from tqdm import tqdm
import torch
from torch.nn import functional as F
from torch.optim import AdamW
import matplotlib.pyplot as plt
from torch import nn

ds = load_dataset("Salesforce/wikitext", "wikitext-103-raw-v1")
device='cuda'
# モデルの準備
model = AutoModelForCausalLM.from_pretrained("./model/initialized_distill_model2")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

In [5]:
print(tokenizer.pad_token_id)

128001


In [8]:
sums=[1,2,3,5,5,[0]*10]*10
with open("output.txt", "w") as f:
            f.write(str(sums))