In [32]:
# Load the tokenizer and the causal-LM weights used by the cells below.
from transformers import AutoModelForCausalLM, AutoTokenizer

# The tokenizer and the weights are loaded from two different identifiers.
# NOTE(review): "teacher" is presumably a local checkpoint directory — confirm.
TOKENIZER_SOURCE = "meta-llama/Llama-3.2-1B-Instruct"
MODEL_SOURCE = "teacher"

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_SOURCE)
model = AutoModelForCausalLM.from_pretrained(MODEL_SOURCE)

In [33]:
# Tokenize a question, generate a continuation with the loaded model, and decode it.
# Relies on `tokenizer` and `model` being defined by an earlier cell.
import torch

# Reuse EOS as the padding token so that `padding=True` has a token to pad with
# (presumably the tokenizer defines no pad token of its own — confirm).
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

# Prefer the GPU, but fall back to CPU so the cell still runs without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Question input
question = "Who are you?"
# NOTE: despite its name, `input_ids` holds the tokenizer's full output
# (both 'input_ids' and 'attention_mask'), as the print below shows.
input_ids = tokenizer(question, return_tensors="pt", truncation=True, padding=True).to(device)
model.to(device)

attention_mask = input_ids['attention_mask']
print(attention_mask, input_ids)

# The generation config shown in this notebook has sampling enabled (do_sample=true),
# so seed the RNG to make the generated text reproducible across runs.
torch.manual_seed(0)
# `max_length` caps the total sequence length (prompt + generated tokens).
output = model.generate(input_ids['input_ids'], attention_mask=attention_mask, pad_token_id=tokenizer.eos_token_id, max_length=100)
print(output)

# Decode the output
answer = tokenizer.decode(output[0], skip_special_tokens=True)
print(answer)

tensor([[1, 1, 1, 1, 1]], device='cuda:0') {'input_ids': tensor([[128000,  15546,    527,    499,     30]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1]], device='cuda:0')}
tensor([[128000,  15546,    527,    499,     30,    264,    220,    220,    220,
            220,     16,    279,    315,    311,    315,    315,   1174,    315,
            220,    220,    220,     20,    220,    220,    571,     31,    571,
             31,   1174,    220,     31,    220,     31,   1174,    279,   1174,
            279,   1174,    315,    315,    315,    220,     31,    279,    662,
            220,    220,   1174,    311,   1174,   1174,    279,    279,   1174,
            220,    220,    220,    220,     31,    279,    304,    279,    304,
            315,    311,    662, 128009]], device='cuda:0')
Who are you? a    1 the of to of of, of   5   @@ @@, @ @, the, the, of of of @ the.  , to,, the the,    @ the in the in of to.


In [34]:
# Run one forward pass on a short prompt and print the shape of the logits.
# Relies on `tokenizer`, `model` and `device` being defined by earlier cells.
question = "b b b"
input_ids = tokenizer(question, return_tensors="pt", truncation=True, padding=True).to(device)
# Use the attention mask produced for THIS prompt. The earlier version passed the
# notebook-level `attention_mask` left over from the "Who are you?" cell, whose
# length (5) does not match this prompt's 4 tokens.
output = model(input_ids['input_ids'], attention_mask=input_ids['attention_mask'])
print(output.logits.size())

torch.Size([1, 4, 128256])


In [30]:
import torch

# Turn the logits at sequence position 2 of the first batch row into a
# probability distribution over the vocabulary.
# Relies on `output` (a forward-pass result) and `tokenizer` from earlier cells.
position_logits = output.logits[0, 2]
probabilities = position_logits.softmax(dim=-1)
print(probabilities)

# Pick the single most probable token id at that position.
predicted_token_ids = probabilities.argmax(dim=-1)
print(predicted_token_ids)

# Map the chosen id back to text.
decoded_text = tokenizer.decode(predicted_token_ids, skip_special_tokens=True)
print(decoded_text)

tensor([9.4916e-06, 7.2691e-06, 2.9318e-06,  ..., 1.1749e-07, 1.1751e-07,
        1.1748e-07], device='cuda:0', grad_fn=<SoftmaxBackward0>)
tensor(220, device='cuda:0')
 


In [31]:
# Display the generation settings attached to the currently loaded `model`
# (the bare expression is rendered as this cell's output).
model.generation_config

GenerationConfig {
  "bos_token_id": 128000,
  "do_sample": true,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "temperature": 0.6,
  "top_p": 0.9
}

In [9]:
from torchinfo import summary

# Show the parameter count per module for the currently loaded `model`,
# expanding nested modules down to three levels.
summary_depth = 3
summary(model, depth=summary_depth)

Layer (type:depth-idx)                                  Param #
LlamaForCausalLM                                        --
├─LlamaModel: 1-1                                       --
│    └─Embedding: 2-1                                   262,668,288
│    └─ModuleList: 2-2                                  --
│    │    └─LlamaDecoderLayer: 3-1                      60,821,504
│    │    └─LlamaDecoderLayer: 3-2                      60,821,504
│    │    └─LlamaDecoderLayer: 3-3                      60,821,504
│    │    └─LlamaDecoderLayer: 3-4                      60,821,504
│    │    └─LlamaDecoderLayer: 3-5                      60,821,504
│    │    └─LlamaDecoderLayer: 3-6                      60,821,504
│    │    └─LlamaDecoderLayer: 3-7                      60,821,504
│    │    └─LlamaDecoderLayer: 3-8                      60,821,504
│    │    └─LlamaDecoderLayer: 3-9                      60,821,504
│    │    └─LlamaDecoderLayer: 3-10                     60,821,504
│    │    └─LlamaDeco

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Tokenizer and weights are both loaded from the same "distillLLAMA" identifier.
# This rebinds the notebook-level `tokenizer` and `model`.
checkpoint = "distillLLAMA"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

In [4]:
from torchinfo import summary

# Show the parameter count per module for the currently loaded `model`,
# expanding nested modules down to three levels.
summary_depth = 3
summary(model, depth=summary_depth)

Layer (type:depth-idx)                                  Param #
LlamaForCausalLM                                        --
├─LlamaModel: 1-1                                       --
│    └─Embedding: 2-1                                   262,668,288
│    └─ModuleList: 2-2                                  --
│    │    └─LlamaDecoderLayer: 3-1                      60,821,504
│    │    └─LlamaDecoderLayer: 3-2                      60,821,504
│    │    └─LlamaDecoderLayer: 3-3                      60,821,504
│    │    └─LlamaDecoderLayer: 3-4                      60,821,504
│    │    └─LlamaDecoderLayer: 3-5                      60,821,504
│    │    └─LlamaDecoderLayer: 3-6                      60,821,504
│    │    └─LlamaDecoderLayer: 3-7                      60,821,504
│    │    └─LlamaDecoderLayer: 3-8                      60,821,504
│    └─LlamaRMSNorm: 2-3                                2,048
│    └─LlamaRotaryEmbedding: 2-4                        --
├─Linear: 1-2                     

In [None]:
from torchinfo import summary
from transformers import AutoModelForCausalLM, AutoTokenizer

# Tokenizer and weights are both loaded from the same "distillLLAMA2" identifier.
# This rebinds the notebook-level `tokenizer` and `model`.
checkpoint = "distillLLAMA2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

# Show the parameter count per module, expanding nested modules up to six levels.
summary_depth = 6
summary(model, depth=summary_depth)

  from .autonotebook import tqdm as notebook_tqdm


Layer (type:depth-idx)                                  Param #
LlamaForCausalLM                                        --
├─LlamaModel: 1-1                                       --
│    └─Embedding: 2-1                                   262,668,288
│    └─ModuleList: 2-2                                  --
│    │    └─LlamaDecoderLayer: 3-1                      60,821,504
│    │    └─LlamaDecoderLayer: 3-2                      60,821,504
│    │    └─LlamaDecoderLayer: 3-3                      60,821,504
│    │    └─LlamaDecoderLayer: 3-4                      60,821,504
│    └─LlamaRMSNorm: 2-3                                2,048
│    └─LlamaRotaryEmbedding: 2-4                        --
├─Linear: 1-2                                           262,668,288
Total params: 768,624,640
Trainable params: 768,624,640
Non-trainable params: 0