In [2]:
# Load the tokenizer and the causal-LM weights from a single checkpoint id,
# so the two can never point at different checkpoints by accident.
from transformers import AutoTokenizer, AutoModelForCausalLM

CHECKPOINT = "meta-llama/Llama-3.2-1B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
model = AutoModelForCausalLM.from_pretrained(CHECKPOINT)

In [None]:
# Encode a question, generate a continuation and decode it back to text.
import torch  # imported here because this cell needs torch before any later cell imports it

# Reuse EOS as the padding token so that padding=True has a token to pad with
# (presumably the tokenizer defines no pad token of its own -- confirm).
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

# Fall back to CPU when no GPU is visible instead of failing on .to('cuda').
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# The model's generation config enables sampling, so fix the RNG state to make
# re-running this cell on the same setup produce the same text.
torch.manual_seed(0)

# Enter the question
question = "Will stock prices rise in November?"
input_ids = tokenizer(question, return_tensors="pt", truncation=True, padding=True).to(device)

attention_mask = input_ids['attention_mask']
print(attention_mask, input_ids)

# max_length bounds prompt tokens and generated tokens together.
output = model.generate(input_ids['input_ids'], attention_mask=attention_mask, pad_token_id=tokenizer.eos_token_id, max_length=100)
print(output)

# Decode the output
answer = tokenizer.decode(output[0], skip_special_tokens=True)
print(answer)

tensor([[1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0') {'input_ids': tensor([[128000,  10149,   5708,   7729,  10205,    304,   6841,     30]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}


Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


tensor([[128000,  10149,   5708,   7729,  10205,    304,   6841,     30,   6841,
            374,    264,   1401,   2305,    369,    279,   2326,   5708,   3157,
             11,    323,   1690,  15167,    527,  23132,    264,   3831,   5178,
            304,    279,   2132,   4376,    315,    279,   1060,     13,   4452,
             11,    433,    596,    539,    264,  15803,    430,   5708,   7729,
            690,  10205,    304,   6841,    382,   8538,   3284,   8125,    369,
            264,  18174,    304,   5708,   7729,    304,   6841,   2997,   1473,
              9,    256,   3146,  39922,  53838,  96618,    578,   5708,   3157,
            649,    387,  17509,     11,    323,   6841,   1253,   3217,    264,
          76506,   4245,    311,   5370,   3157,   9547,     11,   1778,    439,
           7100,  34824,     11,   2802,   7969,     11,    477,  87998,   4455,
            627]], device='cuda:0')
Will stock prices rise in November? November is a key month for the US st

In [3]:
# Run one forward pass on a short prompt and report the shape of the logits.
question = "b b"
# Send the encoding to wherever the model currently lives, so this cell does
# not rely on a `device` variable defined in another cell.
input_ids = tokenizer(question, return_tensors="pt", truncation=True, padding=True).to(model.device)
# Use the mask built for THIS prompt; a leftover `attention_mask` from an
# earlier, longer prompt would not match these input ids.
attention_mask = input_ids['attention_mask']
output = model(input_ids['input_ids'], attention_mask=attention_mask)
print(output.logits.size())

torch.Size([1, 3, 128256])


In [4]:
import torch

# Next-token distribution: softmax over the vocabulary at the LAST position of
# the first sequence. Indexing with -1 instead of a literal 2 keeps this
# correct for prompts of any length, not only three-token ones.
probabilities = torch.softmax(output.logits[0, -1], dim=-1)
print(probabilities)
# Most likely next token id, then its text form.
predicted_token_ids = torch.argmax(probabilities, dim=-1)
print(predicted_token_ids)
decoded_text = tokenizer.decode(predicted_token_ids, skip_special_tokens=True)
print(decoded_text)

tensor([3.7209e-04, 8.4792e-05, 1.1225e-04,  ..., 3.2487e-08, 3.2505e-08,
        3.2504e-08], device='cuda:0', grad_fn=<SoftmaxBackward0>)
tensor(293, device='cuda:0')
 b


In [5]:
# Inspect the generation settings attached to the loaded model; as the cell's
# last expression, the value is shown as the cell result.
model.generation_config

GenerationConfig {
  "bos_token_id": 128000,
  "do_sample": true,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "temperature": 0.6,
  "top_p": 0.9
}

In [9]:
from torchinfo import summary

# Show the module tree of `model` with per-layer parameter counts, expanding
# nested modules down to depth 3.
summary(model, depth=3)

Layer (type:depth-idx)                                  Param #
LlamaForCausalLM                                        --
├─LlamaModel: 1-1                                       --
│    └─Embedding: 2-1                                   262,668,288
│    └─ModuleList: 2-2                                  --
│    │    └─LlamaDecoderLayer: 3-1                      60,821,504
│    │    └─LlamaDecoderLayer: 3-2                      60,821,504
│    │    └─LlamaDecoderLayer: 3-3                      60,821,504
│    │    └─LlamaDecoderLayer: 3-4                      60,821,504
│    │    └─LlamaDecoderLayer: 3-5                      60,821,504
│    │    └─LlamaDecoderLayer: 3-6                      60,821,504
│    │    └─LlamaDecoderLayer: 3-7                      60,821,504
│    │    └─LlamaDecoderLayer: 3-8                      60,821,504
│    │    └─LlamaDecoderLayer: 3-9                      60,821,504
│    │    └─LlamaDecoderLayer: 3-10                     60,821,504
│    │    └─LlamaDeco

In [3]:
# Load the tokenizer and model of the "distillLLAMA" checkpoint from a single
# identifier, replacing the previously loaded `tokenizer` / `model`.
from transformers import AutoTokenizer, AutoModelForCausalLM

CHECKPOINT = "distillLLAMA"

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
model = AutoModelForCausalLM.from_pretrained(CHECKPOINT)

In [4]:
from torchinfo import summary

# Show the module tree of the currently loaded `model` with per-layer
# parameter counts, expanding nested modules down to depth 3.
summary(model, depth=3)

Layer (type:depth-idx)                                  Param #
LlamaForCausalLM                                        --
├─LlamaModel: 1-1                                       --
│    └─Embedding: 2-1                                   262,668,288
│    └─ModuleList: 2-2                                  --
│    │    └─LlamaDecoderLayer: 3-1                      60,821,504
│    │    └─LlamaDecoderLayer: 3-2                      60,821,504
│    │    └─LlamaDecoderLayer: 3-3                      60,821,504
│    │    └─LlamaDecoderLayer: 3-4                      60,821,504
│    │    └─LlamaDecoderLayer: 3-5                      60,821,504
│    │    └─LlamaDecoderLayer: 3-6                      60,821,504
│    │    └─LlamaDecoderLayer: 3-7                      60,821,504
│    │    └─LlamaDecoderLayer: 3-8                      60,821,504
│    └─LlamaRMSNorm: 2-3                                2,048
│    └─LlamaRotaryEmbedding: 2-4                        --
├─Linear: 1-2                     

In [1]:
# Load the "distillLLAMA2" checkpoint and show its layer / parameter summary.
# Imports are grouped at the top of the cell rather than placed between
# executable statements.
from torchinfo import summary
from transformers import AutoTokenizer, AutoModelForCausalLM

CHECKPOINT = "distillLLAMA2"

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
model = AutoModelForCausalLM.from_pretrained(CHECKPOINT)

# Module tree with per-layer parameter counts, nested modules down to depth 3.
summary(model, depth=3)

  from .autonotebook import tqdm as notebook_tqdm


Layer (type:depth-idx)                                  Param #
LlamaForCausalLM                                        --
├─LlamaModel: 1-1                                       --
│    └─Embedding: 2-1                                   262,668,288
│    └─ModuleList: 2-2                                  --
│    │    └─LlamaDecoderLayer: 3-1                      60,821,504
│    │    └─LlamaDecoderLayer: 3-2                      60,821,504
│    │    └─LlamaDecoderLayer: 3-3                      60,821,504
│    │    └─LlamaDecoderLayer: 3-4                      60,821,504
│    └─LlamaRMSNorm: 2-3                                2,048
│    └─LlamaRotaryEmbedding: 2-4                        --
├─Linear: 1-2                                           262,668,288
Total params: 768,624,640
Trainable params: 768,624,640
Non-trainable params: 0