In [1]:
# pip install --pre torch torchvision  --index-url https://download.pytorch.org/whl/nightly/cpu
# pip install torchvision
import torch
import transformers
from torch import nn
# Show which torch build the kernel picked up (the recorded run used a 2.5.0 nightly).
print(torch.__version__)
from transformers import AutoModelForCausalLM, AutoTokenizer

from peft import LoraConfig, TaskType
from peft import get_peft_model

2.5.0.dev20240911


Steps:
1. Fill out the access request [form](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) on the model page to get access to the Llama models.
2. Create an access token at https://huggingface.co/settings/tokens.
3. Run `huggingface-cli login` and paste the token when prompted.

In [2]:
# Checkpoint on the Hugging Face Hub; access has to be requested first (see the steps above).
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Loader options, gathered in one place:
#   torch_dtype - load the weights as bfloat16
#   device_map  - 'auto' leaves weight placement to the library (the recorded run
#                 reports that some parameters were offloaded to disk)
# An 8-bit load via BitsAndBytesConfig(load_in_8bit=True) is left disabled.
load_options = dict(torch_dtype=torch.bfloat16, device_map='auto')

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, **load_options)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the disk.


In [3]:
input_text = "What is the meaning of life?"

# Tokenize and move the inputs to the device the model reports, instead of
# hard-coding 'mps': the model was placed with device_map='auto', so the same
# cell now also runs on machines without an MPS backend (CPU, CUDA, ...).
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

# Sanity check: where the token ids ended up and their integer dtype.
print(inputs['input_ids'].device, inputs['input_ids'].dtype)

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    # num_beams=1 with a 10-token budget keeps this a quick smoke test.
    outputs = model.generate(**inputs, max_new_tokens=10, num_beams=1)
    # outputs[0] is the single returned sequence (prompt + continuation in the recorded run).
    print(tokenizer.decode(outputs[0]))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


mps:0 torch.int64
<|begin_of_text|>What is the meaning of life? It's a question that has puzzled philosophers, theolog


In [4]:
# Wrap the model and tokenizer that were loaded above in a text-generation pipeline.
# Loader-only options (torch_dtype via model_kwargs, device_map) are deliberately
# not repeated here: they only matter when pipeline() loads a checkpoint by name,
# and the model object passed in already has its dtype and placement fixed.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# Dump the module tree to confirm the pipeline holds the expected architecture.
print(pipeline.model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (n

In [5]:
# https://github.com/huggingface/peft
# LoRA adapter settings. The base model is a decoder-only LlamaForCausalLM, so the
# task type must be CAUSAL_LM; SEQ_2_SEQ_LM selects PEFT's encoder-decoder wrapper.
# NOTE(review): lora_alpha=3 with r=8 gives a small alpha/r scaling - confirm this is intended.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,  # adapters are meant to be trained, not just loaded
    r=8,                   # adapter rank (matches the 4096->8->4096 shapes in the printout)
    lora_alpha=3,
    lora_dropout=0.1,
)
lora_model = get_peft_model(model, peft_config)
lora_model.print_trainable_parameters()

# Printing the base model shows the injected lora.Linear layers
# (see q_proj in the recorded output).
print(model)

'NoneType' object has no attribute 'cadam32bit_grad_fp32'
trainable params: 3,407,872 || all params: 8,033,669,120 || trainable%: 0.0424
LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): lora.Linear(
            (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=4096, out_features=8, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=8, out_features=4096, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): Linear(in_features=

  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [5]:
# List every parameter whose qualified name mentions 'attn', with its shape and
# the device it currently sits on.
attn_entries = (
    (param_name, tensor)
    for param_name, tensor in lora_model.named_parameters()
    if 'attn' in param_name
)
for param_name, tensor in attn_entries:
    print(param_name, tensor.shape, tensor.device)

NameError: name 'lora_model' is not defined

In [6]:

# Chat-style prompt: a system turn that fixes the persona, then the user's question.
system_turn = {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"}
user_turn = {"role": "user", "content": "Who are you?"}
messages = [system_turn, user_turn]

print("Calling pipeline")

# NOTE(review): the original author describes the pipeline's input handling as
# comparable to the manual call below - not verified here.
# pipeline.tokenizer(pipeline.tokenizer.apply_chat_template(messages, tokenize=False))
outputs = pipeline(messages, max_new_tokens=10)

# Show the last entry of the first result's "generated_text"
# (presumably the assistant's reply - confirm against a fresh run).
first_result = outputs[0]
print(first_result["generated_text"][-1])


Calling pipeline


NameError: name 'pipeline' is not defined

In [7]:
# Inspect the concrete tokenizer class the pipeline is holding.
type(pipeline.tokenizer)

transformers.tokenization_utils_fast.PreTrainedTokenizerFast

In [11]:
# Render `messages` through the tokenizer's chat template. With tokenize=False the
# result is kept as text (it is fed to tokenize()/encode() in the following cells).
tokenizer_input = pipeline.tokenizer.apply_chat_template(messages, tokenize=False)

In [12]:
# Split the rendered prompt into token strings. The recorded output starts with a
# single '<|begin_of_text|>', i.e. the template text already carries that marker.
pipeline.tokenizer.tokenize(tokenizer_input)

['<|begin_of_text|>',
 '<|start_header_id|>',
 'system',
 '<|end_header_id|>',
 'ĊĊ',
 'Cut',
 'ting',
 'ĠKnowledge',
 'ĠDate',
 ':',
 'ĠDecember',
 'Ġ',
 '202',
 '3',
 'Ċ',
 'Today',
 'ĠDate',
 ':',
 'Ġ',
 '26',
 'ĠJul',
 'Ġ',
 '202',
 '4',
 'ĊĊ',
 'You',
 'Ġare',
 'Ġa',
 'Ġpirate',
 'Ġchat',
 'bot',
 'Ġwho',
 'Ġalways',
 'Ġresponds',
 'Ġin',
 'Ġpirate',
 'Ġspeak',
 '!',
 '<|eot_id|>',
 '<|start_header_id|>',
 'user',
 '<|end_header_id|>',
 'ĊĊ',
 'Who',
 'Ġare',
 'Ġyou',
 '?',
 '<|eot_id|>']

In [16]:
# Full tokenizer call on the rendered prompt; the recorded result has 'input_ids'
# and 'attention_mask'.
# NOTE(review): the recorded input_ids begin with 128000 twice - the template text
# already contains '<|begin_of_text|>' and the tokenizer call prepends another one.
# Passing add_special_tokens=False is the usual way to avoid that - confirm.
pipeline.tokenizer(tokenizer_input)

{'input_ids': [128000, 128000, 128006, 9125, 128007, 271, 38766, 1303, 33025, 2696, 25, 6790, 220, 2366, 18, 198, 15724, 2696, 25, 220, 1627, 10263, 220, 2366, 19, 271, 2675, 527, 264, 55066, 6369, 6465, 889, 2744, 31680, 304, 55066, 6604, 0, 128009, 128006, 882, 128007, 271, 15546, 527, 499, 30, 128009], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [13]:
# Encode the rendered prompt to a plain list of token ids.
# NOTE(review): as in the recorded output, id 128000 appears twice at the start
# (one from the template text, one prepended by encode()).
pipeline.tokenizer.encode(tokenizer_input)

[128000,
 128000,
 128006,
 9125,
 128007,
 271,
 38766,
 1303,
 33025,
 2696,
 25,
 6790,
 220,
 2366,
 18,
 198,
 15724,
 2696,
 25,
 220,
 1627,
 10263,
 220,
 2366,
 19,
 271,
 2675,
 527,
 264,
 55066,
 6369,
 6465,
 889,
 2744,
 31680,
 304,
 55066,
 6604,
 0,
 128009,
 128006,
 882,
 128007,
 271,
 15546,
 527,
 499,
 30,
 128009]

In [15]:
# Round-trip check: encode the rendered prompt, then decode the ids back to text.
# NOTE(review): the recorded result starts with '<|begin_of_text|><|begin_of_text|>',
# so the round trip is not the identity - encode() prepends a second begin-of-text
# token to text that already has one.
pipeline.tokenizer.decode(pipeline.tokenizer.encode(tokenizer_input))

'<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\nYou are a pirate chatbot who always responds in pirate speak!<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|>'

In [14]:
# Encode a plain string for comparison: the recorded output is [128000, 9906], which
# shows encode() prepending id 128000 on its own.
pipeline.tokenizer.encode("Hello")

[128000, 9906]