# Tested on a g5.4xlarge

In [7]:
%pip install -U torch==2.0.1 \
  transformers==4.33.0 \
  sentencepiece==0.1.99 \
  accelerate==0.22.0 # needed for low_cpu_mem_usage parameter

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting transformers==4.33.0
  Obtaining dependency information for transformers==4.33.0 from https://files.pythonhosted.org/packages/e1/9d/4d9fe5c3b820db10773392ac5f4a0c8dab668f70b245ce2ce09785166128/transformers-4.33.0-py3-none-any.whl.metadata
  Downloading transformers-4.33.0-py3-none-any.whl.metadata (119 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.9/119.9 kB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate==0.22.0
  Obtaining dependency information for accelerate==0.22.0 from https://files.pythonhosted.org/packages/4d/a7/05c67003d659a0035f2b3a8cf389c1d9645865aee84a73ce99ddab16682f/accelerate-0.22.0-py3-none-any.whl.metadata
  Downloading accelerate-0.22.0-py3-none-any.whl.metadata (17 kB)
Downloading transformers-4.33.0-py3-none-any.whl (7.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m174.3 MB/s[0m eta [36m0:00:00[0ma 

In [1]:
import torch
from transformers import LlamaTokenizer

model_checkpoint = "NousResearch/Llama-2-7b-hf"
tokenizer = LlamaTokenizer.from_pretrained(model_checkpoint)

  from .autonotebook import tqdm as notebook_tqdm
Downloading tokenizer.model: 100%|██████████████████████████████████████████████████████████████████████████████████| 500k/500k [00:00<00:00, 2.25MB/s]
Downloading (…)cial_tokens_map.json: 100%|████████████████████████████████████████████████████████████████████████████| 435/435 [00:00<00:00, 1.84MB/s]
Downloading tokenizer_config.json: 100%|██████████████████████████████████████████████████████████████████████████████| 746/746 [00:00<00:00, 3.13MB/s]


In [2]:
# based on https://github.com/viniciusarruda/llama-cpp-chat-completion-wrapper/blob/1c9e29b70b1aaa7133d3c7d7b59a92d840e92e6d/llama_cpp_chat_completion_wrapper.py

from typing import List
from typing import Literal
from typing import TypedDict

from transformers import PreTrainedTokenizer

Role = Literal["system", "user", "assistant"]

class Message(TypedDict):
    role: Role
    content: str

MessageList = List[Message]

BEGIN_INST, END_INST = "[INST] ", " [/INST] "
BEGIN_SYS, END_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

def convert_list_of_message_lists_to_input_prompt(list_of_message_lists: List[MessageList], tokenizer: PreTrainedTokenizer) -> List[str]:
    input_prompts: List[str] = []
    print(type(list_of_message_lists))
    print(type(list_of_message_lists[0]))    
    for message_list in list_of_message_lists:
        if message_list[0]["role"] == "system":
            content = "".join([BEGIN_SYS, message_list[0]["content"], END_SYS, message_list[1]["content"]])
            message_list = [{"role": message_list[1]["role"], "content": content}] + message_list[2:]

        if not (
            all([msg["role"] == "user" for msg in message_list[::2]])
            and all([msg["role"] == "assistant" for msg in message_list[1::2]])
        ):
            raise ValueError(
                "Format must be in this order: 'system', 'user', 'assistant' roles.\nAfter that, you can alternate between user and assistant multiple times"
            )

        eos = tokenizer.eos_token
        bos = tokenizer.bos_token
        input_prompt = "".join(
            [
                "".join([bos, BEGIN_INST, (prompt["content"]).strip(), END_INST, (answer["content"]).strip(), eos])
                for prompt, answer in zip(message_list[::2], message_list[1::2])
            ]
        )

        if message_list[-1]["role"] != "user":
            raise ValueError(f"Last message must be from user role. Instead, you sent from {message_list[-1]['role']} role")

        input_prompt += "".join([bos, BEGIN_INST, (message_list[-1]["content"]).strip(), END_INST])

        input_prompts.append(input_prompt)

    return input_prompts

In [3]:
system_message = Message()
system_message["role"] = "system"
system_message["content"] = "Answer only with emojis"
print(system_message)

user_message = Message()
user_message["role"] = "user"
user_message["content"] = "Who won the 2016 baseball World Series?"
print(user_message)

# assistant_message = Message()
# assistant_message.role = "assistant"
# assistant_message.content = ""

list_of_messages = list()
list_of_messages.append(system_message)
list_of_messages.append(user_message)

list_of_message_lists = list()
list_of_message_lists.append(list_of_messages)

prompt = convert_list_of_message_lists_to_input_prompt(list_of_message_lists, tokenizer)
print(prompt)

{'role': 'system', 'content': 'Answer only with emojis'}
{'role': 'user', 'content': 'Who won the 2016 baseball World Series?'}
<class 'list'>
<class 'list'>
['<s>[INST] <<SYS>>\nAnswer only with emojis\n<</SYS>>\n\nWho won the 2016 baseball World Series? [/INST] ']


In [4]:
from transformers import LlamaForCausalLM

model = LlamaForCausalLM.from_pretrained(
    model_checkpoint,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
)


Downloading config.json: 100%|████████████████████████████████████████████████████████████████████████████████████████| 583/583 [00:00<00:00, 2.53MB/s]
Downloading (…)fetensors.index.json: 100%|████████████████████████████████████████████████████████████████████████| 26.8k/26.8k [00:00<00:00, 35.3MB/s]
Downloading shards:   0%|                                                                                                        | 0/2 [00:00<?, ?it/s]
Downloading (…)of-00002.safetensors:   0%|                                                                                 | 0.00/9.98G [00:00<?, ?B/s][A
Downloading (…)of-00002.safetensors:   0%|                                                                        | 10.5M/9.98G [00:00<05:54, 28.1MB/s][A
Downloading (…)of-00002.safetensors:   0%|▏                                                                       | 21.0M/9.98G [00:00<07:38, 21.7MB/s][A
Downloading (…)of-00002.safetensors:   0%|▏                                    

Downloading (…)of-00002.safetensors:   5%|███▊                                                                     | 524M/9.98G [00:13<03:04, 51.3MB/s][A
Downloading (…)of-00002.safetensors:   5%|███▉                                                                     | 535M/9.98G [00:13<03:14, 48.5MB/s][A
Downloading (…)of-00002.safetensors:   5%|███▉                                                                     | 545M/9.98G [00:13<03:11, 49.3MB/s][A
Downloading (…)of-00002.safetensors:   6%|████                                                                     | 556M/9.98G [00:13<02:42, 57.9MB/s][A
Downloading (…)of-00002.safetensors:   6%|████▏                                                                    | 566M/9.98G [00:14<03:41, 42.6MB/s][A
Downloading (…)of-00002.safetensors:   6%|████▏                                                                    | 577M/9.98G [00:14<03:14, 48.4MB/s][A
Downloading (…)of-00002.safetensors:   6%|████▎                       

Downloading (…)of-00002.safetensors:  11%|███████▊                                                                | 1.08G/9.98G [00:26<03:24, 43.6MB/s][A
Downloading (…)of-00002.safetensors:  11%|███████▊                                                                | 1.09G/9.98G [00:26<02:52, 51.4MB/s][A
Downloading (…)of-00002.safetensors:  11%|███████▉                                                                | 1.10G/9.98G [00:26<03:36, 41.0MB/s][A
Downloading (…)of-00002.safetensors:  11%|████████                                                                | 1.11G/9.98G [00:27<03:00, 49.1MB/s][A
Downloading (…)of-00002.safetensors:  11%|████████                                                                | 1.12G/9.98G [00:27<03:36, 40.9MB/s][A
Downloading (…)of-00002.safetensors:  11%|████████▏                                                               | 1.13G/9.98G [00:28<05:38, 26.2MB/s][A
Downloading (…)of-00002.safetensors:  11%|████████▏                   

Downloading (…)of-00002.safetensors:  16%|███████████▋                                                            | 1.63G/9.98G [00:40<02:31, 55.1MB/s][A
Downloading (…)of-00002.safetensors:  16%|███████████▊                                                            | 1.64G/9.98G [00:40<03:16, 42.5MB/s][A
Downloading (…)of-00002.safetensors:  17%|███████████▉                                                            | 1.66G/9.98G [00:40<02:30, 55.4MB/s][A
Downloading (…)of-00002.safetensors:  17%|████████████                                                            | 1.67G/9.98G [00:41<02:53, 48.0MB/s][A
Downloading (…)of-00002.safetensors:  17%|████████████                                                            | 1.68G/9.98G [00:41<02:32, 54.5MB/s][A
Downloading (…)of-00002.safetensors:  17%|████████████▏                                                           | 1.69G/9.98G [00:41<03:21, 41.1MB/s][A
Downloading (…)of-00002.safetensors:  17%|████████████▎               

Downloading (…)of-00002.safetensors:  22%|███████████████▋                                                        | 2.18G/9.98G [00:53<02:53, 45.0MB/s][A
Downloading (…)of-00002.safetensors:  22%|███████████████▊                                                        | 2.19G/9.98G [00:54<03:18, 39.2MB/s][A
Downloading (…)of-00002.safetensors:  22%|███████████████▉                                                        | 2.20G/9.98G [00:54<02:43, 47.7MB/s][A
Downloading (…)of-00002.safetensors:  22%|███████████████▉                                                        | 2.21G/9.98G [00:54<04:08, 31.3MB/s][A
Downloading (…)of-00002.safetensors:  22%|████████████████                                                        | 2.22G/9.98G [00:55<05:35, 23.1MB/s][A
Downloading (…)of-00002.safetensors:  22%|████████████████                                                        | 2.23G/9.98G [00:55<05:11, 24.9MB/s][A
Downloading (…)of-00002.safetensors:  22%|████████████████▏           

Downloading (…)of-00002.safetensors:  28%|███████████████████▊                                                    | 2.75G/9.98G [01:08<03:02, 39.5MB/s][A
Downloading (…)of-00002.safetensors:  28%|███████████████████▉                                                    | 2.76G/9.98G [01:08<02:32, 47.2MB/s][A
Downloading (…)of-00002.safetensors:  28%|███████████████████▉                                                    | 2.77G/9.98G [01:08<02:10, 55.3MB/s][A
Downloading (…)of-00002.safetensors:  28%|████████████████████                                                    | 2.78G/9.98G [01:08<02:45, 43.6MB/s][A
Downloading (…)of-00002.safetensors:  28%|████████████████████▏                                                   | 2.79G/9.98G [01:08<02:17, 52.4MB/s][A
Downloading (…)of-00002.safetensors:  28%|████████████████████▏                                                   | 2.80G/9.98G [01:09<02:22, 50.3MB/s][A
Downloading (…)of-00002.safetensors:  28%|████████████████████▎       

Downloading (…)of-00002.safetensors:  33%|███████████████████████▊                                                | 3.30G/9.98G [01:22<05:04, 21.9MB/s][A
Downloading (…)of-00002.safetensors:  33%|███████████████████████▉                                                | 3.31G/9.98G [01:22<04:55, 22.5MB/s][A
Downloading (…)of-00002.safetensors:  33%|███████████████████████▉                                                | 3.32G/9.98G [01:23<04:12, 26.3MB/s][A
Downloading (…)of-00002.safetensors:  33%|████████████████████████                                                | 3.33G/9.98G [01:23<04:09, 26.6MB/s][A
Downloading (…)of-00002.safetensors:  34%|████████████████████████▏                                               | 3.34G/9.98G [01:23<03:29, 31.7MB/s][A
Downloading (…)of-00002.safetensors:  34%|████████████████████████▏                                               | 3.36G/9.98G [01:24<03:24, 32.3MB/s][A
Downloading (…)of-00002.safetensors:  34%|████████████████████████▎   

Downloading (…)of-00002.safetensors:  39%|███████████████████████████▉                                            | 3.87G/9.98G [01:36<02:33, 39.7MB/s][A
Downloading (…)of-00002.safetensors:  39%|███████████████████████████▉                                            | 3.88G/9.98G [01:36<02:08, 47.3MB/s][A
Downloading (…)of-00002.safetensors:  39%|████████████████████████████                                            | 3.89G/9.98G [01:36<01:52, 54.2MB/s][A
Downloading (…)of-00002.safetensors:  39%|████████████████████████████▏                                           | 3.90G/9.98G [01:36<02:15, 44.7MB/s][A
Downloading (…)of-00002.safetensors:  39%|████████████████████████████▏                                           | 3.91G/9.98G [01:36<01:57, 51.8MB/s][A
Downloading (…)of-00002.safetensors:  39%|████████████████████████████▎                                           | 3.92G/9.98G [01:37<02:26, 41.4MB/s][A
Downloading (…)of-00002.safetensors:  39%|████████████████████████████

Downloading (…)of-00002.safetensors:  44%|████████████████████████████████                                        | 4.44G/9.98G [01:51<02:06, 43.8MB/s][A
Downloading (…)of-00002.safetensors:  45%|████████████████████████████████                                        | 4.45G/9.98G [01:51<01:47, 51.6MB/s][A
Downloading (…)of-00002.safetensors:  45%|████████████████████████████████▏                                       | 4.46G/9.98G [01:51<02:18, 39.7MB/s][A
Downloading (…)of-00002.safetensors:  45%|████████████████████████████████▏                                       | 4.47G/9.98G [01:51<01:55, 47.6MB/s][A
Downloading (…)of-00002.safetensors:  45%|████████████████████████████████▎                                       | 4.48G/9.98G [01:52<01:58, 46.3MB/s][A
Downloading (…)of-00002.safetensors:  45%|████████████████████████████████▍                                       | 4.49G/9.98G [01:52<01:57, 46.8MB/s][A
Downloading (…)of-00002.safetensors:  45%|████████████████████████████

Downloading (…)of-00002.safetensors:  50%|████████████████████████████████████                                    | 4.99G/9.98G [02:05<02:14, 37.1MB/s][A
Downloading (…)of-00002.safetensors:  50%|████████████████████████████████████                                    | 5.00G/9.98G [02:05<02:51, 29.0MB/s][A
Downloading (…)of-00002.safetensors:  50%|████████████████████████████████████▏                                   | 5.01G/9.98G [02:06<03:02, 27.2MB/s][A
Downloading (…)of-00002.safetensors:  50%|████████████████████████████████████▏                                   | 5.02G/9.98G [02:06<02:24, 34.2MB/s][A
Downloading (…)of-00002.safetensors:  50%|████████████████████████████████████▎                                   | 5.03G/9.98G [02:06<02:57, 27.9MB/s][A
Downloading (…)of-00002.safetensors:  51%|████████████████████████████████████▍                                   | 5.04G/9.98G [02:07<04:06, 20.0MB/s][A
Downloading (…)of-00002.safetensors:  51%|████████████████████████████

Downloading (…)of-00002.safetensors:  56%|████████████████████████████████████████                                | 5.56G/9.98G [02:20<01:42, 43.1MB/s][A
Downloading (…)of-00002.safetensors:  56%|████████████████████████████████████████▏                               | 5.57G/9.98G [02:20<01:26, 50.9MB/s][A
Downloading (…)of-00002.safetensors:  56%|████████████████████████████████████████▎                               | 5.58G/9.98G [02:20<01:32, 47.5MB/s][A
Downloading (…)of-00002.safetensors:  56%|████████████████████████████████████████▎                               | 5.59G/9.98G [02:20<01:31, 47.8MB/s][A
Downloading (…)of-00002.safetensors:  56%|████████████████████████████████████████▍                               | 5.60G/9.98G [02:21<02:03, 35.5MB/s][A
Downloading (…)of-00002.safetensors:  56%|████████████████████████████████████████▍                               | 5.61G/9.98G [02:21<01:40, 43.3MB/s][A
Downloading (…)of-00002.safetensors:  56%|████████████████████████████

Downloading (…)of-00002.safetensors:  61%|████████████████████████████████████████████                            | 6.10G/9.98G [02:32<01:24, 46.0MB/s][A
Downloading (…)of-00002.safetensors:  61%|████████████████████████████████████████████                            | 6.11G/9.98G [02:33<01:31, 42.1MB/s][A
Downloading (…)of-00002.safetensors:  61%|████████████████████████████████████████████▏                           | 6.12G/9.98G [02:33<01:22, 46.9MB/s][A
Downloading (…)of-00002.safetensors:  61%|████████████████████████████████████████████▎                           | 6.13G/9.98G [02:33<01:56, 33.0MB/s][A
Downloading (…)of-00002.safetensors:  62%|████████████████████████████████████████████▎                           | 6.14G/9.98G [02:34<02:56, 21.8MB/s][A
Downloading (…)of-00002.safetensors:  62%|████████████████████████████████████████████▍                           | 6.17G/9.98G [02:35<02:02, 31.1MB/s][A
Downloading (…)of-00002.safetensors:  62%|████████████████████████████

Downloading (…)of-00002.safetensors:  67%|████████████████████████████████████████████████                        | 6.66G/9.98G [02:47<01:11, 46.4MB/s][A
Downloading (…)of-00002.safetensors:  67%|████████████████████████████████████████████████▏                       | 6.67G/9.98G [02:47<01:09, 47.8MB/s][A
Downloading (…)of-00002.safetensors:  67%|████████████████████████████████████████████████▏                       | 6.68G/9.98G [02:48<01:02, 52.5MB/s][A
Downloading (…)of-00002.safetensors:  67%|████████████████████████████████████████████████▎                       | 6.69G/9.98G [02:48<01:18, 41.8MB/s][A
Downloading (…)of-00002.safetensors:  67%|████████████████████████████████████████████████▎                       | 6.70G/9.98G [02:48<01:05, 50.0MB/s][A
Downloading (…)of-00002.safetensors:  67%|████████████████████████████████████████████████▍                       | 6.71G/9.98G [02:48<01:17, 42.2MB/s][A
Downloading (…)of-00002.safetensors:  67%|████████████████████████████

Downloading (…)of-00002.safetensors:  72%|███████████████████████████████████████████████████▉                    | 7.20G/9.98G [03:02<01:17, 36.0MB/s][A
Downloading (…)of-00002.safetensors:  72%|████████████████████████████████████████████████████                    | 7.21G/9.98G [03:02<01:08, 40.5MB/s][A
Downloading (…)of-00002.safetensors:  72%|████████████████████████████████████████████████████▏                   | 7.22G/9.98G [03:02<00:58, 46.7MB/s][A
Downloading (…)of-00002.safetensors:  73%|████████████████████████████████████████████████████▏                   | 7.24G/9.98G [03:03<01:08, 39.8MB/s][A
Downloading (…)of-00002.safetensors:  73%|████████████████████████████████████████████████████▎                   | 7.25G/9.98G [03:03<00:59, 46.1MB/s][A
Downloading (…)of-00002.safetensors:  73%|████████████████████████████████████████████████████▎                   | 7.26G/9.98G [03:03<00:58, 46.8MB/s][A
Downloading (…)of-00002.safetensors:  73%|████████████████████████████

Downloading (…)of-00002.safetensors:  78%|███████████████████████████████████████████████████████▉                | 7.75G/9.98G [03:15<00:48, 46.3MB/s][A
Downloading (…)of-00002.safetensors:  78%|███████████████████████████████████████████████████████▉                | 7.76G/9.98G [03:16<00:54, 40.9MB/s][A
Downloading (…)of-00002.safetensors:  78%|████████████████████████████████████████████████████████                | 7.77G/9.98G [03:16<01:12, 30.6MB/s][A
Downloading (…)of-00002.safetensors:  78%|████████████████████████████████████████████████████████▏               | 7.78G/9.98G [03:17<01:02, 35.3MB/s][A
Downloading (…)of-00002.safetensors:  78%|████████████████████████████████████████████████████████▏               | 7.79G/9.98G [03:17<01:05, 33.4MB/s][A
Downloading (…)of-00002.safetensors:  78%|████████████████████████████████████████████████████████▎               | 7.80G/9.98G [03:17<00:55, 39.4MB/s][A
Downloading (…)of-00002.safetensors:  78%|████████████████████████████

Downloading (…)of-00002.safetensors:  83%|███████████████████████████████████████████████████████████▊            | 8.29G/9.98G [03:30<00:32, 51.7MB/s][A
Downloading (…)of-00002.safetensors:  83%|███████████████████████████████████████████████████████████▉            | 8.30G/9.98G [03:30<00:39, 42.4MB/s][A
Downloading (…)of-00002.safetensors:  83%|████████████████████████████████████████████████████████████            | 8.32G/9.98G [03:31<00:33, 50.3MB/s][A
Downloading (…)of-00002.safetensors:  83%|████████████████████████████████████████████████████████████            | 8.33G/9.98G [03:31<00:40, 40.9MB/s][A
Downloading (…)of-00002.safetensors:  84%|████████████████████████████████████████████████████████████▏           | 8.35G/9.98G [03:31<00:30, 53.6MB/s][A
Downloading (…)of-00002.safetensors:  84%|████████████████████████████████████████████████████████████▎           | 8.36G/9.98G [03:31<00:33, 47.7MB/s][A
Downloading (…)of-00002.safetensors:  84%|████████████████████████████

Downloading (…)of-00002.safetensors:  89%|████████████████████████████████████████████████████████████████        | 8.87G/9.98G [03:43<00:22, 49.9MB/s][A
Downloading (…)of-00002.safetensors:  89%|████████████████████████████████████████████████████████████████        | 8.88G/9.98G [03:44<00:26, 41.3MB/s][A
Downloading (…)of-00002.safetensors:  89%|████████████████████████████████████████████████████████████████▏       | 8.89G/9.98G [03:44<00:21, 49.7MB/s][A
Downloading (…)of-00002.safetensors:  89%|████████████████████████████████████████████████████████████████▏       | 8.90G/9.98G [03:44<00:18, 57.3MB/s][A
Downloading (…)of-00002.safetensors:  89%|████████████████████████████████████████████████████████████████▎       | 8.91G/9.98G [03:44<00:24, 44.2MB/s][A
Downloading (…)of-00002.safetensors:  89%|████████████████████████████████████████████████████████████████▍       | 8.92G/9.98G [03:44<00:19, 52.7MB/s][A
Downloading (…)of-00002.safetensors:  90%|████████████████████████████

Downloading (…)of-00002.safetensors:  95%|████████████████████████████████████████████████████████████████████    | 9.44G/9.98G [03:57<00:12, 42.1MB/s][A
Downloading (…)of-00002.safetensors:  95%|████████████████████████████████████████████████████████████████████▏   | 9.45G/9.98G [03:57<00:10, 50.0MB/s][A
Downloading (…)of-00002.safetensors:  95%|████████████████████████████████████████████████████████████████████▎   | 9.46G/9.98G [03:57<00:12, 40.4MB/s][A
Downloading (…)of-00002.safetensors:  95%|████████████████████████████████████████████████████████████████████▎   | 9.47G/9.98G [03:57<00:10, 47.2MB/s][A
Downloading (…)of-00002.safetensors:  95%|████████████████████████████████████████████████████████████████████▍   | 9.48G/9.98G [03:58<00:11, 44.9MB/s][A
Downloading (…)of-00002.safetensors:  95%|████████████████████████████████████████████████████████████████████▍   | 9.49G/9.98G [03:58<00:10, 46.9MB/s][A
Downloading (…)of-00002.safetensors:  95%|████████████████████████████

Downloading (…)of-00002.safetensors:   1%|▍                                                                       | 21.0M/3.50G [00:00<02:05, 27.7MB/s][A
Downloading (…)of-00002.safetensors:   1%|▋                                                                       | 31.5M/3.50G [00:00<01:24, 41.2MB/s][A
Downloading (…)of-00002.safetensors:   1%|▊                                                                       | 41.9M/3.50G [00:01<01:37, 35.5MB/s][A
Downloading (…)of-00002.safetensors:   1%|█                                                                       | 52.4M/3.50G [00:01<01:17, 44.4MB/s][A
Downloading (…)of-00002.safetensors:   2%|█▎                                                                      | 62.9M/3.50G [00:01<01:07, 50.6MB/s][A
Downloading (…)of-00002.safetensors:   2%|█▌                                                                      | 73.4M/3.50G [00:01<01:22, 41.7MB/s][A
Downloading (…)of-00002.safetensors:   2%|█▋                          

Downloading (…)of-00002.safetensors:  16%|████████████                                                             | 577M/3.50G [00:15<01:47, 27.1MB/s][A
Downloading (…)of-00002.safetensors:  17%|████████████▏                                                            | 587M/3.50G [00:15<01:24, 34.4MB/s][A
Downloading (…)of-00002.safetensors:  17%|████████████▍                                                            | 598M/3.50G [00:15<01:34, 30.6MB/s][A
Downloading (…)of-00002.safetensors:  17%|████████████▋                                                            | 608M/3.50G [00:16<01:16, 38.0MB/s][A
Downloading (…)of-00002.safetensors:  18%|████████████▉                                                            | 619M/3.50G [00:16<01:26, 33.4MB/s][A
Downloading (…)of-00002.safetensors:  18%|█████████████                                                            | 629M/3.50G [00:16<01:09, 41.3MB/s][A
Downloading (…)of-00002.safetensors:  18%|█████████████▎              

Downloading (…)of-00002.safetensors:  32%|███████████████████████▎                                                | 1.13G/3.50G [00:28<01:01, 38.2MB/s][A
Downloading (…)of-00002.safetensors:  33%|███████████████████████▌                                                | 1.14G/3.50G [00:28<00:52, 44.8MB/s][A
Downloading (…)of-00002.safetensors:  33%|███████████████████████▋                                                | 1.15G/3.50G [00:29<01:27, 26.9MB/s][A
Downloading (…)of-00002.safetensors:  33%|███████████████████████▉                                                | 1.16G/3.50G [00:29<01:20, 28.9MB/s][A
Downloading (…)of-00002.safetensors:  34%|████████████████████████▏                                               | 1.17G/3.50G [00:30<01:23, 27.8MB/s][A
Downloading (…)of-00002.safetensors:  34%|████████████████████████▎                                               | 1.18G/3.50G [00:30<01:13, 31.6MB/s][A
Downloading (…)of-00002.safetensors:  34%|████████████████████████▌   

Downloading (…)of-00002.safetensors:  48%|██████████████████████████████████▌                                     | 1.68G/3.50G [00:43<00:32, 56.0MB/s][A
Downloading (…)of-00002.safetensors:  48%|██████████████████████████████████▋                                     | 1.69G/3.50G [00:43<00:42, 43.0MB/s][A
Downloading (…)of-00002.safetensors:  49%|██████████████████████████████████▉                                     | 1.70G/3.50G [00:43<00:34, 51.7MB/s][A
Downloading (…)of-00002.safetensors:  49%|███████████████████████████████████▏                                    | 1.71G/3.50G [00:44<00:37, 47.4MB/s][A
Downloading (…)of-00002.safetensors:  49%|███████████████████████████████████▎                                    | 1.72G/3.50G [00:44<00:36, 49.2MB/s][A
Downloading (…)of-00002.safetensors:  49%|███████████████████████████████████▌                                    | 1.73G/3.50G [00:44<00:46, 37.7MB/s][A
Downloading (…)of-00002.safetensors:  50%|████████████████████████████

Downloading (…)of-00002.safetensors:  64%|██████████████████████████████████████████████▎                         | 2.25G/3.50G [00:59<00:29, 41.9MB/s][A
Downloading (…)of-00002.safetensors:  65%|██████████████████████████████████████████████▌                         | 2.26G/3.50G [00:59<00:34, 36.0MB/s][A
Downloading (…)of-00002.safetensors:  65%|██████████████████████████████████████████████▊                         | 2.28G/3.50G [00:59<00:28, 42.3MB/s][A
Downloading (…)of-00002.safetensors:  65%|███████████████████████████████████████████████                         | 2.29G/3.50G [00:59<00:30, 40.0MB/s][A
Downloading (…)of-00002.safetensors:  66%|███████████████████████████████████████████████▏                        | 2.30G/3.50G [01:00<00:25, 48.0MB/s][A
Downloading (…)of-00002.safetensors:  66%|███████████████████████████████████████████████▍                        | 2.31G/3.50G [01:00<00:30, 38.9MB/s][A
Downloading (…)of-00002.safetensors:  66%|████████████████████████████

Downloading (…)of-00002.safetensors:  80%|█████████████████████████████████████████████████████████▌              | 2.80G/3.50G [01:12<00:19, 35.4MB/s][A
Downloading (…)of-00002.safetensors:  80%|█████████████████████████████████████████████████████████▊              | 2.81G/3.50G [01:13<00:23, 29.5MB/s][A
Downloading (…)of-00002.safetensors:  81%|██████████████████████████████████████████████████████████              | 2.82G/3.50G [01:13<00:18, 37.0MB/s][A
Downloading (…)of-00002.safetensors:  81%|██████████████████████████████████████████████████████████▏             | 2.83G/3.50G [01:13<00:15, 42.8MB/s][A
Downloading (…)of-00002.safetensors:  81%|██████████████████████████████████████████████████████████▍             | 2.84G/3.50G [01:13<00:16, 40.7MB/s][A
Downloading (…)of-00002.safetensors:  81%|██████████████████████████████████████████████████████████▋             | 2.85G/3.50G [01:13<00:13, 49.5MB/s][A
Downloading (…)of-00002.safetensors:  82%|████████████████████████████

Downloading (…)of-00002.safetensors:  96%|█████████████████████████████████████████████████████████████████████▏  | 3.37G/3.50G [01:27<00:03, 36.4MB/s][A
Downloading (…)of-00002.safetensors:  96%|█████████████████████████████████████████████████████████████████████▍  | 3.38G/3.50G [01:27<00:02, 43.2MB/s][A
Downloading (…)of-00002.safetensors:  97%|█████████████████████████████████████████████████████████████████████▋  | 3.39G/3.50G [01:28<00:03, 36.8MB/s][A
Downloading (…)of-00002.safetensors:  97%|█████████████████████████████████████████████████████████████████████▉  | 3.40G/3.50G [01:28<00:02, 40.2MB/s][A
Downloading (…)of-00002.safetensors:  97%|██████████████████████████████████████████████████████████████████████  | 3.41G/3.50G [01:28<00:02, 36.4MB/s][A
Downloading (…)of-00002.safetensors:  98%|██████████████████████████████████████████████████████████████████████▎ | 3.42G/3.50G [01:28<00:01, 44.0MB/s][A
Downloading (…)of-00002.safetensors:  98%|████████████████████████████

In [6]:
model = model.eval()
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNo

In [7]:
from transformers import pipeline

tokenized_prompt = tokenizer(prompt)

print(f'prompt is {len(tokenized_prompt["input_ids"][0])} tokens')

prompt is 41 tokens


In [8]:
from transformers import GenerationConfig

generation_config = GenerationConfig(max_new_tokens=2000)

pipeline = pipeline("text-generation", 
                    model=model, 
                    tokenizer=tokenizer, 
                    generation_config=generation_config)

In [None]:
pipeline(prompt)