In [None]:
# Install or upgrade the packages this notebook uses (-q: quiet, -U: upgrade).
# `%pip` (rather than `!pip`) targets the running kernel's environment.
# TODO(review): versions are not pinned, so a re-run may pull different releases.
%pip install -qU petals xformers accelerate

In [1]:
import os

# Hugging Face access token, read from the environment so it never appears in
# the notebook. Raises KeyError if HUGGINGFACE_API_KEY is not set.
auth_token = os.environ["HUGGINGFACE_API_KEY"]

# Location of the model download cache. The fallback path below is specific to
# one machine, so an already-exported TRANSFORMERS_CACHE is respected instead
# of being overwritten.
# NOTE(review): this is set before the cell that imports `transformers`,
# presumably so the library picks it up at import time -- confirm.
os.environ.setdefault(
    "TRANSFORMERS_CACHE", "/media/limcheekin/My Passport/transformers_cache"
)


In [3]:
%%time

from transformers import AutoTokenizer
from petals import AutoDistributedModelForCausalLM
import torch

model_name = "meta-llama/Llama-2-70b-chat-hf"

# The tokenizer and the distributed model are fetched with the same token.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=auth_token)

# float32 is requested explicitly because Petals logged this warning:
#   [WARN] Running the client with dtype bfloat16 on CPU may be slow,
#   since your CPU doesn't support AVX512.
#   Consider loading the model with torch_dtype='float32'
# REF: https://github.com/bigscience-workshop/petals/issues/321
model = AutoDistributedModelForCausalLM.from_pretrained(
    model_name,
    use_auth_token=auth_token,
    torch_dtype=torch.float32,
)

Jul 25 15:52:02.813 [[1m[34mINFO[0m] Make sure you follow the LLaMA's terms of use: https://bit.ly/llama2-license for LLaMA 2, https://bit.ly/llama-license for LLaMA 1
Jul 25 15:52:02.817 [[1m[34mINFO[0m] Using DHT prefix: Llama-2-70b-chat-hf
Loading checkpoint shards: 100%|██████████| 3/3 [00:32<00:00, 10.87s/it]


CPU times: user 3.3 s, sys: 2.57 s, total: 5.86 s
Wall time: 42.3 s


In [5]:
# REF: https://huggingface.co/spaces/huggingface-projects/llama-2-13b-chat/blob/main/model.py#L24
DEFAULT_SYSTEM_PROMPT = "You are a helpful assistant.\nYou will try to answer user questions, but don't make up the answer if you don't have the answer.\nYou will complete tasks by following user instructions."


def get_prompt(message: str,
               chat_history: "list[tuple[str, str]] | None" = None,
               system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
    """Build a Llama-2 chat prompt string from a message and optional history.

    The result starts with the system prompt wrapped in ``<<SYS>>`` tags inside
    the first ``[INST]`` block, followed by each past exchange, and ends with
    the new message closed by ``[/INST]``. No leading ``<s>`` is emitted here.

    Args:
        message: The new user message; surrounding whitespace is stripped.
        chat_history: Earlier ``(user_input, response)`` pairs, oldest first.
            ``None`` (the default) or an empty list means no history.
        system_prompt: Text placed between the ``<<SYS>>`` tags, inserted as-is.

    Returns:
        The assembled prompt as a single string.
    """
    texts = [f'[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n']
    # `None` stands in for "no history" so that no mutable list object is
    # shared between calls through the default argument.
    for user_input, response in chat_history or ():
        texts.append(f'{user_input.strip()} [/INST] {response.strip()} </s><s> [INST] ')
    texts.append(f'{message.strip()} [/INST]')
    return ''.join(texts)

In [None]:
# Display the prompt string built for a single message with no chat history
# and the default system prompt.
get_prompt("What is AI?")

In [6]:
# Timings observed for this cell:
#   default torch_dtype (bfloat16): 2m 21.6s
#   torch_dtype=torch.float32:      1m 29.3s
# float32 is faster here but uses 2x the memory, according to
# https://github.com/bigscience-workshop/petals/issues/321

prompt = "What is AI?"

# Tokenize the formatted chat prompt and keep only the token ids.
encoded = tokenizer(get_prompt(prompt), return_tensors="pt")
inputs = encoded["input_ids"]

# Generate at most 64 new tokens, then decode the first returned sequence.
outputs = model.generate(inputs, max_new_tokens=64)
generated_text = tokenizer.decode(outputs[0])
print(generated_text)

Jul 25 15:54:29.250 [[1m[34mINFO[0m] Route found: 0:40 via …uCst2v => 40:80 via …o6L7CF


<s> [INST] <<SYS>>
You are a helpful assistant.
You will try to answer user questions, but don't make up the answer if you don't have the answer.
You will complete tasks by following user instructions.
<</SYS>>

What is AI? [/INST]  AI stands for Artificial Intelligence. It refers to the ability of machines or computer programs to mimic intelligent human behavior, such as learning, problem-solving, and decision-making. AI systems use algorithms and data to make predictions, classify objects, and generate insights,


In [None]:
from transformers import pipeline

# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
generate_text = pipeline(
    model=model, tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # Generation parameters are passed here too.
    # TODO(review): `stopping_criteria` is not defined in the cells shown here;
    # the original note says the model rambles during chat without it.
    #stopping_criteria=stopping_criteria,
    # Greedy (deterministic) decoding. This replaces `temperature=0.0`:
    # temperature only applies when sampling and must be strictly positive,
    # so 0.0 was never a usable way to ask for deterministic output.
    do_sample=False,
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=1.1  # without this output begins repeating
)