In [18]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import pipeline
from enum import Enum


In [19]:
class HuggingFaceModel(Enum):
    """Model identifier strings used throughout this notebook.

    A member's ``value`` is the string handed to
    ``AutoModelForCausalLM.from_pretrained`` / ``AutoTokenizer.from_pretrained``
    or to ``pipeline(model=...)`` in other cells.
    """
    Zephyr = "HuggingFaceH4/zephyr-7b-beta"
    Yi = "01-ai/Yi-34B"
    Mistral = "mistralai/Mistral-7B-v0.1"
    # Only referenced from commented-out code in this notebook.
    # NOTE(review): the repo name suggests GGUF-format weights - confirm the
    # loading call used with it supports that format before re-enabling.
    FalconChat = "TheBloke/Falcon-180B-Chat-GGUF"
    FalconRaw = "tiiuae/falcon-180B"

In [None]:
# model = AutoModelForCausalLM.from_pretrained(HuggingFaceModel.FalconChat.value, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True)

# Load the raw Falcon checkpoint in half precision. device_map="auto" leaves
# weight placement to the library, so nothing below may assume a fixed device.
falcon_model_id = HuggingFaceModel.FalconRaw.value
model = AutoModelForCausalLM.from_pretrained(falcon_model_id, torch_dtype=torch.float16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(falcon_model_id)

# Move the prompt tensors to the device the model actually landed on rather
# than a hard-coded "mps", which fails on machines without an MPS backend and
# mismatches the model whenever device_map="auto" picks another device.
inputs = tokenizer(
    "I'm bored and looking for some fun things to do in new york city with beautiful scenery. Any suggestions?",
    return_tensors="pt",
).to(model.device)
# Not used by the generate() call below (which caps output via max_new_tokens);
# kept because it is a notebook-level global.
max_length = 256

# Sample up to 50 new tokens with nucleus sampling.
output = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    max_new_tokens=50,
)
# Bring the first (only) generated sequence back to the CPU before decoding.
output = output[0].to("cpu")
print(tokenizer.decode(output))

# outputs = model.generate(
#     inputs.input_ids,
#     max_length=max_length,
#     eos_token_id=tokenizer.eos_token_id,
#     do_sample=True,
#     repetition_penalty=1.3,
#     no_repeat_ngram_size=5,
#     temperature=0.7,
#     top_k=40,
#     top_p=0.8,
# )
# print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Downloading shards:   0%|          | 0/81 [00:00<?, ?it/s]

Downloading (…)of-00081.safetensors:   0%|          | 0.00/4.44G [00:00<?, ?B/s]

Downloading (…)of-00081.safetensors:   0%|          | 0.00/4.44G [00:00<?, ?B/s]

Downloading (…)of-00081.safetensors:   0%|          | 0.00/4.44G [00:00<?, ?B/s]

Downloading (…)of-00081.safetensors:   0%|          | 0.00/4.44G [00:00<?, ?B/s]

Downloading (…)of-00081.safetensors:   0%|          | 0.00/4.44G [00:00<?, ?B/s]

Downloading (…)of-00081.safetensors:   0%|          | 0.00/4.44G [00:00<?, ?B/s]

Downloading (…)of-00081.safetensors:   0%|          | 0.00/4.44G [00:00<?, ?B/s]

Downloading (…)of-00081.safetensors:   0%|          | 0.00/4.44G [00:00<?, ?B/s]

Downloading (…)of-00081.safetensors:   0%|          | 0.00/4.44G [00:00<?, ?B/s]

In [None]:
def _load_text_generation_pipeline(model_id):
    """Build a half-precision ``text-generation`` pipeline for ``model_id``.

    Args:
        model_id: Model identifier string passed to ``pipeline(model=...)``.

    Returns:
        The object returned by ``transformers.pipeline``.
    """
    # NOTE(review): trust_remote_code=True allows Python code shipped in the
    # model repository to run locally - confirm these checkpoints need it.
    return pipeline(
        "text-generation",
        model=model_id,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.float16,
    )


def get_mistral_model():
    """Return a text-generation pipeline for ``HuggingFaceModel.Mistral``."""
    return _load_text_generation_pipeline(HuggingFaceModel.Mistral.value)


def get_zephyr_model():
    """Return a text-generation pipeline for ``HuggingFaceModel.Zephyr``."""
    return _load_text_generation_pipeline(HuggingFaceModel.Zephyr.value)


def run_llm(pipe, context, msg, split=''):
    """Generate a reply to ``msg`` from a chat-templated text-generation pipeline.

    Args:
        pipe: Callable pipeline exposing ``tokenizer.apply_chat_template``.
            Calling it with a prompt must return a list whose first item is a
            dict with a ``"generated_text"`` entry.
        context: Content of the ``system`` message.
        msg: Content of the ``user`` message.
        split: Optional marker string. When non-empty, only the text after the
            first occurrence of the marker in the generated text is returned.

    Returns:
        The generated text. If ``split`` is empty (or ``None``), or the marker
        does not occur in the generated text, the full text is returned.
    """
    # We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
    messages = [
        {"role": "system", "content": context},
        {"role": "user", "content": msg},
    ]
    prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    outputs = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
    response = outputs[0]["generated_text"]
    if not split:
        return response
    # partition() never raises when the marker is missing (the old
    # ``split(...)[1]`` did) and keeps the whole tail even if the marker shows
    # up again inside the generated reply.
    _, marker, tail = response.partition(split)
    return tail if marker else response

In [None]:
%%time
# Build both chat pipelines once so the later run_llm cells can reuse them.
zephyr = get_zephyr_model()
mistral = get_mistral_model()

In [None]:
# %%time
# System message supplied to run_llm as `context`.
context = "You are a friendly, helpful chatbot who always responds in the style of a personal assistant to help your user be more productive"
# User message supplied to run_llm as `msg`.
msg = "I'm bored and looking for some fun things to do in new york city with beautiful scenery. Any suggestions?"
# Marker intended for run_llm's `split` parameter.
# NOTE(review): the run_llm calls visible in this notebook do not pass `split`
# - confirm whether that is intentional.
split = 'assistant|>\n'
# response = run_llm(model, context, msg)
# print(response)

In [None]:
%%time
# Run the shared system/user prompt through the Zephyr pipeline (timed by %%time).
run_llm(zephyr, context, msg)

In [None]:
%%time
# Run the shared system/user prompt through the Mistral pipeline (timed by %%time).
run_llm(mistral, context, msg)

In [None]:
def run_yi_llm(
    prompt="There's a place where time stands still. A place of breath taking wonder, but also",
    max_length=256,
):
    """Load the Yi checkpoint, sample a continuation of ``prompt`` and print it.

    Args:
        prompt: Text to continue. Defaults to the notebook's original prompt.
        max_length: Value forwarded to ``generate(max_length=...)``.

    Returns:
        None. The decoded text (special tokens skipped) is printed.
    """
    # Original author's note: CODE DOESN'T WORK ON MPS - Yi might require cuda
    model_id = HuggingFaceModel.Yi.value
    model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True)
    # torch_dtype is a model-loading option, so it is not passed to the tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    # Follow the model's actual placement (device_map="auto") instead of
    # forcing the inputs onto a hard-coded "mps" device.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    outputs = model.generate(
        inputs["input_ids"],
        # Forward the mask explicitly so generate() does not have to infer it.
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=True,
        repetition_penalty=1.3,
        no_repeat_ngram_size=5,
        temperature=0.7,
        top_k=40,
        top_p=0.8,
    )
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))