In [1]:
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from transformers import pipeline, AutoModelForCausalLM

In [2]:
# Install the model
model_name = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
model_file = "mistral-7b-instruct-v0.2.Q2_K.gguf"
model_path = hf_hub_download(model_name, filename=model_file)

In [None]:
llm = Llama(
    model_path = model_path
)

In [None]:
input = "Q: Who is the president of the USA? A: "
output = llm(
    prompt = input,
    max_tokens = 32,
    stop = ["Q:", "\n"],
    echo=True
)
print(output)

In [None]:
input = "Q: Can you give my a recipe to make hibachi shrimp fried rice? A: "
output = llm(
    prompt = input,
    max_tokens = 100,
    stop = [],
    echo=False
)
print(output["choices"]) # this generated 100 tokens in about 15 seconds, which is pretty good

In [None]:
input = "Q: Can you give my a recipe to make hibachi shrimp fried rice? A: "
output = llm(
    prompt = input,
    max_tokens = 1000, # more tokens this time
    stop = ["</s>"],
    echo=False
)
print(output["choices"]) # this generated 489 tokens in about 90 seconds, meaning about 100 tokens every 18 seconds, still not too bad

In [None]:
print(model_path)

In [None]:
# Defining a larger model with more context size
llm_large = Llama(
    model_path = model_path,
    n_ctx=2048
)

In [None]:
# Testing to see what happens if I set the context size to be larger?
input = "Q: Can you give my a recipe to make hibachi shrimp fried rice? A: "
output = llm_large(
    prompt = input,
    max_tokens = 100,
    stop = [],
    echo=False
)
print(output["choices"]) # generates 100 tokens in 19.1 s, so still not bad for 2048 context size

In [None]:
# 4096 context size model
llm_4096 = Llama(
    model_path = model_path,
    n_ctx=4096
)

In [None]:
# Testing the speed
input = "Q: Can you give my a recipe to make hibachi shrimp fried rice? A: "
output = llm_4096(
    prompt = input,
    max_tokens = 100,
    stop = [],
    echo=False
)
print(output["choices"]) # generates 100 tokens in 16 seconds, even faster than the 2000 size; I think most of the differences are random, it seems the context size doesn't really make a difference

In [7]:
model = AutoModelForCausalLM.from_pretrained("TheBloke/Mistral-7B-Instruct-v0.2-GGUF")
pipe = pipeline("text-generation", model)

OSError: TheBloke/Mistral-7B-Instruct-v0.2-GGUF does not appear to have a file named pytorch_model.bin, tf_model.h5, model.ckpt or flax_model.msgpack.