In [1]:
from langchain.llms import LlamaCpp
from langchain import PromptTemplate, LLMChain
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [2]:
import os

user_path = os.path.expanduser("~")

In [3]:
model_path = os.path.join(user_path, "llm-models", "llama-2-7b-chat.ggmlv3.q8_0.bin")

In [4]:
template = """Question: {question}

Answer: Let's work this out in a step by step way to be sure we have the right answer."""

prompt = PromptTemplate(template=template, input_variables=["question"])

In [5]:
# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
# Verbose is required to pass to the callback manager

## CPU

In [6]:
# Make sure the model path is correct for your system!
llm = LlamaCpp(
    model_path=model_path,
    input={"temperature": 0.75, "max_length": 2000, "top_p": 1},
    callback_manager=callback_manager,
    verbose=True,
)

ggml_init_cublas: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 2080 Ti, compute capability 7.5
llama.cpp: loading model from /home/ldf/llm-models/llama-2-7b-chat.ggmlv3.q8_0.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 512
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_head_kv  = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: n_gqa      = 1
llama_model_load_internal: rnorm_eps  = 1.0e-06
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: freq_base  = 10000.0
llama_model_load_internal: freq_scale = 1
llama_model_load_internal: ftype      = 7 (mostly Q8_0)
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size =    0.08 MB
llama_model_load_internal:

In [7]:
prompt = """
Question: A rap battle between Stephen Colbert and John Oliver
"""
llm(prompt)


(Note: This is purely fictional, for entertainment purposes only. Both Stephen Colbert and John Oliver are real people with their own actual rap skills, but for the sake of this scenario, let's assume they have magically transformed into rap superstars.)

[Scene opens with Stephen Colbert standing on stage, mic in hand, dressed in a sleek black suit and sunglasses. The crowd is going wild.]
Stephen Colbert: Yo, what's good y'all? It's your boy S-C, the king of late night!
[John Oliver emerges from backstage, dressed in a sharp gray suit and shades, smirking at Stephen.]
John Oliver: Oh, you think you're the king of late night? Please. I'm the real MVP, the one they all come to see.
[The crowd cheers as the two comedians launch into their raps, trading bars and insults like pros.]
Stephen Colbert: You may have a show, but I've got the flow. My rhymes are tighter than your suit, John!
John Oliver: Talk to the hand,


llama_print_timings:        load time =  1932.49 ms
llama_print_timings:      sample time =   115.82 ms /   256 runs   (    0.45 ms per token,  2210.35 tokens per second)
llama_print_timings: prompt eval time =  3871.18 ms /    16 tokens (  241.95 ms per token,     4.13 tokens per second)
llama_print_timings:        eval time = 72962.58 ms /   255 runs   (  286.13 ms per token,     3.49 tokens per second)
llama_print_timings:       total time = 77571.67 ms


"\n(Note: This is purely fictional, for entertainment purposes only. Both Stephen Colbert and John Oliver are real people with their own actual rap skills, but for the sake of this scenario, let's assume they have magically transformed into rap superstars.)\n\n[Scene opens with Stephen Colbert standing on stage, mic in hand, dressed in a sleek black suit and sunglasses. The crowd is going wild.]\nStephen Colbert: Yo, what's good y'all? It's your boy S-C, the king of late night!\n[John Oliver emerges from backstage, dressed in a sharp gray suit and shades, smirking at Stephen.]\nJohn Oliver: Oh, you think you're the king of late night? Please. I'm the real MVP, the one they all come to see.\n[The crowd cheers as the two comedians launch into their raps, trading bars and insults like pros.]\nStephen Colbert: You may have a show, but I've got the flow. My rhymes are tighter than your suit, John!\nJohn Oliver: Talk to the hand,"

## GPU

In [8]:
n_gpu_layers = 40  # Change this value based on your model and your GPU VRAM pool.
n_batch = 512  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.

# Make sure the model path is correct for your system!
llm = LlamaCpp(
    model_path=model_path,
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    callback_manager=callback_manager,
    verbose=True,
)

llama.cpp: loading model from /home/ldf/llm-models/llama-2-7b-chat.ggmlv3.q8_0.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 512
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_head_kv  = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: n_gqa      = 1
llama_model_load_internal: rnorm_eps  = 1.0e-06
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: freq_base  = 10000.0
llama_model_load_internal: freq_scale = 1
llama_model_load_internal: ftype      = 7 (mostly Q8_0)
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size =    0.08 MB
llama_model_load_internal: using CUDA for GPU acceleration
llama_model_load_internal: mem required  =  434.90 MB (+  256.00 MB pe

In [9]:
prompt = PromptTemplate(template=template, input_variables=["question"])
llm_chain = LLMChain(prompt=prompt, llm=llm)

In [10]:
question = "What NFL team won the Super Bowl in the year Justin Bieber was born?"

llm_chain.run(question)

 
Justin Bieber was born on March 1, 1994.
The first Super Bowl took place in 1967 and since then there has been a Super Bowl every year except for 1970 and 1983.
So, the NFL team that won the Super Bowl in the year Justin Bieber was born (1994) is... the Pittsburgh Steelers!
That's right, the Steelers won Super Bowl XXXIX (the 39th edition of the Super Bowl) on February 6, 2005.


llama_print_timings:        load time =   388.33 ms
llama_print_timings:      sample time =    59.55 ms /   132 runs   (    0.45 ms per token,  2216.62 tokens per second)
llama_print_timings: prompt eval time =   388.26 ms /    45 tokens (    8.63 ms per token,   115.90 tokens per second)
llama_print_timings:        eval time =  3489.89 ms /   131 runs   (   26.64 ms per token,    37.54 tokens per second)
llama_print_timings:       total time =  4257.37 ms


" \nJustin Bieber was born on March 1, 1994.\nThe first Super Bowl took place in 1967 and since then there has been a Super Bowl every year except for 1970 and 1983.\nSo, the NFL team that won the Super Bowl in the year Justin Bieber was born (1994) is... the Pittsburgh Steelers!\nThat's right, the Steelers won Super Bowl XXXIX (the 39th edition of the Super Bowl) on February 6, 2005."