In [1]:
from langchain.llms import LlamaCpp
from langchain import PromptTemplate, LLMChain
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [2]:
import os

# Home directory of the current user; the model path is built relative to it.
user_path = os.path.expanduser("~")

In [3]:
# Model directory under the user's home, then the GGUF weights file inside it.
model_path = os.path.join(
    os.path.join(user_path, "llm-models", "cpp"),
    "llama-2-13b-chat-ggml-q4_0.gguf",
)

In [4]:
# Prompt text with a single {question} placeholder. The answer is primed with
# a step-by-step instruction; a blank line separates question and answer.
template = (
    "Question: {question}\n"
    "\n"
    "Answer: Let's work this out in a step by step way to be sure we have the right answer."
)

# Wrap the template text so that "question" is its only input variable.
prompt = PromptTemplate(input_variables=["question"], template=template)

In [5]:
# Callbacks support token-wise streaming: a stdout streaming handler is wrapped
# in a CallbackManager, which is handed to each LlamaCpp instance created below.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
# Note: the LlamaCpp constructors below pass verbose=True, which is required
# for output to be passed to the callback manager.

## CPU

In [6]:
# CPU variant (see the section heading): no GPU offload arguments are supplied.
# Make sure the model path is correct for your system!
llm = LlamaCpp(
    verbose=True,
    callback_manager=callback_manager,
    model_path=model_path,
)

ggml_init_cublas: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 2080 Ti, compute capability 7.5
llama_model_loader: loaded meta data with 16 key-value pairs and 363 tensors from /home/ldf/llm-models/cpp/llama-2-13b-chat-ggml-q4_0.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_0     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    1:               output_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    2:                    output.weight q6_K     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_q.weight q4_0     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.attn_k.weight q4_0     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    5:              blk.0.attn_v.weight q4_0     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    6:         blk.0.attn_output.weight q4_0     [  512

In [7]:
# Free-form prompt sent straight to the model, without a PromptTemplate.
# It is stored under its own name so that `prompt`, which an earlier cell binds
# to a PromptTemplate, is not silently replaced by a plain string.
rap_battle_prompt = """
Question: A rap battle between Stephen Colbert and John Oliver
"""
# Calling the LLM object directly runs generation; the returned text is the
# cell's displayed value.
llm(rap_battle_prompt)


The scene is set in a dimly lit underground nightclub, where the air is thick with anticipation. The crowd is rowdy and excited, eagerly awaiting the main event: a rap battle between two of the biggest names in comedy, Stephen Colbert and John Oliver.

As the MC introduces the competitors, the tension in the room reaches a fever pitch. Stephen Colbert, dressed in his signature suit and tie, struts confidently onto the stage, mic in hand. John Oliver, sporting a sleek black leather jacket and a mischievous grin, follows close behind.

The battle begins, with each rapper taking turns spitting bars at each other. Stephen Colbert kicks things off with a barrage of witty one-liners and clever wordplay:

"I'm the king of comedy, the prince of satire,
My jokes are so sharp, they'll leave you impaled."

John Oliver retaliates with a series of biting disses and slick rhymes:

"You may have a suit and a tie, but I've got style and grace,
My jabs are


llama_print_timings:        load time =   932.48 ms
llama_print_timings:      sample time =   120.10 ms /   256 runs   (    0.47 ms per token,  2131.52 tokens per second)
llama_print_timings: prompt eval time =  1858.45 ms /    16 tokens (  116.15 ms per token,     8.61 tokens per second)
llama_print_timings:        eval time = 79934.70 ms /   255 runs   (  313.47 ms per token,     3.19 tokens per second)
llama_print_timings:       total time = 82537.04 ms


'\nThe scene is set in a dimly lit underground nightclub, where the air is thick with anticipation. The crowd is rowdy and excited, eagerly awaiting the main event: a rap battle between two of the biggest names in comedy, Stephen Colbert and John Oliver.\n\nAs the MC introduces the competitors, the tension in the room reaches a fever pitch. Stephen Colbert, dressed in his signature suit and tie, struts confidently onto the stage, mic in hand. John Oliver, sporting a sleek black leather jacket and a mischievous grin, follows close behind.\n\nThe battle begins, with each rapper taking turns spitting bars at each other. Stephen Colbert kicks things off with a barrage of witty one-liners and clever wordplay:\n\n"I\'m the king of comedy, the prince of satire,\nMy jokes are so sharp, they\'ll leave you impaled."\n\nJohn Oliver retaliates with a series of biting disses and slick rhymes:\n\n"You may have a suit and a tie, but I\'ve got style and grace,\nMy jabs are'

## GPU

In [8]:
# GPU settings; tune both values for your hardware.
n_batch = 512  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
n_gpu_layers = 40  # Change this value based on your model and your GPU VRAM pool.

# Same model file and callbacks as the CPU example, plus the GPU settings above.
# Make sure the model path is correct for your system!
llm = LlamaCpp(
    verbose=True,
    callback_manager=callback_manager,
    n_batch=n_batch,
    n_gpu_layers=n_gpu_layers,
    model_path=model_path,
)

llama_model_loader: loaded meta data with 16 key-value pairs and 363 tensors from /home/ldf/llm-models/cpp/llama-2-13b-chat-ggml-q4_0.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_0     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    1:               output_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    2:                    output.weight q6_K     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_q.weight q4_0     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.attn_k.weight q4_0     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    5:              blk.0.attn_v.weight q4_0     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    6:         blk.0.attn_output.weight q4_0     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    7:            blk.0.ffn_gate.weight q4_0     [

In [9]:
# Rebuild the question template and pair it with the current `llm` in a chain.
prompt = PromptTemplate(input_variables=["question"], template=template)
llm_chain = LLMChain(llm=llm, prompt=prompt)

In [10]:
# Question that is substituted into the template's {question} placeholder.
question = "What NFL team won the Super Bowl in the year Justin Bieber was born?"

# Run the chain on the question; the returned text is the cell's displayed value.
llm_chain.run(question)



Step 1: Justin Bieber was born on March 1, 1994.

Step 2: The Super Bowl is played at the end of each NFL season, usually in early February.

Step 3: Since Justin Bieber was born in March and the Super Bowl is played in February, we know that the Super Bowl did not happen in the same year that Justin Bieber was born.

So, there is no NFL team that won the Super Bowl in the year Justin Bieber was born (1994).


llama_print_timings:        load time =  1456.77 ms
llama_print_timings:      sample time =    54.64 ms /   122 runs   (    0.45 ms per token,  2232.92 tokens per second)
llama_print_timings: prompt eval time =  1456.70 ms /    45 tokens (   32.37 ms per token,    30.89 tokens per second)
llama_print_timings:        eval time =  5893.48 ms /   121 runs   (   48.71 ms per token,    20.53 tokens per second)
llama_print_timings:       total time =  7689.81 ms


'\n\nStep 1: Justin Bieber was born on March 1, 1994.\n\nStep 2: The Super Bowl is played at the end of each NFL season, usually in early February.\n\nStep 3: Since Justin Bieber was born in March and the Super Bowl is played in February, we know that the Super Bowl did not happen in the same year that Justin Bieber was born.\n\nSo, there is no NFL team that won the Super Bowl in the year Justin Bieber was born (1994).'