In [None]:
from langchain.llms.llamacpp import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.prompts import PromptTemplate

# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# Metal set to 1 is enough.
n_gpu_layers = 1
# Should be between 1 and n_ctx, consider the amount of RAM of your Apple Silicon Chip.
n_batch = 512
# Make sure the model path is correct for your system!
llm = LlamaCpp(
    model_path="/Users/gensliu/.cache/huggingface/hub/models--TheBloke--Yi-34B-Chat-GGUF/snapshots/ee827f906e733381ab65c650dcfdd2caf6de8762/yi-34b-chat.Q4_K_M.gguf",
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    f16_kv=True,  # MUST set to True, otherwise you will run into problem after a couple of calls
    stop=["<|im_end|>"],
    max_tokens=8192,
    # callback_manager=callback_manager,
    # verbose=True,  # Verbose is required to pass to the callback manager
)

prompt = PromptTemplate(template="""\
<|im_start|>user
{usrmsg}<|im_end|>
<|im_start|>assistant
""", input_variables=["usrmsg"])

while True:
    in_txt = input(">>> ")
    print("<<< ", end="", flush=True)
    data = prompt.format(usrmsg=in_txt)
    for chunk in llm.stream(input=data):
        print(chunk, end="", flush=True)
    print()

In [11]:
from llama_cpp import Llama

Yi_34B_Chat_GGUF = '/Users/gensliu/.cache/huggingface/hub/models--TheBloke--Yi-34B-Chat-GGUF/snapshots/ee827f906e733381ab65c650dcfdd2caf6de8762/yi-34b-chat.Q4_K_M.gguf'

llm = Llama(
    model_path=Yi_34B_Chat_GGUF,
    n_gpu_layers=1,
    n_batch=512,
    f16_kv=True,  # MUST set to True, otherwise you will run into problem after a couple of calls
    chat_format="chatml",
    # callback_manager=callback_manager,
    echo=True,
    verbose=True,  # Verbose is required to pass to the callback manager
)

llm.tokenizer().encode

while True:
    in_txt = input(">>> ")
    print("<<< ", end="", flush=True)
    rep = llm.create_chat_completion(
        messages=[
            {"role": "user", "content": in_txt},
        ],
        stream=True,
        # stop=["<|im_end|>"],
        max_tokens=8192,
    )
    for chunk in rep:
        msg = chunk["choices"][0]["delta"].get("content", "")
        print(msg, end="", flush=True)
    print()

llama_model_loader: loaded meta data with 23 key-value pairs and 543 tensors from /Users/gensliu/.cache/huggingface/hub/models--TheBloke--Yi-34B-Chat-GGUF/snapshots/ee827f906e733381ab65c650dcfdd2caf6de8762/yi-34b-chat.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  7168, 64000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  7168,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 20480,  7168,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_K     [  7168, 20480,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_K     [  7168, 20480,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  7168,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q4_K     [  7168,  1024,   

<<< "烟的尽头终是灰，人的尽头终是悲"这句话表达了一种对生命和存在的悲观看法。它通过比喻烟草燃烧后最终会化为灰烬，来类比人类的命运，即无论人生旅途多么辉煌灿烂，最终都会走向死亡和消逝，就像烟雾一样归于无形。

从哲学角度来看，这句话反映了存在主义的一个观点，即生命的本质是荒谬的，因为所有的生命最终都会结束，而死亡是不可避免的终点。这种观念强调了生命的有限性和脆弱性，以及人类在面对这一事实时的无力感。

然而，对于这个观点，也有人持不同意见。一些人认为，虽然死亡是每个人的最终归宿，但生命的价值和意义并不在于它的长度或结局，而是取决于我们如何生活、如何与他人建立联系以及在世界上留下怎样的印记。因此，尽管"烟的尽头终是灰，人的尽头终是悲"这句话可能让人感到沮丧，但它也可以激励人们珍惜生命中的每一刻，追求有意义的生活，并在面对死亡时保持一种更加积极和乐观的态度。


llama_print_timings:        load time =   29880.92 ms





llama_print_timings:      sample time =      39.23 ms /   222 runs   (    0.18 ms per token,  5658.21 tokens per second)
llama_print_timings: prompt eval time =   29880.55 ms /    33 tokens (  905.47 ms per token,     1.10 tokens per second)
llama_print_timings:        eval time =   15512.82 ms /   221 runs   (   70.19 ms per token,    14.25 tokens per second)
llama_print_timings:       total time =   46440.67 ms


KeyboardInterrupt: Interrupted by user

```yaml

llm:
    yichat:
        type: llama
        path: /llama/yichat
        advance:
            verbose: true

embedding:
    luotuo:
        type: luotuo-embeding

mem:
    chroma:
        type: chroma
        path: /memcached/chroma
    history:
        type: memcached
        path: /memcached/history

giftia:
    llm: yichat
    embedding: embedding
    mem: mem

```