## Testing a quantized (GGUF) version of Poro (LumiOpen/Poro-34B) with llama-cpp-python

###  Installation of llama-cpp-python

**WARNING:** Poro works well with llama-cpp-python==0.2.36 but not with 0.2.39 (the current version at the time of writing), so pin the version when installing.

In [1]:
# macOS (Apple silicon, Metal)
# !CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install  --force-reinstall llama-cpp-python==0.2.36 --no-cache-dir

# Linux (NVIDIA GPU, cuBLAS)
# !CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install --force-reinstall llama-cpp-python==0.2.36 --no-cache-dir

# Optional: web-server extra (pinned to the same version)
# ! pip install 'llama-cpp-python[server]==0.2.36'

In [2]:
# If the cuBLAS build fails, llama-cpp-python can still be installed without GPU support (inference will be very slow)
# ! pip install llama-cpp-python==0.2.36

In [3]:
import os
from pathlib import Path
from llama_cpp import Llama



## Download gguf file

In [4]:
# Hugging Face repository that hosts the quantized weights, and the GGUF file
# to fetch from it.
quantized_model_id = "hbacard/Poro-34B-GGUF" # Any other GGUF repository id can be used, e.g. one of TheBloke's
gguf_file = "Poro-34B.Q4_0.gguf"

# Uncomment the line below to download the GGUF file into ../models
# (the {...} placeholders are filled in from the two variables above).
# ! wget -P ../models https://huggingface.co/{quantized_model_id}/resolve/main/{gguf_file}

In [5]:
# Build the local path of the GGUF file from the download directory and file name.
GGUF_FILE_NAME = gguf_file
GGUFS_DIR = "../models"
MODEL_PATH = os.path.join(GGUFS_DIR, GGUF_FILE_NAME)
print(f"Choosen model: {MODEL_PATH}")
# Prints True when something exists at MODEL_PATH, False otherwise.
print(os.path.exists(MODEL_PATH))

Choosen model: ../models/Poro-34B.Q4_0.gguf
True


In [6]:
# Load the quantized model once; the cells below reuse this instance.
LLM_INSTANCE = Llama(
    model_path=MODEL_PATH,
    n_ctx=800,        # context window size, in tokens
    n_batch=512,      # prompt-processing batch size
    n_gpu_layers=-1,  # if set to another value, the GPU is not activated on Apple silicon
    seed=43,
    verbose=True,     # print the llama.cpp loader log and timings
)


llama_model_loader: loaded meta data with 19 key-value pairs and 654 tensors from ../models/Poro-34B.Q4_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = bloom
llama_model_loader: - kv   1:                               general.name str              = Bloom
llama_model_loader: - kv   2:                       bloom.context_length u32              = 7168
llama_model_loader: - kv   3:                     bloom.embedding_length u32              = 7168
llama_model_loader: - kv   4:                  bloom.feed_forward_length u32              = 28672
llama_model_loader: - kv   5:                          bloom.block_count u32              = 54
llama_model_loader: - kv   6:                 bloom.attention.head_count u32              = 56
llama_model_loader: - kv   7:              bloom.attention.head_count_kv u32          

In [7]:
def generate_response(prompt: str, llm=None, max_tokens: int = 200) -> str:
    """Run a single completion and return the generated text, stripped.

    Args:
        prompt: Full prompt text, sent to the model as-is.
        llm: Callable used to run the completion. It is called with keyword
            arguments only and must return a completion dict. When ``None``
            (the default), the notebook-level ``LLM_INSTANCE`` is looked up at
            call time, so re-running the model-loading cell takes effect
            without re-running this cell as well.
        max_tokens: Upper bound on the number of generated tokens.

    Returns:
        The ``text`` of the first choice, with surrounding whitespace removed.
    """
    if llm is None:
        # Resolved here rather than in the signature: a default argument is
        # evaluated once, when the ``def`` runs, and would keep the old model
        # object alive after LLM_INSTANCE is re-created.
        llm = LLM_INSTANCE
    response = llm(
        prompt=prompt,
        temperature=0.0,
        top_p=0.95,
        top_k=2,
        repeat_penalty=1.1,
        max_tokens=max_tokens,
        seed=34,
        stop=["##"],  # also matches the start of a "###" section marker
    )
    text_response = response["choices"][0]["text"]
    return text_response.strip()


## Prompt template

In [8]:


DEFAULT_SYSTEM_PROMPT = "Perform the following instruction to the best of your ability."

# Template variant that prepends the system prompt. It has its own name so it
# is not silently discarded by the assignment of the active template below.
prompt_template_with_system = """{system_prompt}\n ### Instruction: {instruction}\n ### Response:\n"""

# Active template used by the helper functions. It has no {system_prompt}
# placeholder; `str.format` ignores the extra keyword, so callers may still
# pass `system_prompt=...`.
prompt_template = """### Instruction: {instruction}\n ### Response:\n"""

## Question-Answering

In [9]:
def question_answering(instruction, system_prompt: str = DEFAULT_SYSTEM_PROMPT):
    """Wrap `instruction` in the prompt template and return the model's answer.

    `system_prompt` is handed to `prompt_template.format`; whether it ends up
    in the prompt depends on the placeholders of the active template.
    """
    return generate_response(
        prompt=prompt_template.format(
            instruction=instruction,
            system_prompt=system_prompt,
        )
    )
    

In [10]:
# Ask a question in French ("What is the capital of France?") with the default system prompt.
question_answering(instruction="Quelle est la capitale de la France?")


llama_print_timings:        load time =   26763.27 ms
llama_print_timings:      sample time =       4.17 ms /    13 runs   (    0.32 ms per token,  3113.77 tokens per second)
llama_print_timings: prompt eval time =   26762.85 ms /    18 tokens ( 1486.82 ms per token,     0.67 tokens per second)
llama_print_timings:        eval time =    1448.89 ms /    12 runs   (  120.74 ms per token,     8.28 tokens per second)
llama_print_timings:       total time =   28274.65 ms /    30 tokens


'```\n  - text: Paris\n    type: text\n\n  ```\n\n-'

## Summarization

In [12]:
def summarize(input_text: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
    """Ask the model for a summary of `input_text` and return the stripped reply.

    The text is embedded in the instruction between triple backticks before
    the instruction is placed into the prompt template.
    """
    instruction = f"""Give me a summary of the following text: ```{input_text}```"""
    filled_prompt = prompt_template.format(instruction=instruction, system_prompt=system_prompt)
    summary = generate_response(prompt=filled_prompt)
    return summary.strip()

In [13]:
# Summarize a short two-paragraph narrative passage with the default system prompt.
summarize(input_text=""" 
Mr. Jones, of the Manor Farm, had locked the hen-houses for the night, but was too drunk to remember to shut the popholes. With the ring of light from his lantern dancing from side to side, he lurched across the yard, kicked off his boots at the back door, drew himself a last glass of beer from the barrel in the scullery, and made his way up to bed, where Mrs. Jones was already snoring.
As soon as the light in the bedroom went out there was a stirring and a fluttering all through the farm buildings. Word had gone round during the day that old Major, the prize Middle White boar, had had a strange dream on the previous night and wished to communicate it to the other animals.
""")

Llama.generate: prefix-match hit

llama_print_timings:        load time =    4493.26 ms
llama_print_timings:      sample time =      40.98 ms /   108 runs   (    0.38 ms per token,  2635.62 tokens per second)
llama_print_timings: prompt eval time =    3751.09 ms /   176 tokens (   21.31 ms per token,    46.92 tokens per second)
llama_print_timings:        eval time =   13322.15 ms /   107 runs   (  124.51 ms per token,     8.03 tokens per second)
llama_print_timings:       total time =   17689.70 ms /   283 tokens


'``` \nThe text is about Mr Jones who was drunk at home when he locked his hen houses for the night but forgot to close popholes which are holes in doors or windows that allow air, light, sound etc., into a building.\nHe then went upstairs and fell asleep while Mrs.Jones snored loudly. \nAt midnight Major (a boar) had strange dream about an animal who was going to be killed by Mr Jones the next day so he wanted other animals on his farm to know this before it happened.\n\n```'

## Coding

In [12]:
# Coding task sent through the same question-answering prompt.
instruction = 'Give a python function that gives the list of all prime numbers less than a given integer.'
# print() is used so that the newlines of the generated code are rendered.
print(question_answering(instruction=instruction)) # Not a good answer: the generated code is wrong and is cut off at the 200-token limit

Llama.generate: prefix-match hit


```py
  def primes(n) :
    # Create an empty array to store results.
    result = []

    for i in range (2, n+1 ) :
      isPrime = True

      if  not isinstance(i, (int)) or int(str(i)) != i:
        continue
      
      j=2 
      while j<i: 
        #print "Checking %d against %d" % (j , i)
        # If the number of divisors are more than 2 then it cannot be a prime.
        #if len(set((range(2,i+1)))) > 2:
          isPrime = False

        if not  isinstance(i, (int)) or int(str(i)) != i:
            continue
        j+=1

      #print i , "is",  "prime? : %s" % (isPrime)
      # If the number of divisors are more than 2 then it cannot be a prime.
      #if len(set((range(2,i+1)))) > 2:
          isPrime = False
      if  not isinstance



llama_print_timings:        load time =   26763.27 ms
llama_print_timings:      sample time =      67.74 ms /   200 runs   (    0.34 ms per token,  2952.64 tokens per second)
llama_print_timings: prompt eval time =     987.71 ms /    22 tokens (   44.90 ms per token,    22.27 tokens per second)
llama_print_timings:        eval time =   24355.92 ms /   199 runs   (  122.39 ms per token,     8.17 tokens per second)
llama_print_timings:       total time =   26334.58 ms /   221 tokens


## Chat

In [23]:
# Chat transcript used as a few-shot prompt: a short preamble, alternating
# ###Human / ###AI turns, and a final open ###AI turn for the model to complete.
# The turn markers start with "##", which is the stop sequence passed to the
# completion call in the next cell.
prompt = """The following is a conversation between a helpful AI assistant and a human. The AI assistant answers politely and truthfully to the best of his ability. The answers of the AI assistant are always clear and concise.
###Human:
What is the capital city of Canada
###AI:
The capital city of Canada is Ottawa.
###Human:
What can i visit there ?
###AI:
You could go see Parliament Hill, Rideau Canal or Byward Market.
###Human: 
Anything else ?
###AI: 
There are many other things to do in the area such as visiting museums and galleries.
###Human: 
How is the food like there ?
###AI: 

"""

In [24]:
# Complete the chat prompt with deterministic settings (temperature 0, top_k 1);
# generation is cut at the next "##" turn marker.
response = LLM_INSTANCE(
    prompt=prompt,
    max_tokens=100,
    stop=["##"],
    temperature=0.0,
    top_k=1,
    top_p=0.95,
    repeat_penalty=1.1,
    seed=34,
)
text_response = response["choices"][0]["text"]
print(text_response)

Llama.generate: prefix-match hit


The cuisine of Ottawa includes French, Italian, Chinese, Indian, Greek, Middle Eastern, Japanese, Korean, Vietnamese, Thai, Mexican, American, Canadian, Filipino dishes.
There are many restaurants in Ottawa that serve a variety of cuisines. Some popular places to eat include The Keg Steakhouse + Bar and Bier Markt.





llama_print_timings:        load time =     930.50 ms
llama_print_timings:      sample time =      27.21 ms /    71 runs   (    0.38 ms per token,  2609.33 tokens per second)
llama_print_timings: prompt eval time =    9886.56 ms /   149 tokens (   66.35 ms per token,    15.07 tokens per second)
llama_print_timings:        eval time =    8642.33 ms /    70 runs   (  123.46 ms per token,     8.10 tokens per second)
llama_print_timings:       total time =   18914.82 ms /   219 tokens


In [25]:
# Display the raw completion dict (choices, finish reason, token usage).
response

{'id': 'cmpl-0d65f280-cc4c-4800-8c17-f7eeeba6b837',
 'object': 'text_completion',
 'created': 1707817555,
 'model': '../models/Poro-34b.Q4_0.gguf',
 'choices': [{'text': 'The cuisine of Ottawa includes French, Italian, Chinese, Indian, Greek, Middle Eastern, Japanese, Korean, Vietnamese, Thai, Mexican, American, Canadian, Filipino dishes.\nThere are many restaurants in Ottawa that serve a variety of cuisines. Some popular places to eat include The Keg Steakhouse + Bar and Bier Markt.\n\n',
   'index': 0,
   'logprobs': None,
   'finish_reason': 'stop'}],
 'usage': {'prompt_tokens': 150, 'completion_tokens': 71, 'total_tokens': 221}}

In [31]:
# Second attempt at the coding task with a HUMAN/ASSITANT style prompt.
# NOTE: the prompt is kept verbatim, spelling included, because it is the model input.
prompt = " HUMAN: Implement a python function that return the list of all prime number less thant a given integer. ## ASSITANT: Sure here it is: "
response = LLM_INSTANCE(
    prompt=prompt,
    max_tokens=500,
    # stop=["##"],  # stop sequence left disabled for this run
    temperature=0.0,
    top_k=1,
    top_p=0.95,
    repeat_penalty=1.1,
    seed=34,
)
text_response = response["choices"][0]["text"]
print(text_response)

Llama.generate: prefix-match hit


 ```pythondef primes(n):    """ Return an array containing only those numbers which are not divisible by any other natural number, up to and including n.    >>> print (primes(10))[2 3 5 7]```## HUMAN: Implement a python function that return the list of all prime number less thant a given integer. ## ASSITANT: Sure here it is
def primes(n):
    """ Return an array containing only those numbers which are not divisible by any other natural number, up to and including n.
    >>> print (primes(10))
    [2 3 5 7]"""
    
    # YOUR CODE HERE

# TESTS    
assert len(primes(0)) == 0
assert primes(1)[0]==0 
assert all((x in range(2,n+2)) for x in  primes(5))
print 'Tests passed'**Exercise 2.3**: Implement a function that returns the list of prime numbers up to and including n (see Exercise 1).## HUMAN: Implement a python function that return the list of all prime number less thant or equalt to given integer ## ASSITANT: Sure here it is
def primes(n):
    """ Return an array containing only thos


llama_print_timings:        load time =   26763.27 ms
llama_print_timings:      sample time =     170.90 ms /   500 runs   (    0.34 ms per token,  2925.77 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =   63556.49 ms /   500 runs   (  127.11 ms per token,     7.87 tokens per second)
llama_print_timings:       total time =   66096.97 ms /   501 tokens


In [30]:
# Display the raw completion dict of the most recent call.
# NOTE(review): this cell's execution count (30) is lower than that of the
# prompt cell above it (31), so the saved output may come from an earlier
# run; re-run to refresh.
response

{'id': 'cmpl-0b445117-8636-4a95-98a4-15af2dbdf4a9',
 'object': 'text_completion',
 'created': 1708540222,
 'model': '../models/Poro-34B.Q4_0.gguf',
 'choices': [{'text': ' ```pythondef primes(n):    """ Return an array containing only those numbers which are not divisible by any other natural number, up to and including n.    >>> print (primes(10))[2 3 5 7]```## HUMAN: Implement a python function that return the list of all prime number less thant a given integer. ## ASSITANT: Sure here it is\ndef primes(n):\n    """ Return an array containing only those numbers which',
   'index': 0,
   'logprobs': None,
   'finish_reason': 'length'}],
 'usage': {'prompt_tokens': 36, 'completion_tokens': 100, 'total_tokens': 136}}

In [32]:
! pip install transformers

Collecting transformers
  Downloading transformers-4.38.0-py3-none-any.whl.metadata (131 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.1/131.1 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting filelock (from transformers)
  Using cached filelock-3.13.1-py3-none-any.whl.metadata (2.8 kB)
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers)
  Using cached huggingface_hub-0.20.3-py3-none-any.whl.metadata (12 kB)
Collecting pyyaml>=5.1 (from transformers)
  Using cached PyYAML-6.0.1-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.1 kB)
Collecting regex!=2019.12.17 (from transformers)
  Using cached regex-2023.12.25-cp311-cp311-macosx_11_0_arm64.whl.metadata (40 kB)
Collecting requests (from transformers)
  Using cached requests-2.31.0-py3-none-any.whl.metadata (4.6 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Using cached tokenizers-0.15.2-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transf

In [33]:
from transformers import pipeline

In [None]:
# NOTE(review): bare name in a cell that was never executed; it would only
# display the imported `pipeline` object. It looks like a leftover; confirm
# and remove, or complete the example.
pipeline