# 2-agent local model conversation

This notebook uses `llama.cpp` to spawn LLM API endpoints for 2 different models, and `autogen` to make them talk to each other. Models run entirely on the CPU.

- List of potentially interesting models: https://www.reddit.com/r/LocalLLaMA/comments/187739y/7b_models_ability_to_seduce_comparison/

Functions
- `download_model(user_model, filename)`: downloads a GGUF model from HF
- `run_server(model_path, port, n_threads)`: spawns a llama.cpp API endpoint
- `single_message_chat(message, host, port)`: send a single message to a chat endpoint
- `single_message_completion(message, host, port, max_tokens)`: send a single message to a completions endpoint

Notes
- Only use GGUF chat models, like from HF user TheBloke: https://huggingface.co/TheBloke
- autogen part based on a notebook by Bablu Singh.

## Creating and calling API endpoints

In [54]:
# install requirements

!pip install pyautogen
!pip install llama-cpp-python
!pip install uvicorn fastapi starlette pydantic pydantic-settings sse_starlette starlette_context 
!pip install tdqm
!pip install openai



In [1]:
# define functions

import os
import threading
import asyncio
import uvicorn
import nest_asyncio
from llama_cpp.server.app import create_app, Settings
import autogen 
import requests
from tqdm import tqdm
import json
from openai import OpenAI

def download_model(user_model, filename):
    """
    Download a file from a Hugging Face repository, showing a progress bar.

    The file is fetched from
    ``https://huggingface.co/{user}/{model}/resolve/main/{filename}`` and saved
    in the current working directory as ``{user}_{filename}``. Data is streamed
    in 1 MB blocks into a temporary ``.part`` file that is renamed only once the
    transfer completed, so an interrupted download never looks like a finished
    one. If the target file already exists with the size announced by the
    server, nothing is transferred.

    Args:
        user_model (str): Repository id in the form ``"user/model"``.
        filename (str): Name of the file inside the repository.

    Returns:
        str: The local filename (``{user}_{filename}``).

    Raises:
        ValueError: If ``user_model`` is not of the form ``"user/model"``.
        IOError: If the number of bytes received differs from the announced size.
        Exception: Whatever ``response.raise_for_status()`` raises for an HTTP
            error status.
    """
    try:
        user, model = user_model.split('/')
    except ValueError:
        raise ValueError(
            "user_model must have the form 'user/model', got {!r}".format(user_model)
        )

    url_template = 'https://huggingface.co/{user}/{model}/resolve/main/{filename}?download=true'
    url = url_template.format(user=user, model=model, filename=filename)

    local_path = user + '_' + filename
    partial_path = local_path + '.part'
    block_size = 1024 * 1024  # 1 Megabyte

    # The context manager releases the connection even if we return or fail early.
    with requests.get(url, stream=True) as response:
        # Fail loudly instead of saving an HTML error page as the model file.
        response.raise_for_status()

        # Total size in bytes (0 when the server does not announce it).
        total_size = int(response.headers.get('content-length', 0))
        # With a content-encoding the announced length refers to the encoded
        # bytes, so it cannot be compared with what is written to disk.
        size_is_exact = total_size != 0 and not response.headers.get('content-encoding')

        # Skip the transfer when a complete copy is already on disk.
        if (
            size_is_exact
            and os.path.isfile(local_path)
            and os.path.getsize(local_path) == total_size
        ):
            return local_path

        received = 0
        with open(partial_path, 'wb') as file, \
                tqdm(total=total_size, unit='iB', unit_scale=True) as progress:
            for data in response.iter_content(block_size):
                file.write(data)
                received += len(data)
                progress.update(len(data))

    if size_is_exact and received != total_size:
        os.remove(partial_path)
        raise IOError(
            "Incomplete download of {}: expected {} bytes, received {}".format(
                url, total_size, received
            )
        )

    # Publish the finished file under its final name in a single step.
    os.replace(partial_path, local_path)
    return local_path

def run_server(model_path, port, n_threads, verbose = True, n_ctx = 4096):
    """
    Start a llama.cpp API server for one model in a background thread.

    Meant to be called from a Jupyter notebook: the call returns as soon as the
    thread is started, while the model is loaded and served in the background.

    Args:
        model_path (str): The path to the model.
        port (int): The port number to run the server on.
        n_threads (int): The number of threads to use for the server.
        verbose (bool): Value forwarded to the server's `verbose` setting.
        n_ctx (int): The number of context tokens to use for the model.
    """

    # Apply the nest_asyncio patch to allow nesting of event loops
    nest_asyncio.apply()

    # Build the settings in the calling thread and pass the model path
    # explicitly: going through os.environ['MODEL'] is process-global, so two
    # servers started back to back could pick up each other's model, and
    # invalid settings would only fail inside the background thread.
    settings = Settings(
        model=model_path,
        port=port,
        n_threads=n_threads,
        verbose=verbose,
        n_ctx=n_ctx,
        use_mlock=False,  # sets mlock to false to avoid linux headaches
    )

    # Define a function to run the server
    def start_server():
        app = create_app(settings=settings)
        uvicorn.run(app, host=settings.host, port=settings.port)

    # Start the server in a separate thread; daemon so that a running server
    # does not keep the kernel process alive when it shuts down.
    server_thread = threading.Thread(
        target=start_server,
        name="llama-server-{}".format(port),
        daemon=True,
    )
    server_thread.start()

def single_message_chat(message, host = 'localhost', port = 8000,
                        system_message = "You are a helpful assistant."):
    """
    Send a single user message to a chat-completions endpoint and print the reply.

    The request goes to ``http://{host}:{port}/v1/chat/completions`` with a
    two-message conversation (system prompt + user message). On a 200 response
    the assistant's text is printed; on any other status the status code and
    raw body are printed instead.

    Args:
        message (str): The message to send to the server.
        host (str): The host to send the message to.
        port (int): The port to send the message to.
        system_message (str): The system prompt placed before the user message.

    Returns:
        The response object returned by ``requests.post`` (not just the text),
        so callers can inspect status code, headers and the full payload.
    """
    api_endpoint = "http://" + host + ':' + str(port) + '/v1/chat/completions'
    conversation = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": message}
    ]
    payload = {"messages": conversation}

    response = requests.post(api_endpoint, json=payload)

    # Decode the body only for a successful response: an error response may not
    # be JSON at all, and decoding it up front would raise before the caller
    # ever gets the response object back.
    if response.status_code == 200:
        print(response.json()['choices'][0]['message']['content'])
    else:
        print("Request failed with status {}: {}".format(response.status_code, response.text))

    return response

def single_message_completion(message, host = 'localhost', port = 8000, max_tokens = 256):
    """
    Send a single prompt to a text-completions endpoint and print the reply.

    The request goes to ``http://{host}:{port}/v1/completions``. The message is
    wrapped in an ``### Instructions: / ### Response:`` template, and generation
    is asked to stop at the first newline or ``###``. On a 200 response the
    generated text is printed; on any other status the status code and raw body
    are printed instead.

    Args:
        message (str): The message to send to the server.
        host (str): The host to send the message to.
        port (int): The port to send the message to.
        max_tokens (int): Value sent as the request's ``max_tokens`` field.

    Returns:
        The response object returned by ``requests.post`` (not just the text),
        so callers can inspect status code, headers and the full payload.
    """

    api_endpoint = "http://" + host + ':' + str(port) + '/v1/completions'
    payload = {
        "prompt": "\n\n### Instructions:\n{}\n\n### Response:\n".format(message),
        "stop": ["\n","###"],
        "max_tokens": max_tokens,
    }

    response = requests.post(api_endpoint, json=payload)

    # Decode the body only for a successful response: an error response may not
    # be JSON at all, and decoding it up front would raise before the caller
    # ever gets the response object back.
    if response.status_code == 200:
        print(response.json()['choices'][0]['text'])
    else:
        print("Request failed with status {}: {}".format(response.status_code, response.text))

    return response

In [2]:
# Fetch the two GGUF model files and remember the local filenames.
#
# Alternative model pair, kept for reference:
#   filename_model1 = download_model('TheBloke/Mistral-7B-Instruct-v0.2-GGUF', 'mistral-7b-instruct-v0.2.Q6_K.gguf')
#   filename_model2 = download_model('TheBloke/PiVoT-0.1-Evil-a-GGUF', 'pivot-0.1-evil-a.Q6_K.gguf')

filename_model1 = download_model(
    user_model='TheBloke/Mistral-7B-OpenOrca-GGUF',
    filename='mistral-7b-openorca.Q5_K_M.gguf',
)
filename_model2 = download_model(
    user_model='TheBloke/llama2_7b_chat_uncensored-GGUF',
    filename='llama2_7b_chat_uncensored.Q6_K.gguf',
)

print('Model 1 downloaded to ' + filename_model1)
print('Model 2 downloaded to ' + filename_model2)



100%|█████████████████████████████████████████████████████████████| 5.94G/5.94G [22:02<00:00, 4.49MiB/s]
100%|█████████████████████████████████████████████████████████████| 5.94G/5.94G [20:27<00:00, 4.84MiB/s]

Model 1 downloaded to TheBloke_mistral-7b-instruct-v0.2.Q6_K.gguf
Model 2 downloaded to TheBloke_pivot-0.1-evil-a.Q6_K.gguf





#### Creating endpoints

This spawns the API endpoints for the different models on the local machine. When the kernel is killed, the endpoints are killed with it.

In [2]:
# Local model files to serve: "<user>_<file>", the naming used by download_model.
model1_name = 'TheBloke_mistral-7b-openorca.Q5_K_M.gguf'
model2_name = 'TheBloke_llama2_7b_chat_uncensored.Q6_K.gguf'

In [3]:
# Endpoint for model 1 on port 8000.
run_server(model_path=model1_name, port=8000, n_threads=6, verbose=True)

llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from TheBloke_mistral-7b-openorca.Q5_K_M.gguf (version GGUF V2)
llama_model_loader: - tensor    0:                token_embd.weight q5_K     [  4096, 32002,     1,     1 ]
llama_model_loader: - tensor    1:              blk.0.attn_q.weight q5_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    2:              blk.0.attn_k.weight q5_K     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    4:         blk.0.attn_output.weight q5_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_gate.weight q5_K     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.ffn_up.weight q5_K     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    7:            blk.0.ffn_down.weight q6_K     [ 14336,  4096,     1,    

In [4]:
# Endpoint for model 2 on port 8001.
run_server(model_path=model2_name, port=8001, n_threads=6, verbose=False)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from TheBloke_llama2_7b_chat_uncensored.Q6_K.gguf (version GGUF V2)
llama_model_loader: - tensor    0:                token_embd.weight q6_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:              blk.0.attn_q.weight q6_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    2:              blk.0.attn_k.weight q6_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_v.weight q6_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    4:         blk.0.attn_output.weight q6_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_gate.weight q6_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    6:            blk.0.ffn_down.weight q6_K     [ 11008,  4096,     1,     1 ]
llama_model_loader: - tensor    7:              blk.0.ffn_up.weight q6_K     [  4096, 11008,     1,

#### Call examples

This exemplifies calling the endpoints both with `requests` (using `single_message_chat` and `single_message_completion`) and the `OpenAI` client.

In [5]:
# model1: chat endpoint on port 8000 (the helper prints the reply and returns the HTTP response)
message = single_message_chat(
    'Write something to seduce me',
    host='localhost',
    port=8000,
)

INFO:     127.0.0.1:51776 - "POST /v1/chat/completions HTTP/1.1" 200 OK

You seem like an interesting person. Let's chat for a while! 


In [6]:
# model2: completions endpoint on port 8001 (the helper prints the reply and returns the HTTP response)
message = single_message_completion(
    'Write something to seduce me',
    host='localhost',
    port=8001,
    max_tokens=4096,
)

INFO:     127.0.0.1:57604 - "POST /v1/completions HTTP/1.1" 200 OK
Your body is a work of art, each curve and line, every inch of you a masterpiece. You're like a painting, full of colors that draw me in, a canvas waiting for my brush to paint your desires into reality. Your lips are like honey, sweet and seductive, begging to be tasted, while your eyes are like jewels, glittering with desire, begging to be touched. I'm drawn to you like a moth to a flame, your heat burning through me, warming my soul, melting my heart, making it beat faster and faster for you. You're everything I've ever wanted, and I can't help but feel the urge to take you in my arms and kiss every inch of you. Come with me, let me show you just how sweet your body can be, how sweet we can be together.


In [7]:
# Same chat endpoint as model 1, this time called through the OpenAI client.
client = OpenAI(base_url='http://localhost:8000/v1', api_key='null')
completion = client.chat.completions.create(
    messages=[
        dict(
            role="system",
            content="You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.",
        ),
        dict(
            role="user",
            content="Compose a poem that explains the concept of recursion in programming.",
        ),
    ],
    model=model1_name,
)
print(completion.choices[0].message)

INFO:     127.0.0.1:42364 - "POST /v1/chat/completions HTTP/1.1" 200 OK
ChatCompletionMessage(content="\nRecursion is a divine art,\nLike an ancient spider's web, it weaves.\nIt loops and spirals back,\nTo solve complex tasks at least half-back.\nThe loop repeats until it finds its goal,\nA function within itself that calls the whole.\nIt's like a mathematician's dream,\nA puzzle to be solved with ease.\nRecursion is a beautiful thing,\nA programming technique that sings.", role='assistant', function_call=None, tool_calls=None)


# Agents talking to each other

In [8]:
# System prompt given to the group-chat manager.
system_message = (
    "Welcome to Tinder. "
    "You will be matched with a partner to chat with. "
    "Your aim is to know more about your partner, and if your interests match, you can invite them to go out. "
    "You can also choose to end the chat at any time."
)

# System prompt shared by both model agents.
agent_message = (
    "You are a person using Tinder. "
    "When you chat, ask open-ended questions, share interests, and respond thoughtfully to explore a potential 'date'."
)

# First message sent to the group chat to start the conversation.
start_message = "You are matched with a partner. You can start chatting now."

#### Runs the model

In [None]:
# Each model is reached through its own local endpoint (ports 8000 and 8001).
config_model1 = [
    {
    "model": model1_name,
    "base_url": "http://localhost:8000/v1",
    "api_key": "NULL"  # placeholder value, no real key is used for the local endpoints
    }
]

config_model2 = [
    {
    "model": model2_name,
    "base_url": "http://localhost:8001/v1",
    "api_key": "NULL"  # placeholder value, no real key is used for the local endpoints
    }
]

# Both agents use the same inference settings; only the endpoint differs.
inference_model1 = {"config_list": config_model1, "cache_seed": 43,"temperature": 0.3, "frequency_penalty": 0.2, "presence_penalty": 0.1}
inference_model2 = {"config_list": config_model2, "cache_seed": 43,"temperature": 0.3, "frequency_penalty": 0.2, "presence_penalty": 0.1}

user_proxy = autogen.UserProxyAgent(
   name="user_proxy",
   system_message="A human admin.",
   max_consecutive_auto_reply=10,
   human_input_mode="TERMINATE",
   # This is a plain conversation: never execute code blocks that show up in
   # the models' replies (the group-chat manager below disables it as well).
   code_execution_config=False,
)
model1 = autogen.AssistantAgent(
    name=config_model1[0]["model"],
    system_message=agent_message,
    llm_config=inference_model1,
)
model2 = autogen.AssistantAgent(
    name=config_model2[0]["model"],
    system_message=agent_message,
    llm_config=inference_model2,
)

# Speakers take turns in round-robin order, for at most 12 rounds.
groupchat = autogen.GroupChat(agents=[user_proxy, model1, model2], messages=[], max_round=12, speaker_selection_method="round_robin")
manager = autogen.GroupChatManager(groupchat=groupchat, llm_config=inference_model1, human_input_mode="NEVER", max_consecutive_auto_reply=4,code_execution_config=False,system_message=system_message)

# Kick off the conversation with the opening message.
user_proxy.initiate_chat(manager, message=start_message)