<a href="https://colab.research.google.com/github/kutyadog/ai_notebooks/blob/main/RouteLLM_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Aug 2024

In [None]:
!pip install "routellm[serve,eval]"
!pip install gradio
!git clone https://github.com/open-webui/open-webui.git

In [None]:
# Original docker invocation, kept for reference; the active line below runs
# the same image and arguments through udocker instead.
# !docker run -d -p 3000:8080 --add-host=host.docker.internal:host-gateway -v open-webui:/app/backend/data --name open-webui --restart always ghcr.io/open-webui/open-webui:main
# NOTE(review): udocker is only installed by the `%%shell` cell that follows
# this one, so that cell has to be executed first on a fresh runtime.
# NOTE(review): the flags are copied verbatim from the docker command —
# confirm udocker accepts -d / --add-host / --restart.
!udocker --allow-root run -d -p 3000:8080 --add-host=host.docker.internal:host-gateway -v open-webui:/app/backend/data --name open-webui --restart always ghcr.io/open-webui/open-webui:main

In [None]:
%%shell
# Install the udocker package with pip, then run udocker's own `install`
# step. --allow-root is passed explicitly (presumably because the Colab
# runtime runs as root — confirm).
pip install udocker
udocker --allow-root install

In [None]:
# !docker-init
# Install the docker.io package through apt with quiet output (-qq).
!apt-get -qq install docker.io

In [None]:
import os
from google.colab import userdata
# Copy the OpenAI key from the Colab `userdata` store into the environment.
# This is deliberately done before the routellm import below.
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')
# Replace with your model provider, we use Anyscale's Mixtral here.
# NOTE(review): weak_model below is an Anyscale-hosted model, but the line
# that would set ANYSCALE_API_KEY is commented out — confirm the key is
# provided some other way before routing calls to the weak model.
# os.environ["ANYSCALE_API_KEY"] = userdata.get('ANYSCALE_API_KEY')

from routellm.controller import Controller

# Controller using the "mf" router to choose between the strong and the weak
# model named here.
client = Controller(
  routers=["mf"],
  strong_model="gpt-4-1106-preview",
  weak_model="anyscale/mistralai/Mixtral-8x7B-Instruct-v0.1",
)

In [None]:
# Run RouteLLM's threshold calibration for the "mf" router, asking for 50% of
# calls (--strong-model-pct 0.5) to go to the strong model.
# NOTE(review): config.example.yaml is not created by any visible cell of this
# notebook; presumably it comes from the RouteLLM repository — confirm.
!python -m routellm.calibrate_threshold --routers mf --strong-model-pct 0.5 --config config.example.yaml
# Recorded result of the command above:
# For 50.0% strong model calls for mf, threshold = 0.11593
# Alternative entry points, left disabled:
# !python -m routellm.openai_server --routers mf --strong-model gpt-4-1106-preview --weak-model anyscale/mistralai/Mixtral-8x7B-Instruct-v0.1
# !python -m examples.router_chat --router mf --threshold 0.11593

In [None]:
# @title Gradio interface for testing routes

import re
import gradio as gr
from routellm.controller import Controller

# Defaults for the two sliders shown in the chat UI.
TEMPERATURE = 0.8
# Threshold recorded from the calibration run (50% strong-model calls for mf).
THRESHOLD = 0.11593
# Router name; used both to build the Controller and to build the
# "router-<name>-<threshold>" model string in predict().
ROUTER = "mf"

# `os` and `userdata` are imported by an earlier cell of this notebook.
os.environ["GROQ_API_KEY"] = userdata.get('GROQ_API_KEY')

# Use the ROUTER constant instead of repeating the literal, so the Controller
# always knows the router that predict() asks for.
client = Controller(
  routers=[ROUTER],
  strong_model="gpt-4-1106-preview",
  weak_model="groq/llama3-8b-8192"
)


# Notes:

# https://github.com/bnurbekov/LLM-Agents/blob/6118dcc59605e88030551b1a543bab66d6bb2416/route_llm/agent.py

# good examples of chat with rag, memory, etc.
# https://rito.hashnode.dev/building-rag-in-2024-with-langchain-groq-llama3-and-qdrant

# https://huggingface.co/spaces/routellm/demo/blob/main/app.py
# https://github.com/ralphbutler/LLM_misc/blob/36b470103ae1aa88d75e90c3ff1e4d7e9fd48674/routellm_demo1.py (wasn't working at last check)
# https://www.youtube.com/watch?v=jc2RCG1Ys7g

# python embeddings, etc.
# https://www.linkedin.com/pulse/write-query-engine-olm-stack-groq-krishna-tripathi-ulj6f/
#

def predict(message, history, threshold, temperature):
    """Stream a routed chat completion for the Gradio chat interface.

    Args:
        message: The new user message.
        history: Earlier turns as (user_text, assistant_text) pairs. Either
            side of a pair may be None; such entries are skipped.
        threshold: Router threshold; it is embedded in the model name sent to
            the controller together with ROUTER.
        temperature: Sampling temperature forwarded to the completion call.

    Yields:
        First the bold "[model]" tag line on its own, then, after every
        received chunk, the tag line followed by all text received so far.
    """
    # Convert chat history to OpenAI format
    history_openai_format = [
        {"role": "system", "content": "You are a helpful AI assistant."}
    ]
    for human, assistant in history:
        if human is not None:
            history_openai_format.append({"role": "user", "content": human})
        if assistant is not None:
            history_openai_format.append(
                {
                    "role": "assistant",
                    # Drop the "**[model]**" tag this function prepends for
                    # display, so it is not sent back to the model.
                    "content": re.sub(r"^\*\*\[.*?\]\*\*\s*", "", assistant),
                }
            )
    history_openai_format.append({"role": "user", "content": message})

    # Create a streaming chat completion request through the controller
    stream = client.chat.completions.create(
        model=f"router-{ROUTER}-{threshold}",  # Model name to use
        messages=history_openai_format,  # Chat history
        temperature=temperature,  # Temperature for text generation
        stream=True,  # Stream response
        max_tokens=512,
    )

    # Read and return generated text from response stream
    partial_message = ""
    for i, chunk in enumerate(stream):
        if i == 0:
            # The first chunk tells us which model the router picked.
            model_prefix = f"**[{chunk.model}]**\n"
            yield model_prefix
            partial_message += model_prefix
        # A chunk may carry no choices at all; indexing [0] would then raise.
        if chunk.choices:
            partial_message += chunk.choices[0].delta.content or ""
        yield partial_message


# Create and launch a chat interface with Gradio
# Build the chat UI around predict(); the two sliders are passed to predict()
# as its `threshold` and `temperature` arguments, in that order.
demo = gr.ChatInterface(
    predict,
    additional_inputs=[
        gr.Slider(label="Threshold", minimum=0, maximum=1, value=THRESHOLD, step=0.01),
        gr.Slider(
            label="Temperature", minimum=0, maximum=1, value=TEMPERATURE, step=0.1
        ),
    ],
    title="RouteLLM",
    fill_height=True,
    # The weak model configured in this cell is groq/llama3-8b-8192, so the
    # description names that model rather than Mixtral.
    description="This is a demo of our matrix factorization router, calibrated so that approximately 50% of calls (those that are harder) are routed to GPT-4, with remaining calls routed to Llama 3 8B (via Groq).\n\nCheck out https://github.com/lm-sys/RouteLLM for details!",
)

# debug=True keeps the cell running and shows server output in the notebook.
demo.launch(debug=True)



Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://a81c8a9bb4de6a6227.gradio.live




In [None]:
# @title simple example to send message, get output and which model was used

import os

from routellm.controller import Controller

# Expose the Groq key from the Colab `userdata` store through the environment.
os.environ["GROQ_API_KEY"] = userdata.get('GROQ_API_KEY')

# "mf" router choosing between the strong and the weak model named here.
client = Controller(
  weak_model="groq/llama3-8b-8192",
  strong_model="gpt-4-1106-preview",
  routers=["mf"],
)

# The model string tells RouteLLM to use the MF router with a cost threshold
# of 0.11593.
response = client.chat.completions.create(
  messages=[{"role": "user", "content": "hello there! I am Chris."}],
  model="router-mf-0.11593",
)

# Pull out which model answered and what it said.
model_name = response['model']
message_content = response['choices'][0]['message']['content']

print(
  f"Message content: {message_content}",
  f"Model name: {model_name}",
  sep="\n",
)

Message content: Hello Chris! It's nice to meet you! Is there something I can help you with or would you like to chat?
Model name: groq/llama3-8b-8192


# Trying Open WebUI on Colab

This works.

In [None]:
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.request` is loaded.
import urllib.request

# Ask icanhazip.com for this machine's public IPv4 address and print it with
# the label used for the localtunnel password/endpoint IP. The `with` block
# closes the HTTP response once it has been read.
with urllib.request.urlopen('https://ipv4.icanhazip.com') as ip_response:
    public_ip = ip_response.read().decode('utf8').strip("\n")
print("Password/Enpoint IP for localtunnel is:", public_ip)

Password/Enpoint IP for localtunnel is: 35.229.207.30


In [None]:
# Install the localtunnel npm package into the current directory.
!npm install localtunnel
# Download Ollama's install script and pipe it straight into sh.
# NOTE(review): this executes a remote script unverified — acceptable on a
# throwaway Colab runtime, confirm before reusing elsewhere.
!curl -fsSL https://ollama.com/install.sh | sh

In [None]:
!sudo apt-get update
!sudo apt-get install -y python3.11 python3.11-venv python3.11-dev

# Create and activate a virtual environment using Python 3.11
!python3.11 -m venv venv
!source venv/bin/activate

# Upgrade pip within the virtual environment
!venv/bin/python -m pip install --upgrade pip

# Install Open WebUI within the virtual environment
!venv/bin/pip install open-webui

# Write start_servers.py, a helper script that launches `ollama serve`, then
# (after a 5 second pause) `ollama pull mistral-nemo` and Open WebUI on port
# 8081, each in its own thread. The script itself does not start localtunnel;
# that is done by the shell line that runs the script.
# The text between the triple quotes is written to disk verbatim.
with open('start_servers.py', 'w') as f:
    f.write('''
import subprocess
import threading
import os
import time

def start_ollama():
    subprocess.run(['ollama', 'serve'])

def download_model():
    subprocess.run(['ollama', 'pull', 'mistral-nemo'])

def start_open_webui():
    subprocess.run(['venv/bin/open-webui', 'serve', '--port', '8081'])

# Start servers in separate threads
threading.Thread(target=start_ollama).start()
time.sleep(5)
threading.Thread(target=download_model).start()
threading.Thread(target=start_open_webui).start()
''')

# Execute the script
!venv/bin/python start_servers.py && sleep 20 & npx localtunnel --port 8081