# Multimodal chatbot with streaming

Features a Gradio UI, text and audio streaming, use of the system prompt to add expertise, and the ability to switch between models. TODO: add tools!

Example commercial applications: a language tutor, a company onboarding solution, a companion AI to a course, etc.

In [None]:
from dotenv import load_dotenv
import litellm
import gradio as gr
import base64, io, os, struct
import json

In [None]:
# Load settings via python-dotenv before reading the provider credentials.
load_dotenv(override=True)

# Each value is None when the corresponding environment variable is unset.
openai_api_key = os.environ.get('OPENAI_API_KEY')
groq_api_key = os.environ.get('GROQ_API_KEY')

def validate_api_keys(api_keys, prefixes):
    """Check that every API key is present and looks well-formed.

    Keys and prefixes are paired by position. A key is accepted when it is
    non-empty, starts with its expected prefix, and is longer than 10
    characters.

    Args:
        api_keys: Iterable of API key strings; an entry may be None or empty
            when the key was not configured.
        prefixes: Iterable of expected key prefixes, in the same order and of
            the same length as ``api_keys``.

    Raises:
        ValueError: If the number of keys and prefixes differ, or if any key
            is missing or malformed. The message reports only the position
            and the expected prefix; the key itself is deliberately left out
            so that secrets do not end up in notebook output or logs.
    """
    api_keys = list(api_keys)
    prefixes = list(prefixes)

    # zip() would silently drop the unmatched tail, leaving keys unchecked.
    if len(api_keys) != len(prefixes):
        raise ValueError(
            f"Expected one prefix per API key, got {len(api_keys)} key(s) "
            f"and {len(prefixes)} prefix(es)"
        )

    for position, (api_key, prefix) in enumerate(zip(api_keys, prefixes)):
        if not api_key:
            raise ValueError(
                f"Invalid API key at position {position}: "
                f"missing (expected prefix {prefix!r})"
            )
        if not api_key.startswith(prefix) or len(api_key) <= 10:
            raise ValueError(
                f"Invalid API key at position {position}: "
                f"malformed (expected prefix {prefix!r})"
            )

# Stop here with a ValueError if either provider key is missing or malformed.
validate_api_keys(
    [openai_api_key, groq_api_key],
    ['sk-proj-', 'gsk_'],
)

In [None]:
# Options for the "Audio Model" dropdown; the first entry is the default.
MODEL_CHOICES_AUDIO = [
    "gpt-4o-mini-audio-preview",
    "gpt-4o-audio-preview",
]

# Options for the "Text Model" dropdown; the first entry is the default.
MODEL_CHOICES_TEXT = [
    "groq/openai/gpt-oss-120b",
    "gpt-5-nano-2025-08-07",
    "gpt-4.1-nano-2025-04-14",
]

# Options for the "Transcription Model" dropdown; the first entry is the default.
MODEL_CHOICES_STT = [
    "gpt-4o-mini-transcribe",
    "gpt-4o-transcribe",
    "whisper-1",
]

In [None]:
# Fallback system prompt, used when the user does not type one.
sys_prompt_default = """You are a helpful programming assistant.
You provide exhaustive and detailed answers to programming questions,
with relevant code examples."""

# Ask for a custom system prompt. Surrounding whitespace is stripped so that
# a blank or whitespace-only answer falls back to the default instead of
# becoming an effectively empty system prompt.
sys_prompt = input("Enter system prompt: ").strip() or sys_prompt_default

In [None]:
def make_wav_header(data_size, sample_rate=24000, num_channels=1, bits_per_sample=16):
    """Build the 44-byte RIFF/WAVE header that precedes a PCM payload.

    Args:
        data_size: Length in bytes of the audio data that will follow.
        sample_rate: Samples per second.
        num_channels: Number of audio channels.
        bits_per_sample: Bit depth of each sample.

    Returns:
        bytes: The little-endian header (RIFF descriptor, "fmt " chunk and
        "data" chunk header).
    """
    bits_per_frame = num_channels * bits_per_sample

    # RIFF descriptor: the size field counts everything after itself,
    # i.e. the remaining 36 header bytes plus the payload.
    riff_part = struct.pack('<4sI4s', b'RIFF', 36 + data_size, b'WAVE')

    fmt_part = struct.pack(
        '<4sIHHIIHH',
        b'fmt ',
        16,                                  # length of the fmt chunk body
        1,                                   # format tag: PCM
        num_channels,
        sample_rate,
        sample_rate * bits_per_frame // 8,   # byte rate
        bits_per_frame // 8,                 # block align
        bits_per_sample,
    )

    data_part = struct.pack('<4sI', b'data', data_size)

    return riff_part + fmt_part + data_part

def pcm16_to_wav(pcm_bytes, sample_rate=24000):
    """Return ``pcm_bytes`` prefixed with a WAV header sized for that payload.

    Args:
        pcm_bytes: Raw PCM audio bytes.
        sample_rate: Samples per second written into the header.

    Returns:
        bytes: Header from ``make_wav_header`` followed by the PCM data.
    """
    return make_wav_header(len(pcm_bytes), sample_rate) + pcm_bytes

In [None]:
def messages_from(history):
    """Build the model payload: the system prompt followed by the history.

    Only the "role" and "content" keys of each history entry are copied;
    any other keys are dropped.

    Args:
        history: List of dicts, each with "role" and "content".

    Returns:
        list: A new list starting with the system message built from the
        module-level ``sys_prompt``.
    """
    conversation = [{"role": "system", "content": sys_prompt}]
    for entry in history:
        conversation.append({"role": entry["role"], "content": entry["content"]})
    return conversation

def add_to_history(history, prompt, role):
    """Return a new history list with one message appended.

    The input list is not modified.

    Args:
        history: Existing list of message dicts.
        prompt: Content of the message to append.
        role: Role of the message to append.

    Returns:
        list: ``history`` followed by ``{"role": role, "content": prompt}``.
    """
    new_message = dict(role=role, content=prompt)
    return history + [new_message]

In [None]:
def chat(model, history):
    """Stream a text completion and yield the growing chat history.

    Args:
        model: Model identifier passed to ``litellm.completion``.
        history: Chat history as a list of dicts with "role" and "content".

    Yields:
        list: ``history`` plus one assistant message containing all text
        received so far.
    """
    response = litellm.completion(
        model=model,
        messages=messages_from(history),
        stream=True,
    )

    text_parts = []
    for chunk in response:
        # Skip chunks that carry no choices rather than fail on choices[0].
        if not chunk.choices:
            continue
        text_parts.append(chunk.choices[0].delta.content or "")
        # Extend the caller's history, not the model payload: the payload
        # starts with the system prompt, and yielding it would store that
        # prompt in the chat history, where it would be prepended again
        # (and so duplicated) on every following turn.
        yield add_to_history(history, "".join(text_parts), "assistant")

def chat_with_audio(model, history):
    """Stream a spoken reply, yielding transcript and audio updates.

    Every yielded value is a ``(chat_history, audio)`` pair in which exactly
    one side carries new data and the other is a no-op ``gr.update()``.

    Args:
        model: Model identifier passed to ``litellm.completion``.
        history: Chat history as a list of dicts with "role" and "content".

    Yields:
        tuple: Either ``(history + assistant transcript so far, gr.update())``
        or ``(gr.update(), wav_bytes)`` for a buffered slice of audio.
    """
    response = litellm.completion(
        model=model,
        messages=messages_from(history),
        stream=True,
        modalities=["text", "audio"],
        audio={"voice": "alloy", "format": "pcm16"},
    )

    transcript_parts = []
    audio_buffer = io.BytesIO()
    # Number of transcript chunks seen since the last audio flush.
    chunk_count = 0

    for chunk in response:
        # Skip chunks that carry no choices rather than fail on choices[0].
        if not chunk.choices:
            continue
        if not (audio := getattr(chunk.choices[0].delta, "audio", None)):
            continue

        # stream text
        if text_chunk := audio.get("transcript"):
            transcript_parts.append(text_chunk)
            # Extend the caller's history, not the model payload: the payload
            # starts with the system prompt, and yielding it would store that
            # prompt in the chat history and duplicate it on every turn.
            yield add_to_history(history, "".join(transcript_parts), "assistant"), gr.update()
            chunk_count += 1

        # stream audio
        if audio_data := audio.get("data"):
            audio_buffer.write(base64.b64decode(audio_data))
            # Emit the buffered audio once 20 transcript chunks have arrived
            # since the previous flush, then start a fresh buffer.
            if chunk_count >= 20:
                yield gr.update(), pcm16_to_wav(audio_buffer.getvalue())
                audio_buffer.seek(0)
                audio_buffer.truncate(0)
                chunk_count = 0

    # Flush whatever audio is left once the stream ends.
    if audio_buffer.getbuffer().nbytes > 0:
        yield gr.update(), pcm16_to_wav(audio_buffer.getvalue())

def speech_to_text(audio, model):
    """Transcribe an audio file in a single, non-streaming request.

    Args:
        audio: Path of the audio file to transcribe.
        model: Transcription model identifier passed to
            ``litellm.transcription``.

    Returns:
        str: The transcript with surrounding whitespace removed.
    """
    # The context manager closes the file handle even if the request fails.
    with open(audio, "rb") as audio_file:
        res = litellm.transcription(
            file=audio_file,
            model=model,
            response_format="text",
            stream=False,
        )
    return res.text.strip()

def speech_to_text_stream(audio, model):
    """Transcribe an audio file with a streaming request.

    Args:
        audio: Path of the audio file to transcribe.
        model: Transcription model identifier passed to
            ``litellm.transcription``.

    Yields:
        str: The ``delta`` of each "transcript.text.delta" event, and the
        complete ``text`` of each "transcript.text.done" event.
    """
    # Keep the file open for the whole stream; the context manager closes it
    # when the generator finishes, fails, or is closed early.
    with open(audio, "rb") as audio_file:
        response = litellm.transcription(
            file=audio_file,
            model=model,
            response_format="json",
            stream=True,
        )
        # NOTE(review): assumes iterating the response yields 2-tuples whose
        # second item is a str of "data: "-prefixed JSON events separated by
        # blank lines - confirm against the litellm version in use.
        for _, chunk_data in response:
            if not chunk_data:
                continue
            chunk_lines = chunk_data.replace('data: ', '').strip().split('\r\n\r\n')
            for line in chunk_lines:
                try:
                    chunk_json = json.loads(line)
                except json.JSONDecodeError:
                    # Not a JSON event; ignore it.
                    continue
                # Ignore valid JSON that is not an event object.
                if not isinstance(chunk_json, dict):
                    continue
                event_type = chunk_json.get("type")
                if event_type == "transcript.text.delta":
                    yield chunk_json["delta"]
                elif event_type == "transcript.text.done":
                    yield chunk_json["text"]

In [None]:
def update_chatbot_msg(hist, msg):
    """Append ``msg`` to the history as a user message.

    Returns:
        tuple: The extended history and an empty string.
    """
    updated_history = add_to_history(hist, msg, "user")
    return updated_history, ''

def update_chatbot_audio(hist, audio, transcr_model):
    """Transcribe ``audio`` and append the transcript as a user message.

    Args:
        hist: Current chat history.
        audio: Path of the recorded audio file.
        transcr_model: Transcription model identifier.

    Returns:
        tuple: The extended history and None.
    """
    user_text = speech_to_text(audio, transcr_model)
    return add_to_history(hist, user_text, "user"), None

def update_chatbot_audio_stream(hist, audio, transcr_model):
    """Stream a transcription of ``audio`` as successive user messages.

    Each yielded history is ``hist`` plus a single user message holding only
    the most recent chunk from ``speech_to_text_stream``; chunks are not
    accumulated here.

    Yields:
        tuple: The extended history and None.
    """
    pieces = speech_to_text_stream(audio, transcr_model)
    yield from ((add_to_history(hist, piece, "user"), None) for piece in pieces)

In [None]:
with gr.Blocks() as demo:
    # Row 1: model selectors, each defaulting to the first choice in its list.
    with gr.Row():
        text_model = gr.Dropdown(
            choices=MODEL_CHOICES_TEXT,
            value=MODEL_CHOICES_TEXT[0],
            label="Text Model",
        )
        audio_model = gr.Dropdown(
            choices=MODEL_CHOICES_AUDIO,
            value=MODEL_CHOICES_AUDIO[0],
            label="Audio Model",
        )
        stt_model = gr.Dropdown(
            choices=MODEL_CHOICES_STT,
            value=MODEL_CHOICES_STT[0],
            label="Transcription Model",
        )
    # Row 2: conversation view.
    with gr.Row():
        chatbot = gr.Chatbot()
    # Row 3: typed input.
    with gr.Row():
        msg = gr.Textbox(label="Ask me a technical question")
    # Row 4: microphone input and spoken output.
    with gr.Row():
        audio_in = gr.Audio(sources=["microphone"], type="filepath", label="")
        audio_out = gr.Audio(autoplay=True, streaming=True)

    # Typed question: record the user turn, reset both audio widgets,
    # then stream the text model's reply into the chatbot.
    (
        msg.submit(update_chatbot_msg, inputs=[chatbot, msg], outputs=[chatbot, msg])
        .then(lambda: (None, None), outputs=[audio_in, audio_out])
        .then(chat, inputs=[text_model, chatbot], outputs=[chatbot])
    )

    # Spoken question: transcribe into a user turn, blank the textbox,
    # then stream the audio model's transcript and speech.
    (
        audio_in.stop_recording(
            update_chatbot_audio,
            inputs=[chatbot, audio_in, stt_model],
            outputs=[chatbot, audio_in],
        )
        .then(lambda: "", outputs=[msg])
        .then(chat_with_audio, inputs=[audio_model, chatbot], outputs=[chatbot, audio_out])
    )

demo.launch(
    inbrowser=True,
    share=False,
    debug=False,
)