Skip to content

jmerelnyc/code-stream

Repository files navigation

code-stream

Gradio-based streaming interface for AI-powered code assistance

pip install gradio openai
import gradio as gr
from openai import OpenAI

# Reads the OPENAI_API_KEY environment variable (see notes below).
client = OpenAI()

def stream_code(prompt, history):
    """Stream an assistant reply for *prompt*, growing token by token.

    Args:
        prompt: The user's latest message.
        history: Prior conversation turns. With ``type="messages"`` (as
            configured on the ChatInterface below) Gradio supplies a list of
            ``{"role", "content"}`` dicts; older Gradio versions supplied
            ``(user, assistant)`` tuples — both shapes are accepted.

    Yields:
        str: The accumulated assistant response after each streamed chunk.
    """
    messages = [{"role": "system", "content": "You are a code assistant."}]
    for item in history:
        if isinstance(item, dict):
            # messages-format history (matches type="messages" below)
            messages.append({"role": item["role"], "content": item["content"]})
        else:
            # legacy tuple-format history: (user_message, assistant_message)
            user_msg, assistant_msg = item
            messages.append({"role": "user", "content": user_msg})
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": prompt})

    response = ""
    for chunk in client.chat.completions.create(
        model="gpt-4",
        messages=messages,
        stream=True
    ):
        delta = chunk.choices[0].delta.content
        if delta:
            response += delta
            yield response

# type="messages" makes Gradio pass history as a list of
# {"role", "content"} dicts rather than (user, assistant) tuples.
demo = gr.ChatInterface(
    stream_code,
    type="messages",
    title="code-stream"
)

demo.launch()

Notes

Streams tokens as they arrive instead of waiting for the complete response. Works with any OpenAI-compatible endpoint.

Set the OPENAI_API_KEY environment variable before running. Add share=True to .launch() for a public link.

Customize the system prompt, add code execution or syntax highlighting, or extend it however you like — it's just Gradio and an LLM client.

# custom endpoint, streaming with context
import gradio as gr
from openai import OpenAI

# Point the OpenAI client at a local OpenAI-compatible server
# (e.g. vLLM / llama.cpp); the key is unused by such servers.
client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="none"
)

def stream_with_context(prompt, history, temperature, max_tokens):
    """Stream a reply and yield the updated chat history in messages format.

    Args:
        prompt: The user's latest message.
        history: Prior turns. ``gr.Chatbot(type="messages")`` supplies a
            list of ``{"role", "content"}`` dicts; legacy
            ``(user, assistant)`` tuples are also accepted.
        temperature: Sampling temperature passed through to the model.
        max_tokens: Completion-length cap passed through to the model.

    Yields:
        list[dict]: The full conversation (history + new user turn +
        partial assistant turn) so the Chatbot component can render it
        directly — a bare string would not display in a Chatbot output.
    """
    messages = [{"role": "system", "content": "Expert in Python, Rust, Go."}]

    for item in history:
        if isinstance(item, dict):
            # messages-format history (matches type="messages" above)
            messages.append({"role": item["role"], "content": item["content"]})
        else:
            # legacy tuple-format history: (user_message, assistant_message)
            user_msg, assistant_msg = item
            messages.append({"role": "user", "content": user_msg})
            messages.append({"role": "assistant", "content": assistant_msg})

    messages.append({"role": "user", "content": prompt})

    full_response = ""
    for chunk in client.chat.completions.create(
        model="codellama-34b",
        messages=messages,
        stream=True,
        temperature=temperature,
        max_tokens=max_tokens
    ):
        delta = chunk.choices[0].delta.content
        if delta:
            full_response += delta
            # messages[1:] is the normalized history plus the new user turn
            # (system prompt excluded); append the partial assistant reply.
            yield messages[1:] + [
                {"role": "assistant", "content": full_response}
            ]

with gr.Blocks() as demo:
    # type="messages": the chatbot holds {"role", "content"} dicts, and that
    # same value is passed to stream_with_context as its history argument.
    chatbot = gr.Chatbot(type="messages")
    msg = gr.Textbox(placeholder="describe what you need...")
    temp = gr.Slider(0, 2, value=0.7, label="temperature")
    tokens = gr.Slider(128, 4096, value=2048, step=128, label="max_tokens")
    
    # Each yielded value from the generator replaces the chatbot contents,
    # producing the token-by-token streaming effect.
    msg.submit(
        stream_with_context,
        [msg, chatbot, temp, tokens],
        chatbot
    )

demo.launch(server_port=7860)

MIT

About

Gradio-based streaming interface for AI-powered code assistance

Topics

Resources

License

Stars

Watchers

Forks

Releases

No releases published

Packages

 
 
 

Contributors