In [8]:
import os
import io
import json
import base64
import asyncio
import websockets
from pydub import AudioSegment
import soundfile as sf
import gradio as gr
import nest_asyncio

# Allow asyncio to run in Jupyter Notebook
nest_asyncio.apply()

In [14]:
# OpenAI Realtime websocket: send one audio item and collect the spoken reply
async def connect_to_openai_websocket(audio_event, model="gpt-4o-realtime-preview-2024-10-01"):
    """Send one audio conversation item to the Realtime API and return the reply audio.

    Args:
        audio_event: JSON string of a ``conversation.item.create`` event.
        model: Realtime model name placed in the websocket URL query string.

    Returns:
        The concatenated decoded audio bytes of the response (the caller in this
        notebook interprets them as 16-bit, 24 kHz, mono PCM), or ``None`` when
        the response finishes or the socket closes without any audio.

    Raises:
        RuntimeError: if ``OPENAI_API_KEY`` is not set, or the server sends an
            ``error`` event.
    """
    # Never hard-code credentials in a notebook; read the key from the environment.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY environment variable is not set.")

    url = f"wss://api.openai.com/v1/realtime?model={model}"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "OpenAI-Beta": "realtime=v1",
    }

    # NOTE(review): newer websockets releases reportedly rename this keyword to
    # `additional_headers` — confirm against the installed version.
    async with websockets.connect(url, extra_headers=headers) as ws:
        print("Connected to server.")

        # Send the audio event
        await ws.send(audio_event)
        print("Audio event sent.")

        response_requested = False
        audio_chunks = []

        async for message in ws:
            event = json.loads(message)
            event_type = event.get('type')

            # Without this the loop would wait forever after a rejected request.
            if event_type == 'error':
                raise RuntimeError(f"Realtime API error: {event.get('error')}")

            if not response_requested:
                if event_type == 'conversation.item.created':
                    # Ask for a response exactly once; later item.created events
                    # (e.g. for the assistant's own item) must not re-trigger it.
                    await ws.send(json.dumps({"type": "response.create"}))
                    print("Response create command sent.")
                    response_requested = True
                continue

            if event_type == 'response.audio.delta':
                # Decode each delta on its own: joining the base64 text first
                # would stop decoding at the first chunk that ends in '=' padding.
                audio_chunks.append(base64.b64decode(event['delta']))
            elif event_type == 'response.audio.done':
                return b"".join(audio_chunks)
            elif event_type == 'response.done':
                # Response ended without an audio.done (e.g. failed or text-only).
                return b"".join(audio_chunks) if audio_chunks else None

    # Socket closed before the response completed.
    return None

In [10]:

# Convert NumPy audio data to WAV-format bytes
def numpy_to_audio_bytes(audio_np, sample_rate):
    """Serialize an audio array to an in-memory WAV file and return its bytes.

    Args:
        audio_np: Audio samples, passed straight to ``soundfile.write``.
        sample_rate: Sample rate in Hz recorded in the WAV header.

    Returns:
        The complete WAV file contents as ``bytes``.
    """
    wav_buffer = io.BytesIO()
    try:
        sf.write(wav_buffer, audio_np, samplerate=sample_rate, format='WAV')
        # getvalue() returns the whole buffer regardless of the stream position.
        return wav_buffer.getvalue()
    finally:
        wav_buffer.close()

# Build the conversation.item.create event for a recorded utterance
def audio_to_item_create_event(audio_data: tuple) -> str:
    """Wrap a recorded utterance in a ``conversation.item.create`` event.

    Args:
        audio_data: ``(sample_rate, samples)`` tuple, unpacked in that order.

    Returns:
        JSON string of the event, with the audio as base64-encoded headerless
        16-bit, 24 kHz, mono PCM.
    """
    sample_rate, audio_np = audio_data
    wav_bytes = numpy_to_audio_bytes(audio_np, sample_rate)

    # The Realtime API's default input format is raw PCM16 at 24 kHz mono, not a
    # WAV container at the microphone's native rate, so resample and drop the
    # header before encoding.
    # NOTE(review): assumes the WAV written above is integer PCM, which pydub can
    # parse natively; float WAVs may need ffmpeg — confirm with the recorder dtype.
    segment = AudioSegment.from_file(io.BytesIO(wav_bytes), format="wav")
    pcm_bytes = (
        segment
        .set_frame_rate(24000)
        .set_channels(1)
        .set_sample_width(2)
        .raw_data
    )
    pcm_base64 = base64.b64encode(pcm_bytes).decode('utf-8')

    event = {
        "type": "conversation.item.create",
        "item": {
            "type": "message",
            "role": "user",
            "content": [{
                "type": "input_audio",
                "audio": pcm_base64
            }]
        }
    }
    return json.dumps(event)


In [11]:

# Voice response handler
def voice_chat_response(audio_data, history):
    """Send the recorded audio to the Realtime API and return the reply as WAV.

    Args:
        audio_data: ``(sample_rate, samples)`` tuple from the recorder, or
            ``None`` when nothing was recorded.
        history: Conversation state; passed through unchanged.

    Returns:
        ``(wav_bytes, history)`` on success, or ``(None, history)`` when there
        is no input or no audio came back.
    """
    # Nothing recorded: avoid crashing on tuple unpacking further down.
    if audio_data is None:
        return None, history

    audio_event = audio_to_item_create_event(audio_data)
    request = connect_to_openai_websocket(audio_event)

    try:
        running_loop = asyncio.get_running_loop()
    except RuntimeError:
        running_loop = None

    if running_loop is not None:
        # Block until the coroutine finishes instead of merely scheduling it:
        # ensure_future() returns a Task, never the bytes checked below.
        # Re-entrant run_until_complete relies on nest_asyncio.apply() having
        # patched the loop earlier in the notebook.
        audio_response = running_loop.run_until_complete(request)
    else:
        audio_response = asyncio.run(request)

    if not isinstance(audio_response, bytes) or not audio_response:
        return None, history

    # The reply is treated as headerless 16-bit, 24 kHz, mono PCM; wrap it in a
    # WAV container before handing it back.
    audio_segment = AudioSegment.from_raw(
        io.BytesIO(audio_response),
        sample_width=2,
        frame_rate=24000,
        channels=1
    )

    with io.BytesIO() as buffered:
        audio_segment.export(buffered, format="wav")
        return buffered.getvalue(), history


In [12]:


# Gradio UI setup: one "VoiceChat" tab wiring the microphone to voice_chat_response
with gr.Blocks(title="OpenAI Realtime API") as demo:
    gr.Markdown("<h1 style='text-align: center;'>OpenAI Realtime API</h1>")

    with gr.Tab("VoiceChat"):
        gr.Markdown("음성으로 실시간 대화를 해보세요")

        # Microphone input delivered as NumPy data; voice_chat_response's helper
        # unpacks it as (sample_rate, samples).
        audio_input = gr.Audio(
            label="Record your voice",
            sources="microphone",
            type="numpy",
            render=True
        )
        
        # Plays the returned audio automatically.
        audio_output = gr.Audio(
            autoplay=True,
            render=True
        )
        
        # Conversation state, initialised to an empty list and passed through
        # the handler as both input and output.
        history_state = gr.State([])

        # Connect the components above to the handler.
        gr.Interface(
            fn=voice_chat_response,
            inputs=[audio_input, history_state],
            outputs=[audio_output, history_state]
        )

In [15]:
# Start the app; share=True also exposes it on a temporary public gradio.live URL.
demo.launch(share=True) 

Rerunning server... use `close()` to stop if you need to change `launch()` parameters.
----
* Running on public URL: https://1681eceb03f3159f56.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Connected to server.
Audio event sent.
Response create command sent.
Connected to server.
Audio event sent.
Response create command sent.
