## Azure Speech Studio 리소스를 활용하여, STT와 TTS 서비스 만들기

requests 관련해서는 postman의 AISpeech-STT와 AISpeech-TTS 참고

In [None]:
import requests
import gradio as gr
import re
import os
from dotenv import load_dotenv

#### 1. postman에서 진행했던 내용 python 코드로 재작성하기(STT, TTS)

#### STT

In [None]:
load_dotenv()
SPEECH_ENDPOINT = os.getenv("SPEECH_ENDPOINT")
SPEECH_APIKEY = os.getenv("SPEECH_APIKEY")

# stt = speech to text
def request_stt(file_path):
    endpoint = SPEECH_ENDPOINT
    headers = {
        "Ocp-Apim-Subscription-Key":SPEECH_APIKEY,
        "Content-Type":"audio/wav"
    }

    with open(file_path, "rb") as audio:
        audio_data = audio.read()

    response = requests.post(endpoint, headers=headers, data=audio_data)
    # print(response.status_code, response.text)

    if response.status_code == 200:
        response_json = response.json()
        text = response_json['DisplayText']
        return text
    else:
        return ""
    

result = request_stt("0318_AIspeech_python/audio1.wav")
# print(result)

with gr.Blocks() as demo:
    gr.Markdown("# AI Speech World!")

    def change_audio(audio_path):
        if audio_path:
            text = request_stt(audio_path)
            return text
        else:
            return ""
    gr.Markdown("### STT ###")
    with gr.Row():
        with gr.Column():
            input_mic = gr.Audio(label="마이크 입력", sources="microphone", type="filepath", show_download_button=True)
        with gr.Column():
            output_textbox = gr.Textbox(label="텍스트", interactive=False)
            # change 이벤트 추가
            input_mic.change(change_audio, inputs=[input_mic], outputs=[output_textbox])


demo.launch()

* Running on local URL:  http://127.0.0.1:7869

To create a public link, set `share=True` in `launch()`.




#### TTS
이후, STT와 TTS를 합칠 예정

In [None]:
SPEECH_ENDPOINT_TTS = os.getenv("SPEECH_ENDPOINT_TTS")

def request_tts(text):

    file_name = "tts_response_audio.wav"

    endpoint = SPEECH_ENDPOINT_TTS
    headers = {
        "Ocp-Apim-Subscription-Key":SPEECH_APIKEY,
        "Content-Type":"application/ssml+xml",
        "X-Microsoft-OutputFormat":"riff-44100hz-16bit-mono-pcm"
    }

    body = f"""
            <speak version='1.0' xml:lang='ko-kr'>
            <voice name='ko-KR-JiMinNeural'>
                <prosody rate="0%">
                    {text}
                </prosody>
            </voice>
        </speak>
    """

    response = requests.post(endpoint, headers=headers, data=body)
    # print(response.status_code, response.text)

    if response.status_code == 200:
        with open(file_name, "wb") as audio_file:
            audio_file.write(response.content)
        return file_name
    else:
        return None

with gr.Blocks() as demo2:

    def click_send(text):
        file_path = request_tts(text)
        if file_path:
            return file_path
        else:
            return None
        
    tts_textbox = gr.Textbox(label="입력", placeholder="음성 변환할 텍스트를 입력하세요.")
    send_tts_button = gr.Button("전송")

    output_tts_audio = gr.Audio(interactive=False, autoplay=True)
    send_tts_button.click(fn=click_send, inputs=[tts_textbox], outputs=[output_tts_audio])

# request_tts("안녕 반가워.")
demo2.launch()

* Running on local URL:  http://127.0.0.1:7888

To create a public link, set `share=True` in `launch()`.




#### 2. STT + TTS + 챗봇 합치기(실습)

In [None]:
# stt = speech to text
def request_stt(file_path):
    endpoint = SPEECH_ENDPOINT
    headers = {
        "Ocp-Apim-Subscription-Key":SPEECH_APIKEY,
        "Content-Type":"audio/wav"
    }

    with open(file_path, "rb") as audio:
        audio_data = audio.read()

    response = requests.post(endpoint, headers=headers, data=audio_data)
    # print(response.status_code, response.text)

    if response.status_code == 200:
        response_json = response.json()
        text = response_json['DisplayText']
        return text
    else:
        return ""
    

def request_tts(text):

    file_name = "tts_response_audio.wav"

    endpoint = SPEECH_ENDPOINT_TTS
    headers = {
        "Ocp-Apim-Subscription-Key":SPEECH_APIKEY,
        "Content-Type":"application/ssml+xml",
        "X-Microsoft-OutputFormat":"riff-44100hz-16bit-mono-pcm"
    }

    body = f"""
            <speak version='1.0' xml:lang='ko-kr'>
            <voice name='ko-KR-JiMinNeural'>
                <prosody rate="0%">
                    {text}
                </prosody>
            </voice>
        </speak>
    """

    response = requests.post(endpoint, headers=headers, data=body)
    # print(response.status_code, response.text)

    if response.status_code == 200:
        with open(file_name, "wb") as audio_file:
            audio_file.write(response.content)
        return file_name
    else:
        return None
    
def change_audio(audio_path):
    if audio_path:
        text = request_stt(audio_path)
        return text
    else:
        return ""
    
def click_send(text):
    file_path = request_tts(text)
    if file_path:
        return file_path
    else:
        return None

with gr.Blocks() as demo3:
    gr.Markdown("# AI Speech World!")
    gr.Markdown("### STT ###")
    with gr.Row():
        with gr.Column():
            input_mic = gr.Audio(label="마이크 입력", sources="microphone", type="filepath", show_download_button=True)
        with gr.Column():
            output_textbox = gr.Textbox(label="텍스트", interactive=False)
            # change 이벤트 추가
            input_mic.change(change_audio, inputs=[input_mic], outputs=[output_textbox])
    
    gr.Markdown("### TTS ###")
    with gr.Row():
        with gr.Column():
            tts_textbox = gr.Textbox(label="입력", placeholder="음성 변환할 텍스트를 입력하세요.")
            send_tts_button = gr.Button("전송")
        with gr.Column():
            output_tts_audio = gr.Audio(interactive=False, autoplay=True)
            send_tts_button.click(fn=click_send, inputs=[tts_textbox], outputs=[output_tts_audio])  # 버튼 클릭 이벤트 추가

demo3.launch()

    

* Running on local URL:  http://127.0.0.1:7872

To create a public link, set `share=True` in `launch()`.




#### 3. 최종 챗봇까지 붙여서 전부 합치기

In [21]:
from __future__ import annotations
from typing import Iterable
from gradio.themes.base import Base
from gradio.themes.utils import colors, fonts, sizes

class Seafoam(Base):
    def __init__(
        self,
        *,
        primary_hue: colors.Color | str = colors.blue,
        secondary_hue: colors.Color | str = colors.emerald,
        neutral_hue: colors.Color | str = colors.sky,
    ):
        super().__init__(
            primary_hue=primary_hue,
            secondary_hue=secondary_hue,
            neutral_hue=neutral_hue,
        )
        super().set(
            body_background_fill="repeating-linear-gradient(45deg, *primary_200, *primary_200 10px, *primary_50 10px, *primary_50 20px)",
            body_background_fill_dark="repeating-linear-gradient(45deg, *primary_800, *primary_800 10px, *primary_900 10px, *primary_900 20px)",
            button_primary_background_fill="linear-gradient(90deg, *primary_300, *secondary_400)",
            button_primary_background_fill_hover="linear-gradient(90deg, *primary_200, *secondary_300)",
            button_primary_text_color="white",
            button_primary_background_fill_dark="linear-gradient(90deg, *primary_600, *secondary_800)",
            slider_color="*secondary_300",
            slider_color_dark="*secondary_600",
            block_title_text_weight="600",
            block_border_width="3px",
            block_shadow="*shadow_drop_lg",
            button_primary_shadow="*shadow_drop_lg",
            button_large_padding="32px",
        )

In [None]:
OPENAI_ENDPOINT = os.getenv("OPENAI_ENDPOINT")
OPENAI_APIKEY = os.getenv("OPENAI_APIKEY")

# gpt-openai 챗봇
def request_gpt(prompt):
    endpoint = OPENAI_ENDPOINT
    headers = {
        "Content-Type": "application/json",
        "api-key": OPENAI_APIKEY
    }

    body = {
        "messages": [
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": "너는 나를 도와주는 도우미야."
                    }
                ]
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt
                    }
                ]
            }
        ],
        "temperature": 0.7,
        "top_p": 0.9,
        "max_tokens": 800
    }

    response = requests.post(endpoint, headers=headers, json=body)
    if response.status_code == 200:        
        response_json = response.json()
        message = response_json['choices'][0]['message']
        content = message['content']
        return content
    else:
        return ""
    
# request_gpt("오늘 날씨 알려줘.")

# stt = speech to text
def request_stt(file_path):
    endpoint = SPEECH_ENDPOINT
    headers = {
        "Ocp-Apim-Subscription-Key":SPEECH_APIKEY,
        "Content-Type":"audio/wav"
    }

    with open(file_path, "rb") as audio:
        audio_data = audio.read()

    response = requests.post(endpoint, headers=headers, data=audio_data)
    # print(response.status_code, response.text)

    if response.status_code == 200:
        response_json = response.json()
        text = response_json['DisplayText']
        return text
    else:
        return ""

def request_tts(text):

    file_name = "tts_response_audio.wav"

    endpoint = SPEECH_ENDPOINT_TTS
    headers = {
        "Ocp-Apim-Subscription-Key":SPEECH_APIKEY,
        "Content-Type":"application/ssml+xml",
        "X-Microsoft-OutputFormat":"riff-44100hz-16bit-mono-pcm"
    }

    body = f"""
            <speak version='1.0' xml:lang='ko-kr'>
            <voice name='ko-KR-JiMinNeural'>
                <prosody rate="0%">
                    {text}
                </prosody>
            </voice>
        </speak>
    """

    response = requests.post(endpoint, headers=headers, data=body)
    # print(response.status_code, response.text)

    if response.status_code == 200:
        with open(file_name, "wb") as audio_file:
            audio_file.write(response.content)
        return file_name
    else:
        return None
    
def change_audio(audio_path):
    if audio_path:
        text = request_stt(audio_path)
        return text
    else:
        return ""
    
def click_send(text):
    file_path = request_tts(text)
    if file_path:
        return file_path
    else:
        return None

def click_gpt_send(prompt, histories):
    content = request_gpt(prompt)
    histories.append({"role":"user", "content":prompt})
    if content:
        histories.append({"role":"assistant", "content":content})
    else:
        histories.append({"role":"assistant", "content":"응답을 받지 못했습니다"})
    return "", histories

def change_chatbot(histories):
    history = histories[-1]
    content = history['content']
    pattern = r'^가가-힣a-zA-Z0-9\s'
    cleaned_content = re.sub(pattern, '', content)
    audio_path = request_tts(cleaned_content)
    return audio_path

seafoam = Seafoam()

with gr.Blocks(theme=seafoam) as demo_chat:
    gr.Markdown("# AI Speech World!")
    
    with gr.Row():
        #좌측
        with gr.Column(scale=4):
            chatbot = gr.Chatbot(type="messages")
            with gr.Row():
                prompt = gr.Textbox(label="프롬프트", scale=6)
                send_gpt_button = gr.Button("전송", scale=1)
                prompt.submit(click_gpt_send, inputs=[prompt, chatbot], outputs=[prompt, chatbot])

            gpt_audio = gr.Audio(interactive=False, autoplay=True)

        #우측
        with gr.Column(scale=1):
            #STT
            with gr.Column():
                gr.Markdown("### STT ###")
                input_mic = gr.Audio(label="마이크 입력", sources="microphone", type="filepath", show_download_button=True)
                output_textbox = gr.Textbox(label="텍스트", interactive=False)
                        
            with gr.Column():
                gr.Markdown("### TTS ###")
                tts_textbox = gr.Textbox(label="입력", placeholder="음성 변환할 텍스트를 입력하세요.")
                send_tts_button = gr.Button("전송")
                output_tts_audio = gr.Audio(interactive=False, autoplay=True)

    send_tts_button.click(fn=click_send, inputs=[tts_textbox], outputs=[output_tts_audio])
    send_gpt_button.click(click_gpt_send, inputs=[prompt, chatbot], outputs=[prompt, chatbot])
    input_mic.change(change_audio, inputs=[input_mic], outputs=[prompt])
    chatbot.change(change_chatbot, inputs=[chatbot], outputs=[gpt_audio])

demo_chat.launch()

* Running on local URL:  http://127.0.0.1:7877

To create a public link, set `share=True` in `launch()`.




#### 4. 커스텀 진행(개별적으로 수행)
STT에서는 마이크 외에 외부 파일을 입력받아 텍스트로 출력.
TTS에서 다양한 목소리 및 언어를 입력받아 읽어주는 내용 추가.

In [46]:
# 언어 및 목소리 선택 목록
languages = {
  "아프리칸스어(af-ZA)": "af-ZA",
  "알바니아어(sq-AL)": "sq-AL",
  "암하릭어(am-ET)": "am-ET",
  "아랍어(ar-EG)": "ar-EG",
  "아르메니아어(hy-AM)": "hy-AM",
  "아제르바이잔어(az-AZ)": "az-AZ",
  "인도네시아어(id-ID)": "id-ID",
  "벵골어(bn-BD)": "bn-BD",
  "바스크어(eu-ES)": "eu-ES",
  "벵골어(bn-IN)": "bn-IN",
  "보스니아어(bs-BA)": "bs-BA",
  "불가리아어(bg-BG)": "bg-BG",
  "버마어(my-MM)": "my-MM",
  "카탈로니아어(ca-ES)": "ca-ES",
  "중국어 광둥어(zh-HK)": "zh-HK",
  "중국어 북경어(zh-CN)": "zh-CN",
  "중국어 대만어(zh-TW)": "zh-TW",
  "크로아티아어(hr-HR)": "hr-HR",
  "체코어(cs-CZ)": "cs-CZ",
  "덴마크어(da-DK)": "da-DK",
  "네덜란드어(nl-NL)": "nl-NL",
  "영어(en-US)": "en-US",
  "에스토니아어(et-EE)": "et-EE",
  "필리핀어(fil-PH)": "fil-PH",
  "핀란드어(fi-FI)": "fi-FI",
  "프랑스어(fr-FR)": "fr-FR",
  "갈리시아어(gl-ES)": "gl-ES",
  "조지아어(ka-GE)": "ka-GE",
  "독일어(de-DE)": "de-DE",
  "그리스어(el-GR)": "el-GR",
  "히브리어(he-IL)": "he-IL",
  "힌디어(hi-IN)": "hi-IN",
  "헝가리어(hu-HU)": "hu-HU",
  "아이슬란드어(is-IS)": "is-IS",
  "아일랜드어(ga-IE)": "ga-IE",
  "이탈리아어(it-IT)": "it-IT",
  "일본어(ja-JP)": "ja-JP",
  "자바어(jv-ID)": "jv-ID",
  "칸나다어(kn-IN)": "kn-IN",
  "카자흐어(kk-KZ)": "kk-KZ",
  "크메르어(km-KH)": "km-KH",
  "한국어(ko-KR)": "ko-KR",
  "라오어(lo-LA)": "lo-LA",
  "라트비아어(lv-LV)": "lv-LV",
  "리투아니아어(lt-LT)": "lt-LT",
  "마케도니아어(mk-MK)": "mk-MK",
  "말레이어(ms-MY)": "ms-MY",
  "말라얄람어(ml-IN)": "ml-IN",
  "몰타어(mt-MT)": "mt-MT",
  "몽골어(mn-MN)": "mn-MN",
  "네팔어(ne-NP)": "ne-NP",
  "노르웨이어 보크말(nb-NO)": "nb-NO",
  "파슈토어(ps-AF)": "ps-AF",
  "페르시아어(fa-IR)": "fa-IR",
  "폴란드어(pl-PL)": "pl-PL",
  "포르투갈어(pt-BR)": "pt-BR",
  "루마니아어(ro-RO)": "ro-RO",
  "러시아어(ru-RU)": "ru-RU",
  "세르비아어(sr-RS)": "sr-RS",
  "싱할라어(si-LK)": "si-LK",
  "슬로바키아어(sk-SK)": "sk-SK",
  "슬로베니아어(sl-SI)": "sl-SI",
  "소말리아어(so-SO)": "so-SO",
  "스페인어(es-ES)": "es-ES",
  "순다어(su-ID)": "su-ID",
  "스와힐리어(sw-KE)": "sw-KE",
  "스웨덴어(sv-SE)": "sv-SE",
  "타밀어(ta-IN)": "ta-IN",
  "텔루구어(te-IN)": "te-IN",
  "태국어(th-TH)": "th-TH",
  "터키어(tr-TR)": "tr-TR",
  "우크라이나어(uk-UA)": "uk-UA",
  "우르두어(ur-PK)": "ur-PK",
  "우즈베크어(uz-UZ)": "uz-UZ",
  "베트남어(vi-VN)": "vi-VN",
  "웨일스어(cy-GB)": "cy-GB",
  "줄루어(zu-ZA)": "zu-ZA"
}

voices = {
    "en-US-AdamMultilingualNeural (남성)":"en-US-AdamMultilingualNeural",
    "en-US-NovaTurboMultilingualNeural (여성)":"en-US-FableTurboMultilingualNeural",
    "en-US-FableTurboMultilingualNeural (중립)":"en-US-NovaTurboMultilingualNeural",
    "en-GB-AdaMultilingualNeural (여성)":"en-GB-AdaMultilingualNeural",
    "en-GB-OllieMultilingualNeural (남성)":"en-GB-OllieMultilingualNeural",
    "de-de-SeraphinaMultilingualNeural (여성)":"de-de-SeraphinaMultilingualNeural",
    "de-de-FlorianMultilingualNeural (남성)":"de-de-FlorianMultilingualNeural",
    "es-es-ArabellaMultilingualNeural (여성)":"es-ES-ArabellaMultilingualNeural",
    "es-es-IsidoraMultilingualNeural (여성)":"es-ES-IsidoraMultilingualNeural",
    "fr-fr-LucienMultilingualNeural (남성)":"fr-FR-LucienMultilingualNeural",
    "fr-fr-VivienneMultilingualNeural (여성)":"fr-FR-VivienneMultilingualNeural",
    "it-it-AlessioMultilingualNeural (남성)":"it-IT-AlessioMultilingualNeural",
    "it-it-GiuseppeMultilingualNeural (남성)":"it-IT-GiuseppeMultilingualNeural",
    "ja-JP-MasaruMultilingualNeural (남성)":"ja-JP-MasaruMultilingualNeural",
    "ko-KR-HyunsuMultilingualNeural (남성)":"ko-KR-HyunsuMultilingualNeural",
    "pt-BR-MacerioMultilingualNeural (남성)":"pt-BR-MacerioMultilingualNeural",
    "pt-BR-ThalitaMultilingualNeural (여성)":"pt-BR-ThalitaMultilingualNeural",
    "zh-CN-XiaoxiaoMultilingualNeural (여성)":"zh-CN-XiaoxiaoMultilingualNeural",
    "zh-CN-YunxiaoMultilingualNeural (남성)":"zh-CN-YunxiaoMultilingualNeural"
}

In [None]:
# STT 요청 함수
def request_stt(file_path, language):
    endpoint = f"https://eastus.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1?language={language}&format=detailed"
    headers = {
        "Ocp-Apim-Subscription-Key": "89NBwuM3CsNh2D7A2h7RDfJmPP0MTXiabbRrSe3RlHx1e7gIrqNTJQQJ99BCACYeBjFXJ3w3AAAYACOGnVCA",
        "Content-Type": "audio/wav"
    }

    with open(file_path, "rb") as audio:
        audio_data = audio.read()

    response = requests.post(endpoint, headers=headers, data=audio_data)

    if response.status_code == 200:
        response_json = response.json()
        return response_json.get("DisplayText", "인식 실패")
    else:
        return "오류 발생: STT 변환 실패"

# TTS 요청 함수
def request_tts(text, voice, language, speed):
    endpoint = "https://eastus.tts.speech.microsoft.com/cognitiveservices/v1"
    headers = {
        "Ocp-Apim-Subscription-Key": "89NBwuM3CsNh2D7A2h7RDfJmPP0MTXiabbRrSe3RlHx1e7gIrqNTJQQJ99BCACYeBjFXJ3w3AAAYACOGnVCA",
        "Content-Type": "application/ssml+xml",
        "X-Microsoft-OutputFormat": "riff-44100hz-16bit-mono-pcm"
    }

    body = f"""
        <speak version='1.0' xml:lang='{language}'>
            <voice name='{voice}'>
                <prosody rate="{speed}%">
                    {text}
                </prosody>
            </voice>
        </speak>
    """

    response = requests.post(endpoint, headers=headers, data=body)
    return response.content if response.status_code == 200 else None

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# 🎙️ AI Speech World - STT & TTS")

    # STT 영역
    gr.Markdown("## 🗣️ 음성 → 텍스트 (STT)")
    with gr.Row():
        with gr.Column():
            input_mic = gr.Audio(label="마이크 입력", sources="microphone", type="filepath", show_download_button=True)
            input_file = gr.File(label="음성 파일 업로드 (.wav)", type="filepath")
        with gr.Column():
            output_text = gr.Textbox(label="변환된 텍스트", interactive=False)

    def process_stt(audio_path, file_obj):
        file_path = audio_path if audio_path else (file_obj if file_obj else None)
        return request_stt(file_path) if file_path else "파일을 입력하세요."

    input_mic.change(process_stt, inputs=[input_mic, input_file], outputs=[output_text])
    input_file.change(process_stt, inputs=[input_mic, input_file], outputs=[output_text])

    # TTS 영역
    gr.Markdown("## 🔊 텍스트 → 음성 (TTS)")
    with gr.Row():
        tts_text = gr.Textbox(label="변환할 텍스트 입력", placeholder="음성으로 변환할 문장을 입력하세요.")
        voice_dropdown = gr.Dropdown(choices=list(voices.keys()), label="목소리 선택", value="en-US-AdamMultilingualNeural (남성)")
        language_dropdown_tts = gr.Dropdown(choices=list(languages.keys()), label="언어 선택", value="영어(en-US)")
        speed_slider = gr.Slider(minimum=-100, maximum=100, step=10, value=0, label="말하는 속도 (%)")
    
    output_audio = gr.Audio(label="TTS 음성 출력", interactive=False)
    tts_button = gr.Button("음성 변환")

    def process_tts(text, voice, language, speed):
        voice_code = voices.get(voice, "en-US-AdamMultilingualNeural")
        lang_code = languages.get(language, "ko-KR")
        audio_data = request_tts(text, voice_code, lang_code, speed)
        return audio_data if audio_data else "TTS 변환 실패"

    tts_button.click(process_tts, inputs=[tts_text, voice_dropdown, language_dropdown_tts, speed_slider], outputs=[output_audio])

demo.launch()


* Running on local URL:  http://127.0.0.1:7889

To create a public link, set `share=True` in `launch()`.




Traceback (most recent call last):
  File "c:\Users\jooeu\anaconda3\envs\ms-ai\Lib\site-packages\gradio\queueing.py", line 625, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\jooeu\anaconda3\envs\ms-ai\Lib\site-packages\gradio\route_utils.py", line 322, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\jooeu\anaconda3\envs\ms-ai\Lib\site-packages\gradio\blocks.py", line 2103, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\jooeu\anaconda3\envs\ms-ai\Lib\site-packages\gradio\blocks.py", line 1650, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\jooeu\anaconda3\envs\ms-ai\Lib\site-packages\anyio\to_thread.py", line 56, in run_sync
    return

In [53]:
import requests
import gradio as gr

# STT 요청 함수
def request_stt(file_path):
    endpoint = "https://eastus.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1?format=detailed"
    headers = {
        "Ocp-Apim-Subscription-Key": "89NBwuM3CsNh2D7A2h7RDfJmPP0MTXiabbRrSe3RlHx1e7gIrqNTJQQJ99BCACYeBjFXJ3w3AAAYACOGnVCA",
        "Content-Type": "audio/wav"
    }

    with open(file_path, "rb") as audio:
        audio_data = audio.read()

    response = requests.post(endpoint, headers=headers, data=audio_data)

    if response.status_code == 200:
        response_json = response.json()
        return response_json.get("DisplayText", "인식 실패")
    else:
        return "오류 발생: STT 변환 실패"

# TTS 요청 함수
def request_tts(text, voice, speed):
    endpoint = "https://eastus.tts.speech.microsoft.com/cognitiveservices/v1"
    headers = {
        "Ocp-Apim-Subscription-Key": "89NBwuM3CsNh2D7A2h7RDfJmPP0MTXiabbRrSe3RlHx1e7gIrqNTJQQJ99BCACYeBjFXJ3w3AAAYACOGnVCA",
        "Content-Type": "application/ssml+xml",
        "X-Microsoft-OutputFormat": "riff-44100hz-16bit-mono-pcm"
    }

    body = f"""
        <speak version='1.0'>
            <voice name='{voice}'>
                <prosody rate="{speed}%">
                    {text}
                </prosody>
            </voice>
        </speak>
    """

    response = requests.post(endpoint, headers=headers, data=body)
    return response.content if response.status_code == 200 else None

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# 🎙️ AI Speech World - STT & TTS")

    # STT 영역
    gr.Markdown("## 🗣️ 음성 → 텍스트 (STT)")
    with gr.Row():
        with gr.Column():
            input_mic = gr.Audio(label="마이크 입력", sources="microphone", type="filepath", show_download_button=True)
            input_file = gr.File(label="음성 파일 업로드 (.wav)", type="filepath")
        with gr.Column():
            output_text = gr.Textbox(label="변환된 텍스트", interactive=False)

    def process_stt(audio_path, file_obj):
        file_path = audio_path if audio_path else (file_obj if file_obj else None)
        return request_stt(file_path) if file_path else "파일을 입력하세요."

    input_mic.change(process_stt, inputs=[input_mic, input_file], outputs=[output_text])
    input_file.change(process_stt, inputs=[input_mic, input_file], outputs=[output_text])

    # TTS 영역
    gr.Markdown("## 🔊 텍스트 → 음성 (TTS)")
    with gr.Row():
        tts_text = gr.Textbox(label="변환할 텍스트 입력", placeholder="음성으로 변환할 문장을 입력하세요.")
        voice_dropdown = gr.Dropdown(choices=list(voices.keys()), label="목소리 선택", value="en-US-AdamMultilingualNeural (남성)")
        speed_slider = gr.Slider(minimum=-100, maximum=100, step=10, value=0, label="말하는 속도 (%)")
    
    output_audio = gr.Audio(label="TTS 음성 출력", interactive=False)
    tts_button = gr.Button("음성 변환")

    def process_tts(text, voice, speed):
        voice_code = voices.get(voice, "en-US-AdamMultilingualNeural")
        audio_data = request_tts(text, voice_code, speed)
        return audio_data if audio_data else "TTS 변환 실패"

    tts_button.click(process_tts, inputs=[tts_text, voice_dropdown, speed_slider], outputs=[output_audio])

demo.launch()


* Running on local URL:  http://127.0.0.1:7890

To create a public link, set `share=True` in `launch()`.


