In [1]:
#!pip install fastapi uvicorn[standard] whisper transformers cosyvoice torchaudio nest-asyncio pyngrok

In [None]:
# Step 0: Install required packages (run once)
# !pip install openai-whisper transformers torch torchaudio --quiet
# Note: Skip CozyVoice for now, we'll use a placeholder

In [4]:
# Issue: echo user's content

#Version 3: change to use tiny model with test.wav

# Step 1: Imports
import whisper
from transformers import pipeline
import torchaudio

# Step 2: ASR - Whisper Tiny
asr_model = whisper.load_model("tiny")  # lightweight

def transcribe_audio(audio_bytes):
    with open("temp.wav", "wb") as f:
        f.write(audio_bytes)
    result = asr_model.transcribe("temp.wav")
    return result["text"]

# Test ASR
#with open("test.wav", "rb") as f:
#    audio_bytes = f.read()

user_text = transcribe_audio(audio_bytes)
print("Transcribed Text:", user_text)

# Step 3: LLM - lightweight text generation
llm = pipeline("text-generation", model="EleutherAI/gpt-neo-125M")  # tiny model
conversation_history = []

def generate_response(user_text):
    conversation_history.append({"role": "user", "text": user_text})
    prompt = ""
    for turn in conversation_history[-5:]:
        prompt += f"{turn['role']}: {turn['text']}\n"
    outputs = llm(prompt, max_new_tokens=50)
    bot_response = outputs[0]["generated_text"]
    conversation_history.append({"role": "assistant", "text": bot_response})
    return bot_response

bot_text = generate_response(user_text)
print("Bot Response:", bot_text)

# Step 4: TTS placeholder
# Since CozyVoice is heavy, we can generate a simple TTS using torchaudio + pre-recorded voice, or just save text
def synthesize_speech(text):
    # placeholder: save text to a file (or use simple pyttsx3 for local TTS)
    audio_path = "response.wav"
    with open("response.txt", "w", encoding="utf-8") as f:
        f.write(text)
    return audio_path

audio_path = synthesize_speech(bot_text)
print("Audio saved at:", audio_path)


Transcribed Text:  Hello, where's the capital of Canada?


Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Bot Response: user:  Hello, where's the capital of Canada?
                                                  
Audio saved at: response.wav


In [2]:
# Good
# model="EleutherAI/gpt-neo-1.3B",
# model="EleutherAI/gpt-neo-125M",

# Step 1: Imports
import whisper
from transformers import pipeline
import torchaudio

asr_model = whisper.load_model("small")  # lightweight

def transcribe_audio(audio_bytes):
    with open("temp.wav", "wb") as f:
        f.write(audio_bytes)
    result = asr_model.transcribe("temp.wav")
    return result["text"]

# Test ASR
with open("test.wav", "rb") as f:
   audio_bytes = f.read()

user_text = transcribe_audio(audio_bytes)
print("Transcribed Text:", user_text)

from transformers import pipeline

# Use FLAN-T5 instead of GPT-Neo
# small = faster but weaker; base = better but slower
llm = pipeline(
    "text2text-generation",
    model="google/flan-t5-large",
    device=-1  # -1 = CPU; change to 0 for GPU
)

conversation_history = []

def generate_response(user_text):
    conversation_history.append({"role": "user", "text": user_text})

    # Build concise prompt for instruction-tuned model
    prompt = "You are a friendly, factual assistant.\n"
    for turn in conversation_history[-5:]:
        prompt += f"User: {turn['text']}\n"
    prompt += "Assistant:"

    outputs = llm(prompt, max_new_tokens=150)
    bot_response = outputs[0]["generated_text"].strip()
    conversation_history.append({"role": "assistant", "text": bot_response})
    return bot_response

bot_text = generate_response(user_text)
print("Bot Response:", bot_text)

# Step 4: TTS placeholder
# Since CozyVoice is heavy, we can generate a simple TTS using torchaudio + pre-recorded voice, or just save text
def synthesize_speech(text):
    # placeholder: save text to a file (or use simple pyttsx3 for local TTS)
    audio_path = "response.wav"
    with open("response.txt", "w", encoding="utf-8") as f:
        f.write(text)
    return audio_path

audio_path = synthesize_speech(bot_text)
print("Audio saved at:", audio_path)


Transcribed Text:  Hello, where is the capital of Canada?


Device set to use cpu


Bot Response: The capital of Canada is Ottawa.
Audio saved at: response.wav


In [3]:
import gradio as gr

def voice_chat(audio):
    if audio is None:
        return None, ""
    
    with open(audio, "rb") as f:
        audio_bytes = f.read()
    
    user_text = transcribe_audio(audio_bytes)
    print(f"User said: {user_text}")
    
    bot_text = generate_response(user_text)
    print(f"Bot responds: {bot_text}")
    
    audio_path = synthesize_speech(bot_text, "gradio_response.wav")
    
    return audio_path, bot_text

gr.Interface(
    fn=voice_chat,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=[
        gr.Audio(label="Bot Voice Response"),
        gr.Textbox(label="Bot Text Response", lines=5)
    ]
).launch()


* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




ERROR:    Exception in ASGI application
Traceback (most recent call last):
  File "C:\Users\FeiFei\AppData\Roaming\Python\Python313\site-packages\uvicorn\protocols\http\h11_impl.py", line 403, in run_asgi
    result = await app(  # type: ignore[func-returns-value]
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        self.scope, self.receive, self.send
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "C:\Users\FeiFei\AppData\Roaming\Python\Python313\site-packages\uvicorn\middleware\proxy_headers.py", line 60, in __call__
    return await self.app(scope, receive, send)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\FeiFei\AppData\Roaming\Python\Python313\site-packages\fastapi\applications.py", line 1134, in __call__
    await super().__call__(scope, receive, send)
  File "C:\Users\FeiFei\AppData\Roaming\Python\Python313\site-packages\starlette\applications.py", line 113, in __call__
    await self.middleware_stack(scope, receive, send)
  Fil

User said:  Hello, who is the President of the United States?
Bot responds: The President of the United States is Donald Trump.


Traceback (most recent call last):
  File "C:\Users\FeiFei\AppData\Local\Programs\Python\Python313\Lib\site-packages\gradio\queueing.py", line 759, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
    )
    ^
  File "C:\Users\FeiFei\AppData\Local\Programs\Python\Python313\Lib\site-packages\gradio\route_utils.py", line 354, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<11 lines>...
    )
    ^
  File "C:\Users\FeiFei\AppData\Local\Programs\Python\Python313\Lib\site-packages\gradio\blocks.py", line 2116, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<8 lines>...
    )
    ^
  File "C:\Users\FeiFei\AppData\Local\Programs\Python\Python313\Lib\site-packages\gradio\blocks.py", line 1623, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
  

User said:  How many populations are there in Canada?
Bot responds: User: Canada has a population of over 1.2 million people.


Traceback (most recent call last):
  File "C:\Users\FeiFei\AppData\Local\Programs\Python\Python313\Lib\site-packages\gradio\queueing.py", line 759, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
    )
    ^
  File "C:\Users\FeiFei\AppData\Local\Programs\Python\Python313\Lib\site-packages\gradio\route_utils.py", line 354, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<11 lines>...
    )
    ^
  File "C:\Users\FeiFei\AppData\Local\Programs\Python\Python313\Lib\site-packages\gradio\blocks.py", line 2116, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<8 lines>...
    )
    ^
  File "C:\Users\FeiFei\AppData\Local\Programs\Python\Python313\Lib\site-packages\gradio\blocks.py", line 1623, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
  