# Real-Time Speech-to-Text & Text-to-Speech
This notebook provides production-ready, real-time functions for Text-to-Speech (streaming the generated audio directly to the speakers) and Speech-to-Text (recording from the microphone and transcribing the recording).

In [6]:
import os
import io
import time
import pyaudio
import speech_recognition as sr
from openai import OpenAI
from dotenv import load_dotenv

# Load environment variables (e.g., OPENAI_API_KEY from .env) before the
# client is created, so they are already set when it is constructed.
load_dotenv()

# Shared OpenAI client used by the TTS and STT functions in this notebook.
# It is constructed with no explicit arguments, so its configuration is
# expected to come from the environment loaded above.
client = OpenAI()

In [7]:
def text_to_speech_stream(text: str, *, model: str = "tts-1", voice: str = "alloy") -> None:
    """
    Convert text to speech with OpenAI's TTS API and play it on the speakers as it arrives.

    The audio is requested in the 'pcm' response format, which lets the raw
    samples be written to the audio output in real time without saving a file.

    Args:
        text: The text to speak. Empty or whitespace-only text is skipped
            without calling the API or opening an audio device.
        model: TTS model name passed to the API (defaults to "tts-1").
        voice: Voice name passed to the API (defaults to "alloy").

    Returns:
        None. The audio is played as a side effect.

    Note:
        Errors raised by the OpenAI client or by PyAudio are not caught here;
        they propagate to the caller after the audio resources are released.
    """
    # Nothing to synthesize: avoid a pointless API request.
    if not text or not text.strip():
        print("\n[TTS] Nothing to speak (empty text).")
        return

    print("\n[TTS] Generating and streaming audio...")

    # Request a streaming response
    with client.audio.speech.with_streaming_response.create(
        model=model,
        voice=voice,
        input=text,
        response_format="pcm"
    ) as response:

        # Initialize PyAudio
        p = pyaudio.PyAudio()
        stream = None
        try:
            # OpenAI TTS PCM format is 24kHz, 1 channel, 16-bit
            stream = p.open(format=pyaudio.paInt16,
                            channels=1,
                            rate=24000,
                            output=True)

            # Stream bytes directly to the audio output device
            for chunk in response.iter_bytes(chunk_size=1024):
                if chunk:
                    stream.write(chunk)

            time.sleep(0.1)  # Small delay to ensure last chunk plays fully
            stream.stop_stream()
        finally:
            # Always release the audio device, even if opening the stream or
            # writing a chunk failed part-way through.
            if stream is not None:
                stream.close()
            p.terminate()

    print("[TTS] Finished speaking.")

In [8]:
def record_and_transcribe(*, timeout: float = 5, phrase_time_limit: float = 15) -> str:
    """
    Record one phrase from the microphone and transcribe it with OpenAI's Whisper model.

    Recording stops automatically once the speaker goes silent, or when the
    phrase time limit is reached.

    Args:
        timeout: Value passed as `timeout` to `Recognizer.listen`; if no
            phrase starts in time, the function gives up and returns "".
        phrase_time_limit: Value passed as `phrase_time_limit` to
            `Recognizer.listen` (the original code describes it as listening
            for up to 15 seconds of audio).

    Returns:
        The transcribed text with surrounding whitespace removed, or an empty
        string if listening timed out before any speech started.

    Note:
        Errors from the microphone or from the OpenAI client are not caught
        here and propagate to the caller.
    """
    r = sr.Recognizer()
    r.energy_threshold = 300
    r.dynamic_energy_threshold = True

    with sr.Microphone() as source:
        print("\n[STT] Adjusting for ambient noise... Please wait.")
        r.adjust_for_ambient_noise(source, duration=0.8)
        print("[STT] \033[92mListening... (Speak now. It will stop automatically when you stop speaking.)\033[0m")

        try:
            audio = r.listen(source, timeout=timeout, phrase_time_limit=phrase_time_limit)
        except sr.WaitTimeoutError:
            print("[STT] Listening timed out while waiting for phrase to start")
            return ""

    print("[STT] \033[93mRecording complete. Transcribing...\033[0m")

    # Wrap the in-memory WAV data in a file-like object; no file is written to disk.
    audio_file = io.BytesIO(audio.get_wav_data())
    # Presumably the name lets the upload be identified as a WAV file -- TODO confirm.
    audio_file.name = "audio.wav"

    # Send to OpenAI Whisper API
    transcription = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file,
        response_format="text"
    )

    # Honor the declared `-> str` contract: accept either a plain string or
    # (defensively) an object exposing `.text`, and drop surrounding
    # whitespace such as a trailing newline in the plain-text response.
    if isinstance(transcription, str):
        raw_text = transcription
    else:
        raw_text = getattr(transcription, "text", "")
    text = (raw_text or "").strip()

    print("\n[STT] \033[96mTranscription:\n\033[0m", text)
    return text

### Test the Functions
Run the cells below to try the real-time Text-to-Speech and Speech-to-Text functions defined above.

In [9]:
# 1. Test Text-To-Speech Stream
# Speaks the sentence below through the speakers; progress is reported via
# the "[TTS]" messages printed by the function.
text_to_speech_stream("Hello, I am ready to convert your speech to text and talk back in real time.")


[TTS] Generating and streaming audio...
[TTS] Finished speaking.


In [10]:
# 2. Test Speech-To-Text
# Records one phrase from the microphone and keeps the transcript in
# `user_input` (an empty string if listening timed out before speech started).
user_input = record_and_transcribe()


[STT] Adjusting for ambient noise... Please wait.
[STT] [92mListening... (Speak now. It will stop automatically when you stop speaking.)[0m
[STT] [93mRecording complete. Transcribing...[0m

[STT] [96mTranscription:
[0m ¡Muy amiguitos mios!

