In [None]:
import pyaudio
import wave
import webrtcvad
import struct

class AudioRecorder:
    """Microphone recorder gated by WebRTC voice-activity detection.

    Capture starts with the first frame the VAD classifies as speech and ends
    once the run of consecutive non-speech frames exceeds the silence timeout.
    """

    def __init__(self):
        # Capture format: 16-bit mono PCM at 16 kHz.
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = 1
        self.RATE = 16000
        # 480 samples is 30 ms at 16 kHz; the VAD takes 10, 20 or 30 ms frames.
        self.CHUNK = 480
        # Not read by any method of this class.
        self.SILENCE_THRESHOLD = 3

        # PortAudio handle; released by cleanup().
        self.audio = pyaudio.PyAudio()

        # Voice-activity detector; mode ranges from 0 (least aggressive)
        # to 3 (most aggressive).
        self.vad = webrtcvad.Vad()
        self.vad.set_mode(1)

    def is_speech(self, frame):
        """Return the VAD verdict for one raw frame.

        Any exception raised by the detector is printed and reported as
        "not speech" (False) instead of propagating.
        """
        try:
            verdict = self.vad.is_speech(frame, self.RATE)
        except Exception as e:
            print(f"Error processing frame: {e}")
            verdict = False
        return verdict

    def record_audio(self, silence_timeout=2):
        """Block until one utterance has been captured and return its frames.

        Frames read before the first speech frame are discarded. After that
        every frame is kept, and capture stops once more than
        ``silence_timeout`` seconds' worth of consecutive non-speech frames
        have been seen. Ctrl+C ends the capture early and returns whatever was
        collected. The input stream is always stopped and closed.
        """
        captured = []
        started = False
        quiet_run = 0

        stream = self.audio.open(
            format=self.FORMAT,
            channels=self.CHANNELS,
            rate=self.RATE,
            input=True,
            frames_per_buffer=self.CHUNK,
        )

        print("Listening for speech...")

        try:
            while True:
                frame = stream.read(self.CHUNK, exception_on_overflow=False)
                voiced = self.is_speech(frame)

                if voiced:
                    if not started:
                        print("Speech detected - Recording started.")
                        started = True
                    quiet_run = 0
                elif started:
                    quiet_run += 1
                else:
                    # Still waiting for the first speech frame.
                    continue

                captured.append(frame)

                # Only a non-speech frame can push the quiet run past the limit.
                if not voiced and quiet_run > (silence_timeout * self.RATE) // self.CHUNK:
                    print("Silence detected - Recording stopped.")
                    break

        except KeyboardInterrupt:
            print("\nRecording interrupted by user")
        finally:
            stream.stop_stream()
            stream.close()

        return captured

    def save_audio(self, frames, filename="output.wav"):
        """Write the captured frames to ``filename`` as a WAV file.

        An empty frame list is reported and nothing is written. Errors while
        writing are printed rather than raised.
        """
        if not frames:
            print("No audio frames to save")
            return

        try:
            payload = b''.join(frames)
            with wave.open(filename, 'wb') as wav_out:
                wav_out.setnchannels(self.CHANNELS)
                wav_out.setsampwidth(self.audio.get_sample_size(self.FORMAT))
                wav_out.setframerate(self.RATE)
                wav_out.writeframes(payload)
            print(f"Audio saved as {filename}")
        except Exception as e:
            print(f"Error saving audio: {e}")

    def cleanup(self):
        """Release the PyAudio handle created in ``__init__``."""
        self.audio.terminate()


In [None]:
# Example usage: record one utterance from the microphone and save it with the
# default filename ("output.wav"). cleanup() runs even if recording or saving
# raises, so the PyAudio handle is always released.
if __name__ == "__main__":
    recorder = AudioRecorder()
    try:
        frames = recorder.record_audio()
        recorder.save_audio(frames)
    finally:
        recorder.cleanup()

In [None]:
import speech_recognition as sr

def recognize_speech_from_microphone():
    """Capture one phrase from the microphone and print its transcription.

    The recognizer is calibrated against ambient noise before listening. The
    captured audio is sent to ``recognize_google``; the transcript, or a fixed
    message for each of the two handled failure modes, is printed. Nothing is
    returned.
    """
    engine = sr.Recognizer()

    # The microphone is only held open while calibrating and listening.
    with sr.Microphone() as mic:
        engine.adjust_for_ambient_noise(mic)
        captured = engine.listen(mic)

    try:
        transcript = engine.recognize_google(captured)
    except sr.UnknownValueError:
        print("Google Speech Recognition could not understand audio")
    except sr.RequestError:
        print("Could not request results from Google Web Speech API")
    else:
        print("You said:", transcript)

# Runs as soon as the cell executes: opens the microphone, listens for one
# phrase and prints the outcome.
recognize_speech_from_microphone()


In [None]:
from gtts import gTTS
from io import BytesIO
from pydub import AudioSegment
from pydub.playback import play

def speak_text(text, lang='hi'):
    """
    Convert text to speech and play it directly without saving to a file.

    Parameters:
    text (str): The text to convert to speech
    lang (str): gTTS language code (default: 'hi')

    Any exception raised by gTTS, the MP3 decoder or the player propagates to
    the caller.
    """
    # The context manager releases the in-memory MP3 buffer even when
    # synthesis, decoding or playback raises.
    with BytesIO() as mp3_fp:
        # Synthesize straight into the buffer, then rewind it for decoding.
        tts = gTTS(text=text, lang=lang)
        tts.write_to_fp(mp3_fp)
        mp3_fp.seek(0)

        # Decode the MP3 bytes and play them.
        audio = AudioSegment.from_mp3(mp3_fp)
        play(audio)

# Example usage
if __name__ == "__main__":
    # Speak a sample phrase using speak_text's default language code.
    text = "Aapp kaise hoooo yar"
    speak_text(text)

In [None]:
import speech_recognition as sr
from gtts import gTTS
from io import BytesIO
from pydub import AudioSegment
from pydub.playback import play
import logging
from typing import Optional

class AudioChat:
    """
    Turn-based voice chat helper: speaks with gTTS/pydub, listens with SpeechRecognition.

    Attributes:
        lang (str): Language code passed to gTTS and to the Google recognizer (default: 'en')
        logger (logging.Logger): Logger instance for the class
    """

    def __init__(self, lang: str = 'en'):
        """
        Initialize AudioChat with specified language.

        Args:
            lang (str): Language code for text-to-speech (e.g., 'en' for English, 'hi' for Hindi)
        """
        self.lang = lang

        # Set up logging. getLogger(__name__) returns the same logger object on
        # every call, so the stream handler is attached only once; otherwise
        # every new AudioChat (or cell re-run) would duplicate each log line.
        self.logger = logging.getLogger(__name__)
        if not self.logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)
        self.logger.setLevel(logging.INFO)

    def speak_text(self, text: str) -> None:
        """
        Convert text to speech and play it directly without saving to file.

        Args:
            text (str): The text to convert to speech

        Raises:
            ValueError: If text is empty or not a string
            RuntimeError: If synthesis, decoding or playback fails; the
                original exception is attached as ``__cause__``
        """
        if not isinstance(text, str) or not text.strip():
            raise ValueError("Text must be a non-empty string")

        try:
            self.logger.info(f"Converting text to speech: {text[:50]}...")

            # The buffer is bound by the ``with`` statement, so it is always
            # closed and can never be referenced before assignment.
            with BytesIO() as mp3_fp:
                # Convert text to speech and write to buffer
                tts = gTTS(text=text, lang=self.lang)
                tts.write_to_fp(mp3_fp)
                mp3_fp.seek(0)

                # Convert to audio and play
                audio = AudioSegment.from_mp3(mp3_fp)
                play(audio)

            self.logger.info("Text-to-speech playback completed successfully")

        except Exception as e:
            self.logger.error(f"Error in text-to-speech conversion: {str(e)}")
            raise RuntimeError(f"Failed to convert or play text: {str(e)}") from e

    def recognize_speech(self, timeout: Optional[float] = None, phrase_time_limit: Optional[float] = None) -> str:
        """
        Record speech from microphone and convert to text.

        Args:
            timeout (float, optional): Maximum number of seconds for waiting for phrase to start
            phrase_time_limit (float, optional): Maximum number of seconds for a phrase

        Returns:
            str: The recognized text

        Raises:
            RuntimeError: If speech recognition fails
            TimeoutError: If recording times out
        """
        recognizer = sr.Recognizer()

        try:
            with sr.Microphone() as source:
                self.logger.info("Adjusting for ambient noise...")
                recognizer.adjust_for_ambient_noise(source, duration=1)

                self.logger.info("Listening for speech...")
                audio = recognizer.listen(source,
                                          timeout=timeout,
                                          phrase_time_limit=phrase_time_limit)

                self.logger.info("Processing speech...")
                text = recognizer.recognize_google(audio, language=self.lang)
                self.logger.info(f"Successfully recognized: {text}")
                return text

        # Library-specific errors are translated into built-in exception types
        # so callers do not need to import speech_recognition; the original
        # error stays reachable through ``__cause__``.
        except sr.WaitTimeoutError as e:
            self.logger.error("Timeout occurred while waiting for speech")
            raise TimeoutError("No speech detected within timeout period") from e

        except sr.UnknownValueError as e:
            self.logger.error("Speech was not understood")
            raise RuntimeError("Could not understand audio") from e

        except sr.RequestError as e:
            self.logger.error(f"Could not request results from speech recognition service: {str(e)}")
            raise RuntimeError(f"Speech recognition service error: {str(e)}") from e

        except Exception as e:
            self.logger.error(f"Unexpected error in speech recognition: {str(e)}")
            raise RuntimeError(f"Speech recognition failed: {str(e)}") from e

    def have_conversation(self, initial_prompt: str = "Speak something...") -> None:
        """
        Conduct a conversation by alternating between speech recognition and text-to-speech.

        Each recognized phrase is echoed back until the user says exactly one
        of 'quit', 'exit', 'stop' or 'bye' (case-insensitive).

        Args:
            initial_prompt (str): The initial prompt to speak

        Note:
            Press Ctrl+C to end the conversation
        """
        try:
            self.speak_text(initial_prompt)

            while True:
                try:
                    # Listen for user input
                    user_text = self.recognize_speech(timeout=5, phrase_time_limit=10)

                    if user_text.lower() in ['quit', 'exit', 'stop', 'bye']:
                        self.speak_text("Goodbye!")
                        break

                    # Here you can add your chatbot logic to process user_text
                    # For now, we'll just echo back
                    response = f"You said: {user_text}"
                    self.speak_text(response)

                except TimeoutError:
                    self.speak_text("I didn't hear anything. Could you please speak again?")
                except RuntimeError:
                    # Also reached when speaking the reply itself failed, since
                    # speak_text raises RuntimeError as well.
                    self.speak_text("I'm having trouble understanding. Could you please repeat?")

        except KeyboardInterrupt:
            self.speak_text("Conversation ended. Goodbye!")
            self.logger.info("Conversation ended by user")

In [None]:
import speech_recognition as sr
from gtts import gTTS
from io import BytesIO
from pydub import AudioSegment
from pydub.playback import play
import logging
from typing import Optional
import threading
import queue
import time
import audioop
import math

class ContinuousListener(sr.AudioSource):
    """
    A custom audio source that opens a callback-driven PyAudio input stream and
    queues every raw buffer it receives.

    Used as a context manager: entering opens and starts the stream, exiting
    stops it and terminates the PyAudio handle, so an instance is single-use.

    NOTE(review): this cell does not import ``pyaudio`` itself; it relies on an
    earlier cell having imported it.
    NOTE(review): ``pause_threshold`` and ``min_speech_duration`` are stored but
    not read by any method of this class.
    """
    def __init__(self, device_index=None, sample_rate=16000, chunk_size=1024,
                 pause_threshold=0.8, min_speech_duration=0.3):
        """
        Initialize continuous listener.

        Args:
            device_index: Index of input device
            sample_rate: Audio sample rate
            chunk_size: Size of audio chunks to process
            pause_threshold: Duration of silence (in seconds) to mark end of speech
            min_speech_duration: Minimum duration of speech to consider valid
        """
        self.device_index = device_index
        self.sample_rate = sample_rate
        self.chunk_size = chunk_size
        self.pause_threshold = pause_threshold
        self.min_speech_duration = min_speech_duration

        self.stream = None
        # Raw buffers delivered by _callback. This must exist before the stream
        # starts, otherwise the first callback raises AttributeError. Nothing
        # in this class drains the queue, so the consumer has to.
        self.audio_queue = queue.Queue()
        self.audio = pyaudio.PyAudio()

    def __enter__(self):
        """Open and start the callback-driven input stream; return self."""
        self.stream = self.audio.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=self.sample_rate,
            input=True,
            frames_per_buffer=self.chunk_size,
            input_device_index=self.device_index,
            stream_callback=self._callback
        )
        self.stream.start_stream()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Stop and close the stream (if one was opened) and terminate PyAudio."""
        try:
            if self.stream is not None:
                self.stream.stop_stream()
                self.stream.close()
        finally:
            # Always release PortAudio, even if stopping the stream failed.
            self.stream = None
            self.audio.terminate()

    def _callback(self, in_data, frame_count, time_info, status):
        """PyAudio stream callback: queue the captured buffer and keep the stream running."""
        self.audio_queue.put(in_data)
        return None, pyaudio.paContinue

class AudioChat:
    """
    A class to handle text-to-speech and continuous speech-to-text conversion.

    Attributes:
        lang (str): Language code passed to gTTS and to the Google recognizer (default: 'en')
        pause_threshold (float): Seconds of silence that end an utterance
        min_speech_duration (float): Minimum seconds since speech start before an utterance may end
        energy_threshold (int): Initial RMS energy level treated as speech
        logger (logging.Logger): Logger instance for the class
    """

    def __init__(self, lang: str = 'en',
                 pause_threshold: float = 0.8,
                 min_speech_duration: float = 0.3,
                 energy_threshold: int = 1000):
        """
        Initialize AudioChat with specified parameters.

        Args:
            lang (str): Language code for text-to-speech
            pause_threshold (float): Duration of silence (in seconds) to mark end of speech
            min_speech_duration (float): Minimum duration of speech to consider valid
            energy_threshold (int): Minimum audio energy to consider as speech
        """
        self.lang = lang
        self.pause_threshold = pause_threshold
        self.min_speech_duration = min_speech_duration
        self.energy_threshold = energy_threshold

        # Set up logging. getLogger(__name__) returns the same logger object on
        # every call, so the stream handler is attached only once; otherwise
        # every new AudioChat (or cell re-run) would duplicate each log line.
        self.logger = logging.getLogger(__name__)
        if not self.logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)
        self.logger.setLevel(logging.INFO)

    def speak_text(self, text: str) -> None:
        """
        Convert text to speech and play it.

        Args:
            text (str): The text to convert to speech

        Raises:
            ValueError: If text is empty or not a string

        Errors from gTTS, decoding or playback propagate unchanged.
        """
        if not isinstance(text, str) or not text.strip():
            raise ValueError("Text must be a non-empty string")

        self.logger.info(f"Converting text to speech: {text[:50]}...")
        # The buffer is bound by the ``with`` statement, so it is always closed
        # and can never be referenced before assignment.
        with BytesIO() as mp3_fp:
            tts = gTTS(text=text, lang=self.lang)
            tts.write_to_fp(mp3_fp)
            mp3_fp.seek(0)
            audio = AudioSegment.from_mp3(mp3_fp)
            play(audio)

    def recognize_continuous_speech(self) -> str:
        """
        Record speech continuously until a significant pause is detected.

        Chunks are read directly from the microphone stream. Recording ends
        once the chunk energy has stayed below the recognizer's threshold for
        longer than ``pause_threshold`` seconds and more than
        ``min_speech_duration`` seconds have passed since speech started.
        Every chunk read, including those before speech starts, is sent to the
        recognizer.

        Returns:
            str: The recognized text

        Raises:
            RuntimeError: If speech recognition fails
        """
        recognizer = sr.Recognizer()
        recognizer.energy_threshold = self.energy_threshold

        # Buffer to store audio data
        audio_data = []
        speech_started = False
        silence_start = None
        speech_start = None

        with sr.Microphone() as source:
            self.logger.info("Adjusting for ambient noise...")
            # NOTE(review): calibration presumably updates
            # recognizer.energy_threshold, so the configured value may only be
            # a starting point - confirm against the SpeechRecognition docs.
            recognizer.adjust_for_ambient_noise(source, duration=1)

            self.logger.info("Listening for continuous speech...")

            try:
                while True:
                    audio_chunk = source.stream.read(source.CHUNK)
                    # RMS over 2-byte samples.
                    energy = audioop.rms(audio_chunk, 2)

                    # Detect speech start
                    if not speech_started and energy > recognizer.energy_threshold:
                        speech_started = True
                        speech_start = time.time()
                        silence_start = None
                        self.logger.debug("Speech started")

                    # Detect silence
                    if speech_started and energy < recognizer.energy_threshold:
                        if silence_start is None:
                            silence_start = time.time()
                        elif time.time() - silence_start > self.pause_threshold:
                            # Check if speech duration meets minimum requirement
                            if time.time() - speech_start > self.min_speech_duration:
                                break
                    else:
                        # Any non-quiet chunk restarts the pause timer.
                        silence_start = None

                    audio_data.append(audio_chunk)

                # Convert audio data to AudioData object
                audio = sr.AudioData(b''.join(audio_data),
                                     source.SAMPLE_RATE,
                                     source.SAMPLE_WIDTH)

                self.logger.info("Processing speech...")
                text = recognizer.recognize_google(audio, language=self.lang)
                self.logger.info(f"Successfully recognized: {text}")
                return text

            # Library errors are translated to RuntimeError; the original
            # exception stays reachable through ``__cause__``.
            except sr.UnknownValueError as e:
                self.logger.error("Speech was not understood")
                raise RuntimeError("Could not understand audio") from e
            except sr.RequestError as e:
                self.logger.error(f"Could not request results: {str(e)}")
                raise RuntimeError(f"Speech recognition service error: {str(e)}") from e
            except Exception as e:
                self.logger.error(f"Unexpected error: {str(e)}")
                raise RuntimeError(f"Speech recognition failed: {str(e)}") from e

    def have_continuous_conversation(self, initial_prompt: str = "Speak something...") -> None:
        """
        Conduct a conversation using continuous speech recognition.

        After each recognized phrase a fixed story is spoken, until the user
        says exactly one of 'quit', 'exit', 'stop' or 'bye' (case-insensitive)
        or presses Ctrl+C.

        Args:
            initial_prompt (str): The initial prompt to speak
        """
        try:
            self.speak_text(initial_prompt)

            while True:
                try:
                    # Listen for user input continuously
                    user_text = self.recognize_continuous_speech()

                    if user_text.lower() in ['quit', 'exit', 'stop', 'bye']:
                        self.speak_text("Goodbye!")
                        break

                    # Here you can add your chatbot logic to process user_text.
                    # NOTE(review): the reply is a fixed story regardless of
                    # what was said; an echo of user_text was built here before
                    # but never spoken - confirm which reply is intended.
                    story = """Once upon a time in a small, vibrant village nestled between two rivers, there lived a young girl named Lira. Lira had a unique gift: she could understand the language of plants. From a young age, she'd spent countless hours wandering through the forest, listening to the trees, flowers, and even the blades of grass. They whispered secrets of the earth, told tales of seasons past, and shared wisdom that had been passed down for centuries."""
                    self.speak_text(story)

                except RuntimeError:
                    self.speak_text("I'm having trouble understanding. Could you please repeat?")

        except KeyboardInterrupt:
            self.speak_text("Conversation ended. Goodbye!")
            self.logger.info("Conversation ended by user")

def main():
    """Build an AudioChat with explicit tuning values and start a continuous conversation."""
    settings = {
        'lang': 'en',
        'pause_threshold': 0.8,      # seconds of silence that end an utterance
        'min_speech_duration': 0.5,  # minimum speech duration to consider valid
        'energy_threshold': 1000,    # adjust for the microphone and environment
    }
    session = AudioChat(**settings)
    session.have_continuous_conversation("Please start speaking. I'll listen until you pause.")

# Start the continuous conversation when this cell is executed as the main module.
if __name__ == "__main__":
    main()

In [None]:
import speech_recognition as sr
from gtts import gTTS
from io import BytesIO
from pydub import AudioSegment
from pydub.playback import play
import logging
from typing import Optional
import threading
import queue
import time
import audioop
import math
import pyaudio

class AudioPlayer:
    """Plays a segment in short slices so a stop request takes effect between slices."""

    def __init__(self):
        # Not read or written by any other method of this class.
        self.current_playback = None
        # Set by stop_playback(); checked before each slice is played.
        self.should_stop = threading.Event()

    def play_audio(self, audio_segment):
        """Play ``audio_segment`` slice by slice until it ends or a stop is requested.

        Any earlier stop request is cleared first. Each slice spans 50 index
        units of the segment and is played to completion before the stop flag
        is looked at again.
        """
        self.should_stop.clear()
        slice_len = 50
        total = len(audio_segment)
        offset = 0

        while offset < total and not self.should_stop.is_set():
            play(audio_segment[offset:offset + slice_len])
            offset += slice_len

    def stop_playback(self):
        """Ask the running play_audio() loop to stop before its next slice."""
        self.should_stop.set()

class AudioChat:
    """
    Story-telling voice chat with a background microphone monitor.

    A daemon thread watches input energy; when it exceeds ``energy_threshold``
    while the assistant is not speaking, the ``speech_detected`` event is set
    and the player is asked to stop.

    NOTE(review): because of the ``not self.is_speaking`` guard in
    ``_audio_callback``, audio that arrives while ``speak_text`` is playing
    never triggers ``stop_playback`` - confirm whether interrupting playback
    mid-sentence was intended.
    """

    def __init__(self, lang='en', energy_threshold=1000):
        """
        Args:
            lang: Language code passed to gTTS and to the Google recognizer
            energy_threshold: RMS level above which input counts as speech
        """
        self.lang = lang
        self.energy_threshold = energy_threshold
        self.logger = self._setup_logger()
        self.audio_player = AudioPlayer()
        self.speech_detected = threading.Event()
        self.is_speaking = False

    def _setup_logger(self):
        """Return the module logger, attaching a formatted stream handler on first use."""
        logger = logging.getLogger(__name__)
        # getLogger returns the same object on every call; adding a handler
        # each time would duplicate every log line per AudioChat instance.
        if not logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            logger.addHandler(handler)
        logger.setLevel(logging.INFO)
        return logger

    def _monitor_audio(self, audio_queue):
        """Continuously monitor audio input for speech.

        Opens a callback-driven 16 kHz mono input stream and then idles until
        the thread is torn down. ``audio_queue`` is accepted but not used.
        """
        p = pyaudio.PyAudio()
        stream = p.open(format=pyaudio.paInt16,
                        channels=1,
                        rate=16000,
                        input=True,
                        frames_per_buffer=1024,
                        stream_callback=lambda in_data, frame_count, time_info, status:
                            (self._audio_callback(in_data), pyaudio.paContinue))

        stream.start_stream()

        try:
            # The callback does all the work; this loop only keeps the thread alive.
            while True:
                time.sleep(0.1)
        finally:
            stream.stop_stream()
            stream.close()
            p.terminate()

    def _audio_callback(self, in_data):
        """Process incoming audio data: flag speech and request a playback stop."""
        # RMS over 2-byte samples.
        energy = audioop.rms(in_data, 2)

        if energy > self.energy_threshold and not self.is_speaking:
            self.logger.info("Speech detected!")
            self.speech_detected.set()
            self.audio_player.stop_playback()
        return in_data

    def speak_text(self, text: str) -> None:
        """
        Convert text to speech and play it through the interruptible player.

        Raises:
            ValueError: If text is empty or not a string

        Errors from gTTS, decoding or playback propagate unchanged.
        """
        if not isinstance(text, str) or not text.strip():
            raise ValueError("Text must be a non-empty string")

        self.logger.info(f"Converting text to speech: {text[:50]}...")
        # The buffer is bound by the ``with`` statement, so it is always closed
        # and can never be referenced before assignment.
        with BytesIO() as mp3_fp:
            tts = gTTS(text=text, lang=self.lang)
            tts.write_to_fp(mp3_fp)
            mp3_fp.seek(0)
            audio = AudioSegment.from_mp3(mp3_fp)

            self.is_speaking = True
            try:
                self.audio_player.play_audio(audio)
            finally:
                # Reset even when playback raises; a flag stuck at True would
                # make _audio_callback ignore all later speech.
                self.is_speaking = False

    def have_continuous_conversation(self):
        """Handle continuous conversation with interruption capability.

        Repeats: speak the story, wait up to one second for the monitor to
        flag speech, and if it did, capture and recognize one phrase. Saying
        exactly 'quit', 'exit', 'stop' or 'bye', or pressing Ctrl+C, ends the
        loop.
        """
        story = """Once upon a time in a small, vibrant village nestled between two rivers, 
                  there lived a young girl named Lira. Lira had a unique gift: she could 
                  understand the language of plants. From a young age, she'd spent countless 
                  hours wandering through the forest, listening to the trees, flowers, and 
                  even the blades of grass."""

        # Start audio monitoring in a separate thread
        audio_queue = queue.Queue()
        monitor_thread = threading.Thread(target=self._monitor_audio,
                                          args=(audio_queue,),
                                          daemon=True)
        monitor_thread.start()

        try:
            while True:
                self.speech_detected.clear()
                self.speak_text(story)

                # Wait for speech detection
                if self.speech_detected.wait(timeout=1.0):
                    # Process the speech here
                    recognizer = sr.Recognizer()
                    with sr.Microphone() as source:
                        audio = recognizer.listen(source)
                        try:
                            # Recognize in the same language that is used for speaking.
                            text = recognizer.recognize_google(audio, language=self.lang)
                            self.logger.info(f"Recognized: {text}")

                            if text.lower() in ['quit', 'exit', 'stop', 'bye']:
                                self.speak_text("Goodbye!")
                                break
                        except sr.UnknownValueError:
                            self.logger.error("Could not understand audio")
                        except sr.RequestError as e:
                            self.logger.error(f"Could not request results: {str(e)}")

        except KeyboardInterrupt:
            self.logger.info("Conversation ended by user")
            self.speak_text("Goodbye!")

def main():
    """Create an AudioChat with an energy threshold of 1000 and run its conversation loop."""
    session = AudioChat(energy_threshold=1000)
    session.have_continuous_conversation()

# Start the interruptible story conversation when this cell is executed as the main module.
if __name__ == "__main__":
    main()

In [None]:
import speech_recognition as sr
from gtts import gTTS
from io import BytesIO
import pyaudio
import numpy as np
import threading
import queue
import sounddevice as sd
import soundfile as sf
import time
import logging
from tempfile import NamedTemporaryFile
import os

class SmoothAudioChat:
    """
    Echo-style voice chat driven by a sounddevice input callback.

    Input blocks whose energy exceeds ``silence_threshold`` are buffered as
    speech; the first quieter block ends the utterance, which is then
    transcribed and echoed back via gTTS.

    NOTE(review): ``_process_speech`` (recognition plus blocking playback) runs
    inside the audio callback - confirm this is acceptable for the input stream.
    NOTE(review): ``stop_playback`` takes the same lock that ``play_audio``
    holds for the whole playback, so it waits for playback to finish rather
    than cutting it short.
    """

    def __init__(self, sample_rate=44100, cooldown_time=0.5):
        """
        Args:
            sample_rate: Sample rate requested for the input stream and used
                when writing captured speech to disk
            cooldown_time: Seconds to sleep after each playback
        """
        self.sample_rate = sample_rate
        self.audio_queue = queue.Queue()
        self.is_playing = False
        self.should_stop = False
        self.is_speaking = False
        self.cooldown_time = cooldown_time  # Cooldown after playback to avoid feedback
        self.playback_lock = threading.Lock()

        # Configure logging
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

        # Initialize PyAudio (only terminated in cleanup(); not otherwise used here)
        self.p = pyaudio.PyAudio()

        # Audio detection parameters
        # Threshold is expressed in 16-bit sample units; adjust for the microphone.
        self.silence_threshold = 300
        self.speech_buffer = []
        self.speech_detected = False

    def _audio_callback(self, indata, frames, time, status):
        """Handle incoming audio data.

        The parameter named ``time`` shadows the ``time`` module inside this
        method only; the module is not used here.
        """
        if status:
            self.logger.warning(f"Audio input status: {status}")

        # Calculate audio energy (RMS) in 16-bit sample units. Floating-point
        # input blocks are normalised to [-1.0, 1.0], so their RMS could never
        # exceed a threshold of 300 without rescaling. Working in float64 also
        # avoids overflow when squaring integer samples.
        samples = np.asarray(indata)
        scaled = samples.astype(np.float64)
        if np.issubdtype(samples.dtype, np.floating):
            scaled = scaled * 32768.0
        energy = np.sqrt(np.mean(scaled ** 2))

        # Check for speech detection
        if energy > self.silence_threshold and not self.is_speaking:
            if not self.speech_detected:
                self.speech_detected = True
                self.stop_playback()  # Stop playback if speech detected
                self.speech_buffer = []
            self.speech_buffer.extend(indata.flatten())

        elif self.speech_detected and energy <= self.silence_threshold:
            # Process captured speech
            self._process_speech()
            self.speech_detected = False

    def _process_speech(self):
        """Transcribe the buffered speech and echo it back.

        Saying exactly 'quit', 'exit', 'stop' or 'bye' sets ``should_stop``
        instead of replying. Recognition failures are logged, not raised.
        """
        if not self.speech_buffer:
            return

        # Reserve a temporary path and close the handle straight away, so the
        # file can be reopened by name while it is written and read.
        with NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
            wav_path = temp_file.name

        try:
            sf.write(wav_path, self.speech_buffer, self.sample_rate)

            # Use speech recognition; the file is only open while it is read.
            recognizer = sr.Recognizer()
            with sr.AudioFile(wav_path) as source:
                audio = recognizer.record(source)

            text = recognizer.recognize_google(audio)
            self.logger.info(f"Recognized: {text}")

            if text.lower() in ['quit', 'exit', 'stop', 'bye']:
                self.should_stop = True
                return

            response = f"You said: {text}"
            audio_data, sample_rate = self.text_to_speech(response)
            self.play_audio(audio_data, sample_rate)

        except sr.UnknownValueError:
            self.logger.info("Speech not understood")
        except sr.RequestError as e:
            self.logger.error(f"Recognition error: {e}")
        finally:
            # Runs on every path, after the AudioFile has been closed.
            os.unlink(wav_path)

    def start_listening(self):
        """Start continuous audio input monitoring until ``should_stop`` is set."""
        try:
            with sd.InputStream(callback=self._audio_callback,
                                channels=1,
                                samplerate=self.sample_rate):
                while not self.should_stop:
                    time.sleep(0.1)

        except Exception as e:
            self.logger.error(f"Error in audio input stream: {e}")

    def play_audio(self, audio_data, sample_rate):
        """Play audio data to completion, then pause for the cooldown period."""
        with self.playback_lock:
            self.is_speaking = True
            try:
                sd.play(audio_data, sample_rate)
                sd.wait()
            finally:
                # Reset even when playback raises; a flag stuck at True would
                # make the input callback ignore all later speech.
                self.is_speaking = False
            # Cooldown period to avoid feedback
            time.sleep(self.cooldown_time)

    def stop_playback(self):
        """Stop current audio playback if running"""
        with self.playback_lock:
            sd.stop()
            self.is_speaking = False

    def text_to_speech(self, text):
        """Synthesize ``text`` with gTTS and return ``(audio_data, sample_rate)``.

        The temporary file is removed whether or not synthesis or decoding
        succeeds.
        """
        # gTTS writes MP3 data, so the file is named accordingly; the handle is
        # closed before gTTS and soundfile reopen the path by name.
        # NOTE(review): decoding MP3 through soundfile presumably depends on
        # the installed libsndfile version - confirm.
        with NamedTemporaryFile(suffix='.mp3', delete=False) as temp_file:
            mp3_path = temp_file.name

        try:
            # Generate speech with gTTS
            tts = gTTS(text=text, lang='en')
            tts.save(mp3_path)

            audio_data, sample_rate = sf.read(mp3_path)
            return audio_data, sample_rate
        finally:
            # Remove temporary file
            os.unlink(mp3_path)

    def run_conversation(self):
        """Run the main conversation loop, listening for user input and repeating recognized speech"""
        # Start listening thread
        listen_thread = threading.Thread(target=self.start_listening, daemon=True)
        listen_thread.start()

        try:
            while not self.should_stop:
                time.sleep(0.1)  # Small delay to prevent CPU overuse

        except KeyboardInterrupt:
            self.logger.info("Conversation ended by user")
        finally:
            self.should_stop = True
            self.stop_playback()

    def cleanup(self):
        """Clean up resources"""
        self.p.terminate()

def main():
    """Run one SmoothAudioChat session, always releasing its resources afterwards."""
    session = SmoothAudioChat()
    try:
        session.run_conversation()
    finally:
        # Runs whether the conversation ended normally or raised.
        session.cleanup()

# Run the SmoothAudioChat session when this cell is executed as the main module.
if __name__ == "__main__":
    main()


In [None]:
import speech_recognition as sr
from gtts import gTTS
from io import BytesIO
import pyaudio
import numpy as np
import threading
import queue
import sounddevice as sd
import soundfile as sf
import time
import logging
from tempfile import NamedTemporaryFile
import os

class SmoothAudioChat:
    """Echo-style voice chat: capture speech, transcribe it and speak it back.

    Microphone blocks arrive through a sounddevice ``InputStream`` callback.
    Blocks whose RMS energy exceeds ``silence_threshold`` are buffered; the
    first quiet block afterwards sends the buffer to Google speech
    recognition, and the reply ("You said: ...") is synthesized with gTTS and
    played back through sounddevice.
    """

    # Spoken commands that end the conversation.
    EXIT_COMMANDS = ('quit', 'exit', 'stop', 'bye')

    # Float samples are normalised to [-1.0, 1.0]; multiplying by this factor
    # puts them on the 16-bit PCM scale, the only scale on which a threshold
    # of 300 is meaningful.
    _INT16_FULL_SCALE = 32768.0

    def __init__(self, sample_rate=44100, cooldown_time=0.5):
        """Set up state, logging and PyAudio.

        Args:
            sample_rate: Capture sample rate in Hz for the input stream.
            cooldown_time: Seconds to pause after playback so the reply is
                not picked up again as user speech.
        """
        self.sample_rate = sample_rate
        self.audio_queue = queue.Queue()
        self.is_playing = False
        self.should_stop = False
        self.is_speaking = False
        self.cooldown_time = cooldown_time  # Cooldown after playback to avoid feedback
        self.playback_lock = threading.Lock()

        # Configure logging
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

        # Initialize PyAudio
        self.p = pyaudio.PyAudio()

        # Audio detection parameters
        self.silence_threshold = 300  # 16-bit PCM scale; adjust based on your microphone
        self.speech_buffer = []
        self.speech_detected = False

    @classmethod
    def _rms_energy(cls, samples):
        """Return the RMS level of ``samples`` expressed on the 16-bit PCM scale.

        Floating-point blocks are scaled up to the int16 range so they can be
        compared with ``silence_threshold``; integer blocks are widened to
        float64 first so that squaring cannot overflow. An empty block has
        energy ``0.0``.
        """
        block = np.asarray(samples)
        if block.size == 0:
            return 0.0
        widened = block.astype(np.float64)
        if block.dtype.kind == 'f':
            widened = widened * cls._INT16_FULL_SCALE
        return float(np.sqrt(np.mean(widened ** 2)))

    def _audio_callback(self, indata, frames, time, status):
        """Handle one block of incoming audio (sounddevice stream callback).

        Loud blocks start/extend the speech buffer; the first quiet block
        after speech triggers processing of the buffered utterance.
        The parameter named ``time`` shadows the ``time`` module inside this
        method only; the signature is dictated by the callback protocol.
        """
        if status:
            self.logger.warning(f"Audio input status: {status}")

        # Calculate audio energy on a scale comparable with silence_threshold
        energy = self._rms_energy(indata)

        # Check for speech detection
        if energy > self.silence_threshold and not self.is_speaking:
            if not self.speech_detected:
                self.speech_detected = True
                self.stop_playback()  # Stop playback if speech detected
                self.speech_buffer = []
            self.speech_buffer.extend(indata.flatten())

        elif self.speech_detected and energy <= self.silence_threshold:
            # NOTE(review): recognition, synthesis and playback all run inside
            # the audio callback and block it; consider a worker thread.
            try:
                self._process_speech()
            except Exception as e:
                # An escaping exception would abort the input stream.
                self.logger.error(f"Error processing speech: {e}")
            finally:
                self.speech_detected = False

    def start_listening(self):
        """Open the microphone stream and keep it alive until ``should_stop`` is set."""
        try:
            with sd.InputStream(callback=self._audio_callback,
                                channels=1,
                                samplerate=self.sample_rate):
                while not self.should_stop:
                    time.sleep(0.1)

        except Exception as e:
            self.logger.error(f"Error in audio input stream: {e}")

    def stop_playback(self):
        """Stop current audio playback if running."""
        # Stop first, lock second: play_audio() holds playback_lock while it
        # waits for playback to end, so acquiring the lock before sd.stop()
        # would merely wait for the playback this call is meant to cut short.
        sd.stop()
        with self.playback_lock:
            self.is_speaking = False

    def text_to_speech(self, text):
        """Synthesize ``text`` with gTTS and return ``(audio_data, sample_rate)``.

        The temporary file is removed even when synthesis or decoding fails;
        such errors propagate to the caller.
        """
        # Only reserve a unique path and close our handle right away:
        # re-opening or unlinking a still-open file fails on Windows.
        # gTTS writes MP3 data, so the suffix says so.
        with NamedTemporaryFile(suffix='.mp3', delete=False) as temp_file:
            temp_path = temp_file.name

        try:
            # Generate speech with gTTS
            gTTS(text=text, lang='en').save(temp_path)

            # Decode the synthesized audio into samples
            audio_data, sample_rate = sf.read(temp_path)
        finally:
            # Remove temporary file
            os.unlink(temp_path)

        return audio_data, sample_rate

    def run_conversation(self):
        """Run the main conversation loop, listening for user input and repeating recognized speech."""
        # Start listening thread
        listen_thread = threading.Thread(target=self.start_listening, daemon=True)
        listen_thread.start()

        try:
            while not self.should_stop:
                time.sleep(0.1)  # Small delay to prevent CPU overuse

        except KeyboardInterrupt:
            self.logger.info("Conversation ended by user")
        finally:
            self.should_stop = True
            self.stop_playback()

    def _process_speech(self):
        """Transcribe the buffered utterance and speak the echo reply.

        Does nothing when the buffer is empty. A recognized exit command sets
        ``should_stop`` instead of producing a reply. Recognition failures are
        logged; errors from synthesis or playback propagate.
        """
        if not self.speech_buffer:
            return

        # Reserve a path, then write/read it with the handle closed so the
        # file can be re-opened and deleted on every platform.
        with NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
            temp_path = temp_file.name

        recognizer = sr.Recognizer()
        try:
            sf.write(temp_path, self.speech_buffer, self.sample_rate)
            with sr.AudioFile(temp_path) as source:
                audio = recognizer.record(source)
        finally:
            # The audio is in memory now (or loading failed): drop the file.
            os.unlink(temp_path)

        # Use speech recognition
        try:
            text = recognizer.recognize_google(audio)
        except sr.UnknownValueError:
            self.logger.info("Speech not understood")
            return
        except sr.RequestError as e:
            self.logger.error(f"Recognition error: {e}")
            return

        self.logger.info(f"Recognized: {text}")

        if text.lower() in self.EXIT_COMMANDS:
            self.should_stop = True
            return

        response = f"You said: {text}"
        self.logger.info(f"Response text: {response}")

        audio_data, sample_rate = self.text_to_speech(response)
        self.logger.info("Text-to-speech conversion completed.")

        # Play response
        self.play_audio(audio_data, sample_rate)

    def play_audio(self, audio_data, sample_rate):
        """Play ``audio_data`` at ``sample_rate`` and block until it finishes or is stopped."""
        with self.playback_lock:
            self.is_speaking = True
            try:
                self.logger.info("Starting playback.")
                sd.play(audio_data, sample_rate)
                sd.wait()
                self.logger.info("Playback finished.")
            finally:
                # Reset even on errors, otherwise speech detection stays muted.
                self.is_speaking = False

        # Cooldown period to avoid feedback; taken outside the lock so that
        # stop_playback() is not held up by it.
        time.sleep(self.cooldown_time)

    def cleanup(self):
        """Clean up resources by terminating the PyAudio instance."""
        self.p.terminate()

def main():
    """Create a ``SmoothAudioChat``, run the conversation, and clean up no matter how it ends."""
    session = SmoothAudioChat()
    try:
        session.run_conversation()
    finally:
        # Executed on normal return and on exceptions alike
        session.cleanup()

# Start the chat only when this code is executed directly (script or notebook
# cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

In [1]:
import speech_recognition as sr
from gtts import gTTS
from io import BytesIO
import pygame
import threading
import queue
import logging
import time
import re
import signal
import os
import numpy as np
from threading import Event, Lock
from langchain_ollama import ChatOllama
import webrtcvad
from langdetect import detect
from collections import deque
import sounddevice as sd

class VoiceChatSystem:
    """Threaded voice assistant: microphone -> speech-to-text -> LLM -> text-to-speech.

    Three worker threads cooperate through queues:

    * ``listen_continuously`` captures phrases, filters out non-speech and
      puts recognized text on ``text_queue``.
    * ``process_text`` streams the model's reply and puts finished sentences
      on ``sentence_queue``.
    * ``speak_responses`` groups sentences into batches and plays them with
      gTTS + pygame.

    Every request carries a task id; work whose id no longer equals
    ``current_task_id`` is treated as stale and dropped.
    """

    # Audio format handed to WebRTC VAD (16 kHz, 30 ms frames).
    VAD_SAMPLE_RATE = 16000
    VAD_FRAME_MS = 30
    # Languages tried, in order, by the Google recognizer.
    RECOGNITION_LANGUAGES = ('en', 'hi', 'kn')
    # Spoken commands that shut the system down.
    EXIT_COMMANDS = ('quit', 'exit', 'stop', 'bye')
    # A speech batch stops growing once it holds this many words.
    MAX_BATCH_WORDS = 50

    def __init__(self, model_name="llama2:13b"):
        """Create queues, synchronisation primitives, audio back ends and the chat model.

        Args:
            model_name: Name of the Ollama model passed to ``ChatOllama``.
        """
        self.logger = self._setup_logger()
        self.recognizer = sr.Recognizer()
        self.energy_threshold = 300  # Lowered threshold for better detection
        self.text_queue = queue.Queue()
        self.sentence_queue = queue.Queue()
        self.audio_queue = deque(maxlen=10)  # Queue for batch audio processing
        self.is_listening = True
        self.current_task_id = 0
        self.interrupt_event = Event()
        self.speaking_event = Event()
        self.speaking_lock = Lock()
        self.is_system_speaking = False
        self.last_system_audio_end = 0
        self.silence_duration = 0.8  # Adjusted for better pause detection
        self.current_sound = None

        # Initialize audio settings
        pygame.mixer.init(frequency=16000, channels=2)
        pygame.mixer.set_num_channels(4)
        sd.default.samplerate = 16000
        sd.default.channels = 2

        self.vad = webrtcvad.Vad(2)  # Reduced aggressiveness for better detection
        self.model = ChatOllama(model=model_name)

    def _setup_logger(self):
        """Return the 'VoiceChatSystem' logger, attaching its stream handler only once.

        ``logging.getLogger`` returns the same object on every call, so adding
        a handler unconditionally would duplicate each log line whenever the
        class is instantiated again (e.g. when a notebook cell is re-run).
        """
        logger = logging.getLogger('VoiceChatSystem')
        if not logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            logger.addHandler(handler)
        logger.setLevel(logging.INFO)
        return logger

    def is_valid_human_speech(self, audio_data, timestamp):
        """Decide whether a captured phrase is loud and speech-like enough to process.

        Args:
            audio_data: ``speech_recognition.AudioData`` for the phrase.
            timestamp: Capture time (``time.time()`` value) used to detect
                phrases recorded right after the system itself spoke.

        Returns:
            True when the mean absolute amplitude reaches the energy threshold
            and the share of VAD-positive frames exceeds the required ratio;
            both limits are stricter while (or just after) the system speaks.
            False otherwise, including on any internal error.
        """
        try:
            # The microphone's native rate is usually not 16 kHz; convert so
            # the frames given to VAD really have the rate it is told about.
            pcm = audio_data.get_raw_data(convert_rate=self.VAD_SAMPLE_RATE,
                                          convert_width=2)
            raw_data = np.frombuffer(pcm, dtype=np.int16)
            if raw_data.size == 0:
                return False

            # Calculate audio energy (widen first: abs(-32768) overflows int16)
            audio_energy = np.abs(raw_data.astype(np.int32)).mean()

            # Time since last system audio ended
            time_since_system_audio = timestamp - self.last_system_audio_end
            echo_risk = self.is_system_speaking or time_since_system_audio < 0.2

            # Adjust energy threshold based on system speech
            energy_threshold = self.energy_threshold * 2 if echo_risk else self.energy_threshold
            if audio_energy < energy_threshold:
                return False

            # VAD analysis over consecutive fixed-size frames
            frame_len = self.VAD_SAMPLE_RATE * self.VAD_FRAME_MS // 1000
            frames = len(raw_data) // frame_len
            if frames == 0:
                return False

            speech_frames = 0
            for i in range(frames):
                frame = raw_data[i * frame_len:(i + 1) * frame_len].tobytes()
                try:
                    if self.vad.is_speech(frame, self.VAD_SAMPLE_RATE):
                        speech_frames += 1
                except Exception:
                    # A frame VAD rejects simply does not count as speech.
                    continue

            speech_ratio = speech_frames / frames
            min_speech_ratio = 0.5 if echo_risk else 0.3
            return speech_ratio > min_speech_ratio

        except Exception as e:
            self.logger.error(f"Error in speech validation: {str(e)}")
            return False

    def speak_text(self, text, task_id):
        """Synthesize ``text`` and play it, polling for interruption.

        Args:
            text: Text to speak.
            task_id: Id of the request this text belongs to.

        Returns:
            True if playback ran to the end without an interrupt. False if
            the task is stale, the text is blank, or an interrupt was (or
            became) set.

        Raises:
            Errors from gTTS or pygame propagate to the caller.
        """
        if task_id != self.current_task_id or not text.strip():
            return False
        if self.interrupt_event.is_set():
            return False

        try:
            with self.speaking_lock:
                self.is_system_speaking = True
                self.speaking_event.set()

                lang = self.detect_language(text)
                try:
                    tts = gTTS(text=text, lang=lang)
                except ValueError:
                    # The detected language is not one gTTS can speak.
                    tts = gTTS(text=text, lang='en')

                temp_file = f"temp_audio_{task_id}_{time.time()}.mp3"
                try:
                    tts.save(temp_file)

                    self.current_sound = pygame.mixer.Sound(temp_file)
                    channel = pygame.mixer.find_channel()

                    if channel is None:
                        # Force stop all channels if none available
                        for i in range(pygame.mixer.get_num_channels()):
                            pygame.mixer.Channel(i).stop()
                        channel = pygame.mixer.Channel(0)

                    channel.play(self.current_sound)

                    while channel.get_busy() and not self.interrupt_event.is_set():
                        pygame.time.wait(10)

                    channel.stop()

                finally:
                    self.current_sound = None
                    if os.path.exists(temp_file):
                        os.remove(temp_file)

                self.last_system_audio_end = time.time()
                return not self.interrupt_event.is_set()

        finally:
            self.is_system_speaking = False
            self.speaking_event.clear()

    @staticmethod
    def _drain(pending):
        """Remove every item currently in the ``queue.Queue`` ``pending``."""
        try:
            while True:
                pending.get_nowait()
                pending.task_done()
        except queue.Empty:
            pass

    def _clear_queues(self):
        """Clear the text and sentence queues."""
        self._drain(self.text_queue)
        self._drain(self.sentence_queue)
        self.logger.debug("Queues cleared")

    def immediate_interrupt(self):
        """Stop current speech and discard all pending work."""
        self.interrupt_event.set()

        # Invalidate the in-flight task right away. The event is cleared again
        # below, so without this an LLM stream still running for the old
        # request would keep queueing sentences that still look current.
        self.current_task_id += 1

        with self.speaking_lock:
            # Stop current sound if exists
            if self.current_sound is not None:
                for i in range(pygame.mixer.get_num_channels()):
                    pygame.mixer.Channel(i).stop()
                self.current_sound = None

            # Clear audio queue
            self.audio_queue.clear()

            # Clear all other queues
            self._clear_queues()

        self.interrupt_event.clear()
        self.is_system_speaking = False
        self.last_system_audio_end = time.time()
        self.logger.debug("System interrupted and all queues cleared")

    def _recognize(self, audio):
        """Return the first non-empty transcript across ``RECOGNITION_LANGUAGES``, else None.

        "Not understood" results move on to the next language. Service errors
        (``sr.RequestError``) propagate so the caller can log them instead of
        silently retrying each language.
        """
        for lang in self.RECOGNITION_LANGUAGES:
            try:
                text = self.recognizer.recognize_google(audio, language=lang)
            except sr.UnknownValueError:
                continue
            if text:
                return text
        return None

    def listen_continuously(self):
        """Capture phrases from the microphone and queue recognized text.

        Runs until ``is_listening`` becomes false or an exit command is heard.
        Valid new speech interrupts current playback and pending requests.
        """
        with sr.Microphone() as source:
            self.logger.info("Adjusting for ambient noise...")
            self.recognizer.dynamic_energy_threshold = True
            self.recognizer.energy_threshold = self.energy_threshold
            self.recognizer.adjust_for_ambient_noise(source, duration=1)
            self.logger.info("Ready to listen!")

            while self.is_listening:
                try:
                    self.logger.info("Listening...")
                    audio = self.recognizer.listen(source, phrase_time_limit=3)
                    current_time = time.time()

                    if not self.is_valid_human_speech(audio, current_time):
                        continue

                    # Interrupt and clear queues if new speech detected while system is speaking
                    # or there's pending text in the queue
                    if self.is_system_speaking or not self.text_queue.empty():
                        self.immediate_interrupt()

                    text = self._recognize(audio)
                    if not text:
                        continue

                    if text.lower() in self.EXIT_COMMANDS:
                        self.stop_system()
                        break

                    self.logger.info(f"Recognized: {text}")

                    self.current_task_id += 1
                    self.text_queue.put((text, self.current_task_id))

                except sr.UnknownValueError:
                    continue
                except sr.RequestError as e:
                    self.logger.error(f"Could not request results: {str(e)}")
                    continue
                except Exception as e:
                    self.logger.error(f"Error in listening: {str(e)}")
                    continue

    def process_text(self):
        """Stream the model's reply for each queued request, sentence by sentence.

        Streaming stops as soon as an interrupt is set or the request's task
        id is no longer current.
        """
        while self.is_listening:
            try:
                text, task_id = self.text_queue.get(timeout=1)

                if self.interrupt_event.is_set() or task_id != self.current_task_id:
                    continue

                current_sentence = ""
                for chunk in self.model.stream(text):
                    if self.interrupt_event.is_set() or task_id != self.current_task_id:
                        break

                    current_sentence += chunk.content
                    # The capture group keeps the punctuation, so the list
                    # alternates: text, terminator, text, terminator, ..., rest
                    sentences = re.split(r'([.!?]+)', current_sentence)

                    while len(sentences) >= 2 and not self.interrupt_event.is_set():
                        sentence = sentences.pop(0) + sentences.pop(0)
                        if sentence.strip():
                            self.sentence_queue.put((sentence, task_id))

                    current_sentence = ''.join(sentences)

                # Flush the unterminated tail unless the request was cancelled
                if (current_sentence.strip()
                        and not self.interrupt_event.is_set()
                        and task_id == self.current_task_id):
                    self.sentence_queue.put((current_sentence, task_id))

            except queue.Empty:
                continue
            except Exception as e:
                self.logger.error(f"Error in processing: {str(e)}")

    def _collect_batches(self):
        """Move queued sentences into ``audio_queue`` as per-task text batches.

        Collection stops when the sentence queue is empty or the current
        batch has reached ``MAX_BATCH_WORDS`` words. The batch in progress is
        always queued, whichever way collection ended.
        """
        batch_text = ""
        batch_task_id = None

        try:
            while len(batch_text.split()) < self.MAX_BATCH_WORDS:  # Limit batch size
                sentence, task_id = self.sentence_queue.get_nowait()

                if batch_task_id is None:
                    batch_task_id = task_id

                if task_id != batch_task_id:
                    # If task ID changes, queue the current batch first
                    if batch_text.strip():
                        self.audio_queue.append((batch_text.strip(), batch_task_id))
                    batch_text = sentence
                    batch_task_id = task_id
                else:
                    batch_text += " " + sentence
        except queue.Empty:
            pass

        if batch_text.strip():
            self.audio_queue.append((batch_text.strip(), batch_task_id))

    def speak_responses(self):
        """Batch queued sentences and speak them until the system stops."""
        while self.is_listening:
            try:
                # Collect sentences for batch processing
                self._collect_batches()

                # Process audio queue
                while self.audio_queue and not self.interrupt_event.is_set():
                    text, task_id = self.audio_queue.popleft()
                    if not self.speak_text(text, task_id):
                        break

                time.sleep(0.1)

            except Exception as e:
                self.logger.error(f"Error in speaking: {str(e)}")

    def detect_language(self, text):
        """Return the language code detected for ``text``, or 'en' if detection fails."""
        try:
            return detect(text)
        except Exception:
            return 'en'

    def stop_system(self):
        """Stop all loops, interrupt current output and shut down the pygame mixer."""
        self.logger.info("Stopping system...")
        self.is_listening = False
        self.immediate_interrupt()
        pygame.mixer.quit()

    def start(self):
        """Start the worker threads and block until the system has stopped."""
        self.logger.info("Starting voice chat system...")
        threads = [
            threading.Thread(target=self.listen_continuously, name="ListenThread"),
            threading.Thread(target=self.process_text, name="ProcessThread"),
            threading.Thread(target=self.speak_responses, name="SpeakThread")
        ]

        for thread in threads:
            thread.start()

        try:
            while self.is_listening:
                time.sleep(0.1)
        except KeyboardInterrupt:
            self.logger.info("Keyboard interrupt detected, stopping...")
            self.stop_system()

        for thread in threads:
            thread.join()

        self.logger.info("System stopped")

def main():
    """Build a ``VoiceChatSystem`` on the ``llama3.2:1b`` model and start it."""
    VoiceChatSystem(model_name="llama3.2:1b").start()

# Launch the voice chat only when this code is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

pygame 2.6.0 (SDL 2.28.4, Python 3.12.6)
Hello from the pygame community. https://www.pygame.org/contribute.html


2024-11-10 14:51:01,712 - INFO - Starting voice chat system...
2024-11-10 14:51:01,726 - INFO - Adjusting for ambient noise...
2024-11-10 14:51:02,758 - INFO - Ready to listen!
2024-11-10 14:51:02,759 - INFO - Listening...
2024-11-10 14:51:05,946 - INFO - Recognized: hello
2024-11-10 14:51:05,948 - INFO - Listening...
2024-11-10 14:51:11,739 - INFO - Listening...
2024-11-10 14:51:15,098 - INFO - Recognized: tell me a story
2024-11-10 14:51:15,100 - INFO - Listening...
2024-11-10 14:51:20,296 - INFO - Recognized: tell me a story
2024-11-10 14:51:20,297 - INFO - Listening...
2024-11-10 14:51:26,181 - INFO - Recognized: tell me a story in Hindi
2024-11-10 14:51:26,181 - INFO - Listening...
2024-11-10 14:51:34,117 - INFO - Listening...
2024-11-10 14:51:52,576 - INFO - Recognized: tell me about AI
2024-11-10 14:51:52,578 - INFO - Listening...
2024-11-10 14:52:12,236 - INFO - Recognized: tell me about machine learning
2024-11-10 14:52:12,238 - INFO - Listening...
2024-11-10 14:52:17,059 - IN