In [None]:
#pip install sounddevice scipy openai-whisper bark

In [9]:
import sounddevice as sd
from scipy.io.wavfile import write
import whisper
from bark import SAMPLE_RATE, generate_audio
import numpy as np
import wave
import os
from bark.generation import preload_models

# Define model directories
model_dir_whisper = os.path.expanduser("~/ml/models/whisper_models")
model_dir_bark = os.path.expanduser("~/ml/models/bark_models")

def load_models():
    """Load Whisper and Bark models from local directories"""
    print("Loading models locally...")
    try:
        whisper_model = whisper.load_model("small", download_root=model_dir_whisper)
        return whisper_model
    except Exception as e:
        print(f"Error loading models: {e}")
        return None

def record_audio(filename="input_audio.wav", duration=5, samplerate=16000):
    """Record audio from microphone"""
    print("Listening... Speak now!")
    try:
        # Ensure the audio data is float32 for better compatibility
        audio_data = sd.rec(int(duration * samplerate), 
                          samplerate=samplerate, 
                          channels=1, 
                          dtype='float32')
        sd.wait()
        print("Recording finished.")
        
        # Convert float32 to int16 for WAV file
        audio_data_int = (audio_data * 32767).astype(np.int16)
        write(filename, samplerate, audio_data_int)
        return filename
    except Exception as e:
        print(f"Error recording audio: {e}")
        return None

def transcribe_audio(filename, whisper_model):
    """Transcribe audio using Whisper"""
    print("Transcribing...")
    try:
        result = whisper_model.transcribe(filename)
        return result["text"].strip()
    except Exception as e:
        print(f"Error transcribing audio: {e}")
        return ""

def speak_text(text, output_file="output_audio.wav"):
    """Generate speech using Bark"""
    print("Generating response...")
    try:
        # Generate audio using Bark
        audio_array = generate_audio(text)
        
        # Save the generated audio
        with wave.open(output_file, "wb") as wf:
            wf.setnchannels(1)  # Mono audio
            wf.setsampwidth(2)  # 16-bit audio
            wf.setframerate(SAMPLE_RATE)  # Use Bark's sample rate constant
            # Convert float32 to int16
            audio_int16 = (audio_array * 32767).astype(np.int16)
            wf.writeframes(audio_int16.tobytes())
        
        print(f"Response saved as {output_file}")
        return output_file
    except Exception as e:
        print(f"Error generating speech: {e}")
        return None

def play_audio(file_path):
    """Play audio file"""
    try:
        with wave.open(file_path, 'rb') as wf:
            rate = wf.getframerate()
            data = wf.readframes(wf.getnframes())
            audio_data = np.frombuffer(data, dtype=np.int16)
            sd.play(audio_data, samplerate=rate, blocking=True)
    except Exception as e:
        print(f"Error playing audio: {e}")

def conversational_assistant():
    """Main conversational loop"""
    print("Welcome to your fully local conversational assistant!")
    print("Initializing...")
    
    # Create model directories if they don't exist
    os.makedirs(model_dir_whisper, exist_ok=True)
    os.makedirs(model_dir_bark, exist_ok=True)
    
    # Load models
    whisper_model = load_models()
    if whisper_model is None:
        print("Failed to load models. Exiting...")
        return
    
    print("Ready! Press Ctrl+C to exit.")
    try:
        while True:
            # Record user input
            input_audio = record_audio(duration=5)
            if input_audio is None:
                continue
            
            # Transcribe audio
            user_input = transcribe_audio(input_audio, whisper_model)
            if not user_input:
                continue
            
            print(f"User said: {user_input}")
            
            # Check for exit condition
            if user_input.lower() in ["exit", "quit", "bye", "goodbye"]:
                print("Goodbye!")
                break
            
            # Generate and speak response
            response_text = f"You said: {user_input}. How can I assist you further?"
            response_audio = speak_text(response_text)
            if response_audio:
                play_audio(response_audio)
    
    except KeyboardInterrupt:
        print("\nExiting...")
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        print("Thank you for using the local voice assistant!")

if __name__ == "__main__":
    conversational_assistant()

Welcome to your fully local conversational assistant!
Initializing...
Loading models locally...
Ready! Press Ctrl+C to exit.
Listening... Speak now!
Recording finished.
Transcribing...
Error transcribing audio: [Errno 2] No such file or directory: 'ffmpeg'
Listening... Speak now!




Recording finished.
Transcribing...
Error transcribing audio: [Errno 2] No such file or directory: 'ffmpeg'
Listening... Speak now!




Recording finished.
Transcribing...
Error transcribing audio: [Errno 2] No such file or directory: 'ffmpeg'
Listening... Speak now!





Exiting...
Thank you for using the local voice assistant!
