In [None]:
#pip install sounddevice scipy openai-whisper bark

In [2]:
import torch
from transformers import AutoProcessor, BarkModel
import whisper
import sounddevice as sd
import scipy.io.wavfile as wavfile
import numpy as np
from IPython.display import clear_output
import time
import warnings
import os
warnings.filterwarnings('ignore')

class VoiceAssistant:
    def __init__(self, whisper_model_path="small.pt"):
        print("Initializing voice assistant... This might take a minute...")
        
        # Initialize Bark
        self.bark_processor = AutoProcessor.from_pretrained("suno/bark")
        self.bark_model = BarkModel.from_pretrained("suno/bark")
        self.voice_preset = "v2/en_speaker_6"
        
        # Initialize local Whisper model
        self.whisper_model = whisper.load_model(whisper_model_path)
        
        # Set device
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.bark_model.to(self.device)
        
        print("Initialization complete! Ready to chat!")
    
    def speak(self, text):
        """Convert text to speech using Bark"""
        inputs = self.bark_processor(text, voice_preset=self.voice_preset)
        for k, v in inputs.items():
            inputs[k] = v.to(self.device)
            
        audio_array = self.bark_model.generate(**inputs)
        audio_array = audio_array.cpu().numpy().squeeze()
        
        # Normalize audio
        audio_array = audio_array / np.max(np.abs(audio_array))
        
        # Play audio
        sd.play(audio_array, samplerate=24000)
        sd.wait()
    
    def listen(self, duration=5, sample_rate=16000):
        """Record audio and convert to text using local Whisper"""
        print("Listening...")
        
        # Record audio
        recording = sd.rec(int(duration * sample_rate), 
                         samplerate=sample_rate, 
                         channels=1)
        sd.wait()
        
        # Save recording temporarily
        wavfile.write("temp_recording.wav", sample_rate, recording)
        
        # Transcribe with local Whisper model
        result = self.whisper_model.transcribe("temp_recording.wav")
        transcription = result["text"]
        
        return transcription.strip().lower()
    
    def chat(self):
        """Main conversation loop"""
        self.speak("Hello! I'm your voice assistant. How can I help you today?")
        
        while True:
            try:
                # Listen to user input
                user_input = self.listen()
                print(f"You said: {user_input}")
                
                # Check for exit commands
                if user_input in ['goodbye', 'bye', 'exit', 'quit']:
                    self.speak("Goodbye! Have a great day!")
                    break
                
                # Generate response (you can make this more sophisticated)
                response = f"You said: {user_input}"
                print(f"Assistant: {response}")
                self.speak(response)
                
                clear_output(wait=True)
                
            except Exception as e:
                print(f"An error occurred: {str(e)}")
                break

# Usage example
if __name__ == "__main__":
    # Initialize with path to your local Whisper model
    model_path_whisper = os.path.expanduser("~/ml/models/whisper_models/small.pt")
    assistant = VoiceAssistant(whisper_model_path=model_path_whisper)
    assistant.chat()

Initializing voice assistant... This might take a minute...
Initialization complete! Ready to chat!


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


Listening...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You said: hi, how are you? what do you do?
Assistant: You said: hi, how are you? what do you do?


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


KeyboardInterrupt: 