In [None]:
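# Dependencies (run once). Bark is NOT the PyPI package named "bark"; install it from Suno's repository.
# %pip install sounddevice scipy numpy torch transformers openai-whisper git+https://github.com/suno-ai/bark.git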

In [None]:
import torch
from transformers import AutoProcessor, BarkModel
import whisper
import sounddevice as sd
import scipy.io.wavfile as wavfile
import numpy as np
from IPython.display import clear_output
import time
import warnings
warnings.filterwarnings('ignore')

class VoiceAssistant:
    """Voice echo assistant.

    Records microphone audio, transcribes it with a local openai-whisper
    checkpoint and speaks the reply with Bark (transformers implementation).
    """

    # Phrases that end the conversation; compared after trimming punctuation.
    EXIT_COMMANDS = ('goodbye', 'bye', 'exit', 'quit')
    # Characters removed from both ends of a transcription before matching.
    _TRIM_CHARS = " \t\r\n.,!?;:'\""
    # Sample rate (Hz) used when playing back Bark output.
    BARK_SAMPLE_RATE = 24000

    def __init__(self, whisper_model_path="small.pt"):
        """Load the Bark and Whisper models.

        Args:
            whisper_model_path: value handed to ``whisper.load_model`` (here a
                path to a local checkpoint file).
        """
        print("Initializing voice assistant... This might take a minute...")
        
        # Initialize Bark
        self.bark_processor = AutoProcessor.from_pretrained("suno/bark")
        self.bark_model = BarkModel.from_pretrained("suno/bark")
        self.voice_preset = "v2/en_speaker_6"
        
        # Initialize local Whisper model
        self.whisper_model = whisper.load_model(whisper_model_path)
        
        # Set device
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.bark_model.to(self.device)
        
        print("Initialization complete! Ready to chat!")

    @staticmethod
    def _normalize_audio(audio_array):
        """Scale audio so its peak magnitude is 1.0.

        Silent or empty audio is returned unchanged instead of being divided
        by zero (which would fill the buffer with NaN).
        """
        if audio_array.size == 0:
            return audio_array
        peak = np.max(np.abs(audio_array))
        if peak > 0:
            return audio_array / peak
        return audio_array

    @classmethod
    def _is_exit_command(cls, text):
        """Return True when ``text`` is an exit phrase.

        Whisper normally adds punctuation ("Goodbye."), so surrounding
        punctuation and whitespace are stripped before comparing.
        """
        return text.strip(cls._TRIM_CHARS).lower() in cls.EXIT_COMMANDS
    
    def speak(self, text):
        """Convert text to speech using Bark and play it (blocks until done)."""
        inputs = self.bark_processor(text, voice_preset=self.voice_preset)
        for k, v in inputs.items():
            inputs[k] = v.to(self.device)
            
        audio_array = self.bark_model.generate(**inputs)
        audio_array = audio_array.cpu().numpy().squeeze()
        
        # Normalize audio (safe for silent output)
        audio_array = self._normalize_audio(audio_array)
        
        # Play audio
        sd.play(audio_array, samplerate=self.BARK_SAMPLE_RATE)
        sd.wait()
    
    def listen(self, duration=5, sample_rate=16000):
        """Record audio and convert to text using local Whisper.

        Args:
            duration: recording length in seconds.
            sample_rate: recording sample rate in Hz.

        Returns:
            The transcription, stripped of surrounding whitespace and lower-cased.
        """
        print("Listening...")
        
        # Record audio
        recording = sd.rec(int(duration * sample_rate), 
                         samplerate=sample_rate, 
                         channels=1)
        sd.wait()
        
        # Save recording temporarily
        wavfile.write("temp_recording.wav", sample_rate, recording)
        
        # Transcribe with local Whisper model
        result = self.whisper_model.transcribe("temp_recording.wav")
        transcription = result["text"]
        
        return transcription.strip().lower()
    
    def chat(self):
        """Main conversation loop: greet, then echo each utterance until an
        exit phrase is heard or an error occurs."""
        self.speak("Hello! I'm your voice assistant. How can I help you today?")
        
        while True:
            try:
                # Listen to user input
                user_input = self.listen()
                print(f"You said: {user_input}")
                
                # Check for exit commands (punctuation-insensitive)
                if self._is_exit_command(user_input):
                    self.speak("Goodbye! Have a great day!")
                    break
                
                # Generate response (you can make this more sophisticated)
                response = f"You said: {user_input}"
                print(f"Assistant: {response}")
                self.speak(response)
                
                clear_output(wait=True)
                
            except Exception as e:
                print(f"An error occurred: {str(e)}")
                break

# Usage example
if __name__ == "__main__":
    # This cell's import list does not include `os`; import it here so the cell
    # runs on a fresh kernel instead of relying on another cell having run first.
    import os

    # Initialize with path to your local Whisper model
    model_path_whisper = os.path.expanduser("~/ml/models/whisper_models/small.pt")
    assistant = VoiceAssistant(whisper_model_path=model_path_whisper)
    assistant.chat()

Initializing voice assistant... This might take a minute...
Initialization complete! Ready to chat!


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


In [2]:
import torch
from transformers import AutoProcessor, BarkModel, WhisperProcessor, WhisperForConditionalGeneration
import sounddevice as sd
import scipy.io.wavfile as wavfile
import numpy as np
from IPython.display import clear_output
import time
import wave
import warnings
warnings.filterwarnings('ignore')

class VoiceAssistant:
    """Voice echo assistant built on Hugging Face transformers.

    Records microphone audio, transcribes it with ``openai/whisper-base`` and
    speaks the reply with ``suno/bark``.
    """

    # Phrases that end the conversation; compared after trimming punctuation.
    EXIT_COMMANDS = ('goodbye', 'bye', 'exit', 'quit')
    # Characters removed from both ends of a transcription before matching.
    _TRIM_CHARS = " \t\r\n.,!?;:'\""
    # Sample rate (Hz) used when playing back Bark output.
    BARK_SAMPLE_RATE = 24000

    def __init__(self):
        """Load the Bark and Whisper models and move them to the best device."""
        print("Initializing voice assistant... This might take a minute...")
        
        # Initialize Bark
        self.bark_processor = AutoProcessor.from_pretrained("suno/bark")
        self.bark_model = BarkModel.from_pretrained("suno/bark")
        self.voice_preset = "v2/en_speaker_6"
        
        # Initialize Whisper
        self.whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-base")
        self.whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
        
        # Set device
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.whisper_model.to(self.device)
        self.bark_model.to(self.device)
        
        print("Initialization complete! Ready to chat!")

    @staticmethod
    def _normalize_audio(audio_array):
        """Scale audio so its peak magnitude is 1.0.

        Silent or empty audio is returned unchanged instead of being divided
        by zero (which would fill the buffer with NaN).
        """
        if audio_array.size == 0:
            return audio_array
        peak = np.max(np.abs(audio_array))
        if peak > 0:
            return audio_array / peak
        return audio_array

    @staticmethod
    def _prepare_waveform(recording):
        """Flatten a ``(frames, 1)`` recording into a 1-D float32 waveform."""
        return np.asarray(recording, dtype=np.float32).reshape(-1)

    @classmethod
    def _is_exit_command(cls, text):
        """Return True when ``text`` is an exit phrase.

        Whisper normally adds punctuation ("Goodbye."), so surrounding
        punctuation and whitespace are stripped before comparing.
        """
        return text.strip(cls._TRIM_CHARS).lower() in cls.EXIT_COMMANDS
    
    def speak(self, text):
        """Convert text to speech using Bark and play it (blocks until done)."""
        inputs = self.bark_processor(text, voice_preset=self.voice_preset)
        for k, v in inputs.items():
            inputs[k] = v.to(self.device)
            
        audio_array = self.bark_model.generate(**inputs)
        audio_array = audio_array.cpu().numpy().squeeze()
        
        # Normalize audio (safe for silent output)
        audio_array = self._normalize_audio(audio_array)
        
        # Play audio
        sd.play(audio_array, samplerate=self.BARK_SAMPLE_RATE)
        sd.wait()
    
    def listen(self, duration=5, sample_rate=16000):
        """Record audio and convert to text using Whisper.

        The float32 recording is passed to the Whisper processor directly.
        The previous WAV round-trip wrote a float WAV (sounddevice records
        float32 by default) that the ``wave`` module cannot open, and then
        reinterpreted the bytes as int16.

        Args:
            duration: recording length in seconds.
            sample_rate: recording sample rate in Hz; also passed to the
                processor as ``sampling_rate``.
                NOTE(review): Whisper checkpoints generally expect 16 kHz
                input; confirm before changing.

        Returns:
            The transcription, stripped of surrounding whitespace and lower-cased.
        """
        print("Listening...")
        
        # Record audio as float32 in [-1, 1], the format the processor consumes
        recording = sd.rec(int(duration * sample_rate), 
                         samplerate=sample_rate, 
                         channels=1,
                         dtype='float32')
        sd.wait()
        
        # (frames, 1) -> (frames,)
        audio_array = self._prepare_waveform(recording)
        
        # Process with Whisper
        input_features = self.whisper_processor(
            audio_array, 
            sampling_rate=sample_rate, 
            return_tensors="pt"
        ).input_features.to(self.device)
        
        # Generate token ids
        predicted_ids = self.whisper_model.generate(input_features)
        
        # Decode token ids to text
        transcription = self.whisper_processor.batch_decode(
            predicted_ids, 
            skip_special_tokens=True
        )[0]
        
        return transcription.strip().lower()
    
    def chat(self):
        """Main conversation loop: greet, then echo each utterance until an
        exit phrase is heard or an error occurs."""
        self.speak("Hello! I'm your voice assistant. How can I help you today?")
        
        while True:
            try:
                # Listen to user input
                user_input = self.listen()
                print(f"You said: {user_input}")
                
                # Check for exit commands (punctuation-insensitive)
                if self._is_exit_command(user_input):
                    self.speak("Goodbye! Have a great day!")
                    break
                
                # Generate response (you can make this more sophisticated)
                response = f"You said: {user_input}"
                print(f"Assistant: {response}")
                self.speak(response)
                
                clear_output(wait=True)
                
            except Exception as e:
                print(f"An error occurred: {str(e)}")
                break

# Usage example: build the assistant (this loads both models) and start the
# blocking conversation loop when the code runs as the main program.
if __name__ == "__main__":
    assistant = VoiceAssistant()
    assistant.chat()

Initializing voice assistant... This might take a minute...


OSError: /Users/hissain/ml/models/whisper_models does not appear to have a file named preprocessor_config.json. Checkout 'https://huggingface.co//Users/hissain/ml/models/whisper_models/tree/main' for available files.

In [1]:
import sounddevice as sd
from scipy.io.wavfile import write
import whisper
from bark import SAMPLE_RATE, generate_audio
import numpy as np
import wave
import os
from bark.generation import preload_models

# Local model directories ("~" is expanded to the current user's home).
# model_dir_whisper is passed to whisper.load_model as its download_root.
model_dir_whisper = os.path.expanduser("~/ml/models/whisper_models")
# NOTE(review): in the code visible here this directory is only created, never
# handed to Bark; confirm where Bark actually stores its weights.
model_dir_bark = os.path.expanduser("~/ml/models/bark_models")

def load_models():
    """Load the Whisper "small" speech-recognition model.

    ``model_dir_whisper`` is used as the download root for the checkpoint.

    Returns:
        The loaded Whisper model, or ``None`` when loading raised an exception
        (the error is printed instead of propagated).
    """
    print("Loading models locally...")
    loaded_model = None
    try:
        loaded_model = whisper.load_model("small", download_root=model_dir_whisper)
    except Exception as load_error:
        print(f"Error loading models: {load_error}")
    return loaded_model

def record_audio(filename="input_audio.wav", duration=5, samplerate=16000):
    """Record mono microphone audio and store it as a 16-bit WAV file.

    Args:
        filename: path of the WAV file to write.
        duration: recording length in seconds.
        samplerate: sample rate in Hz.

    Returns:
        ``filename`` on success, or ``None`` if recording or writing failed
        (the error is printed instead of propagated).
    """
    print("Listening... Speak now!")
    try:
        frame_count = int(duration * samplerate)
        # Capture as float32, then scale down to int16 for the WAV container.
        captured = sd.rec(frame_count,
                          samplerate=samplerate,
                          channels=1,
                          dtype='float32')
        sd.wait()
        print("Recording finished.")

        pcm_samples = (captured * 32767).astype(np.int16)
        write(filename, samplerate, pcm_samples)
    except Exception as record_error:
        print(f"Error recording audio: {record_error}")
        return None
    return filename

def transcribe_audio(filename, whisper_model):
    """Transcribe an audio file with the given Whisper model.

    Args:
        filename: path of the audio file to transcribe.
        whisper_model: object exposing ``transcribe(filename)`` that returns a
            mapping with a ``"text"`` entry.

    Returns:
        The transcription with surrounding whitespace removed, or ``""`` if
        transcription raised an exception (the error is printed).
    """
    print("Transcribing...")
    try:
        transcript = whisper_model.transcribe(filename)["text"]
        return transcript.strip()
    except Exception as transcribe_error:
        print(f"Error transcribing audio: {transcribe_error}")
    return ""

def speak_text(text, output_file="output_audio.wav"):
    """Synthesize ``text`` with Bark and save it as a mono 16-bit WAV file.

    Args:
        text: the sentence to speak.
        output_file: path of the WAV file to write.

    Returns:
        ``output_file`` on success, or ``None`` if generation or writing
        failed (the error is printed instead of propagated).
    """
    print("Generating response...")
    try:
        audio_array = generate_audio(text)

        # Clip to [-1, 1] before scaling: out-of-range floats would otherwise
        # wrap around when cast to int16 and produce loud clicks.
        # Converting before opening the file also avoids leaving a
        # header-only WAV behind when the conversion itself fails.
        audio_int16 = (np.clip(audio_array, -1.0, 1.0) * 32767).astype(np.int16)

        with wave.open(output_file, "wb") as wf:
            wf.setnchannels(1)  # Mono audio
            wf.setsampwidth(2)  # 16-bit audio
            wf.setframerate(SAMPLE_RATE)  # Use Bark's sample rate constant
            wf.writeframes(audio_int16.tobytes())

        print(f"Response saved as {output_file}")
        return output_file
    except Exception as e:
        print(f"Error generating speech: {e}")
        return None

def play_audio(file_path):
    """Play a 16-bit PCM WAV file and block until playback finishes.

    Mono files are played as a 1-D array. Multi-channel files are reshaped to
    ``(frames, channels)`` so the interleaved samples are not played back as
    one mono stream. Files with another sample width are rejected instead of
    being misread as int16.

    Args:
        file_path: path of the WAV file to play.

    Errors are printed instead of propagated; nothing is returned.
    """
    try:
        with wave.open(file_path, 'rb') as wf:
            rate = wf.getframerate()
            channels = wf.getnchannels()
            sample_width = wf.getsampwidth()
            data = wf.readframes(wf.getnframes())

        if sample_width != 2:
            raise ValueError(
                f"unsupported sample width: {sample_width} bytes (expected 2)"
            )

        audio_data = np.frombuffer(data, dtype=np.int16)
        if channels > 1:
            # WAV frames are interleaved: one sample per channel per frame.
            audio_data = audio_data.reshape(-1, channels)

        sd.play(audio_data, samplerate=rate, blocking=True)
    except Exception as e:
        print(f"Error playing audio: {e}")

def conversational_assistant():
    """Run the record -> transcribe -> reply loop until the user exits.

    Each turn records five seconds of audio, transcribes it, and speaks an
    echo reply. The loop ends when:

    * the transcription is an exit phrase ("exit", "quit", "bye", "goodbye"),
      compared case-insensitively after trimming surrounding punctuation,
      because Whisper usually transcribes "Goodbye." with the period;
    * recording fails several times in a row (for example no input device),
      which would otherwise spin forever printing the same error;
    * the user presses Ctrl+C, or an unexpected exception is raised.
    """
    print("Welcome to your fully local conversational assistant!")
    print("Initializing...")
    
    os.makedirs(model_dir_whisper, exist_ok=True)
    os.makedirs(model_dir_bark, exist_ok=True)
    
    # Load models
    whisper_model = load_models()
    if whisper_model is None:
        print("Failed to load models. Exiting...")
        return

    exit_commands = {"exit", "quit", "bye", "goodbye"}
    trim_chars = " \t\r\n.,!?;:'\""
    max_record_failures = 3
    record_failures = 0
    
    print("Ready! Press Ctrl+C to exit.")
    try:
        while True:
            # Record user input
            input_audio = record_audio(duration=5)
            if input_audio is None:
                record_failures += 1
                if record_failures >= max_record_failures:
                    print("Recording failed repeatedly. Exiting...")
                    break
                continue
            record_failures = 0
            
            # Transcribe audio; silence or a failed transcription skips the turn
            user_input = transcribe_audio(input_audio, whisper_model)
            if not user_input:
                continue
            
            print(f"User said: {user_input}")
            
            # Ignore surrounding punctuation so "Goodbye." still exits
            if user_input.strip(trim_chars).lower() in exit_commands:
                print("Goodbye!")
                break
            
            # Generate and speak response
            response_text = f"You said: {user_input}. How can I assist you further?"
            response_audio = speak_text(response_text)
            if response_audio:
                play_audio(response_audio)
    
    except KeyboardInterrupt:
        print("\nExiting...")
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        print("Thank you for using the local voice assistant!")

# Start the assistant when this code is executed as the main program.
if __name__ == "__main__":
    conversational_assistant()

Welcome to your fully local conversational assistant!
Initializing...
Loading models locally...
Ready! Press Ctrl+C to exit.
Listening... Speak now!
Recording finished.
Transcribing...


No GPU being used. Careful, inference might be very slow!


User said: Hello. How are you?
Generating response...


text_2.pt:   0%|          | 0.00/5.35G [00:00<?, ?B/s]


Exiting...
Thank you for using the local voice assistant!
