In [12]:
from gtts import gTTS
import os

def commands_to_speech_files(commands, output_dir="assistant_audio", lang="en"):
    # Create output directory if not exists
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for i, (audio_type, command) in enumerate(commands, start=1):
        filename = os.path.join(output_dir, f"command_{i}_{audio_type.replace(' ', '_')}.mp3")
        tts = gTTS(text=command, lang=lang, slow=True)
        tts.save(filename)
        print(f"✅ Saved: {filename} ({audio_type})")

    print(f"\nAll {len(commands)} commands converted to speech and saved in '{output_dir}' folder.")

    # Optional: Play the first file to confirm audio
    try:
        os.system(f"start {os.path.join(output_dir, 'command_1_Clear_male_voice.mp3')}")  # Windows
    except:
        os.system(f"open {os.path.join(output_dir, 'command_1_Clear_male_voice.mp3')}")  # macOS / Linux


if __name__ == "__main__":
    # Each tuple = (Audio Type, Command)
    commands = [
        ("Clear male voice", "Turn on the living room lights."),
        ("Clear female voice", "Set the thermostat to 22 degrees Celsius."),
        ("Fast speech", "Play my evening chill playlist."),
        ("Noisy background", "Lock the front door."),
        ("Soft voice", "What's the weather like today?"),
        ("Normal voice", "Remind me to call mom at 6 PM."),
    ]

    commands_to_speech_files(commands)


✅ Saved: assistant_audio/command_1_Clear_male_voice.mp3 (Clear male voice)
✅ Saved: assistant_audio/command_2_Clear_female_voice.mp3 (Clear female voice)
✅ Saved: assistant_audio/command_3_Fast_speech.mp3 (Fast speech)
✅ Saved: assistant_audio/command_4_Noisy_background.mp3 (Noisy background)
✅ Saved: assistant_audio/command_5_Soft_voice.mp3 (Soft voice)
✅ Saved: assistant_audio/command_6_Normal_voice.mp3 (Normal voice)

All 6 commands converted to speech and saved in 'assistant_audio' folder.


sh: 1: start: not found


In [1]:
import os
import numpy as np
import librosa
from scipy.io.wavfile import write as wavwrite
import speech_recognition as sr
import whisper
from vosk import Model, KaldiRecognizer
import json

# Paths
audio_folder = "/mnt/a/MSAIM/trimister-5 msaiml/speech processing/Speech processing and recognisation/PersonalAssistance/assistant_audio"
vosk_model_path = os.path.join(audio_folder, "/mnt/a/MSAIM/trimister-5 msaiml/speech processing/Speech processing and recognisation/vosk/vosk-model-small-en-us-0.15")

# Initialize models
whisper_model = whisper.load_model("base")
vosk_model = Model(vosk_model_path)
recognizer = sr.Recognizer()

def mp3_to_wav_librosa(mp3_path, sr_target=16000):
    """Convert MP3 to WAV in memory and save temp file."""
    y, sr_orig = librosa.load(mp3_path, sr=sr_target)
    wav_path = mp3_path.replace(".mp3", "_temp.wav")
    wavwrite(wav_path, sr_target, (y * 32767).astype(np.int16))  # Convert float -> int16
    return wav_path

def transcribe_whisper(audio_path):
    print("Recognizing with Whisper...")
    try:
        result = whisper_model.transcribe(audio_path)
        text = result.get("text", "")
        print("Speech successfully converted to text!")
        return text
    except Exception as e:
        return f"Whisper error: {str(e)}"

def transcribe_vosk(audio_path):
    print("Recognizing with Vosk...")
    try:
        with sr.AudioFile(audio_path) as source:
            audio_data = recognizer.record(source)
        rec = KaldiRecognizer(vosk_model, 16000)
        rec.AcceptWaveform(audio_data.get_raw_data())
        result_json = json.loads(rec.Result())
        text = result_json.get("text", "")
        print("Speech successfully converted to text!")
        return text
    except Exception as e:
        return f"Vosk error: {str(e)}"

def transcribe_google(audio_path):
    print("Recognizing with Google Speech API...")
    try:
        with sr.AudioFile(audio_path) as source:
            audio_data = recognizer.record(source)
        text = recognizer.recognize_google(audio_data)
        print("Speech successfully converted to text!")
        return text
    except sr.UnknownValueError:
        return "Google Speech Recognition could not understand audio."
    except sr.RequestError as e:
        return f"Google API unavailable: {e}"

# Process all MP3 files
for file_name in os.listdir(audio_folder):
    if file_name.endswith(".mp3"):
        print(f"\nProcessing file: {file_name}")
        mp3_path = os.path.join(audio_folder, file_name)
        
        # Convert MP3 -> WAV without ffmpeg
        wav_path = mp3_to_wav_librosa(mp3_path)
        
        whisper_text = transcribe_whisper(mp3_path)  # Whisper can read MP3 directly
        vosk_text = transcribe_vosk(wav_path)
        google_text = transcribe_google(wav_path)
        
        print("\n--- Comparative Analysis ---")
        print(f"Whisper Output: {whisper_text}")
        print(f"Vosk Output: {vosk_text}")
        print(f"Google API Output: {google_text}")
        
        # Optional: delete temp WAV
        os.remove(wav_path)


LOG (VoskAPI:ReadDataFiles():model.cc:213) Decoding params beam=10 max-active=3000 lattice-beam=2
LOG (VoskAPI:ReadDataFiles():model.cc:216) Silence phones 1:2:3:4:5:6:7:8:9:10
LOG (VoskAPI:RemoveOrphanNodes():nnet-nnet.cc:948) Removed 0 orphan nodes.
LOG (VoskAPI:RemoveOrphanComponents():nnet-nnet.cc:847) Removing 0 orphan components.
LOG (VoskAPI:ReadDataFiles():model.cc:248) Loading i-vector extractor from /mnt/a/MSAIM/trimister-5 msaiml/speech processing/Speech processing and recognisation/vosk/vosk-model-small-en-us-0.15/ivector/final.ie
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:183) Computing derived variables for iVector extractor
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:204) Done.
LOG (VoskAPI:ReadDataFiles():model.cc:282) Loading HCL and G from /mnt/a/MSAIM/trimister-5 msaiml/speech processing/Speech processing and recognisation/vosk/vosk-model-small-en-us-0.15/graph/HCLr.fst /mnt/a/MSAIM/trimister-5 msaiml/speech processing/Speech processing and rec


Processing file: command_1_Clear_male_voice.mp3
Recognizing with Whisper...
Recognizing with Vosk...
Speech successfully converted to text!
Recognizing with Google Speech API...
Speech successfully converted to text!

--- Comparative Analysis ---
Whisper Output: Whisper error: Failed to load audio: ffmpeg: error while loading shared libraries: libiconv.so.2: cannot open shared object file: No such file or directory

Vosk Output: don't want the living room lights
Google API Output: turn on the living room lights

Processing file: command_2_Clear_female_voice.mp3
Recognizing with Whisper...
Recognizing with Vosk...
Speech successfully converted to text!
Recognizing with Google Speech API...
Speech successfully converted to text!

--- Comparative Analysis ---
Whisper Output: Whisper error: Failed to load audio: ffmpeg: error while loading shared libraries: libiconv.so.2: cannot open shared object file: No such file or directory

Vosk Output: set the thermostat to twenty two degrees celsi