In [None]:
# Install libraries for both offline (Whisper) and online (Google) recognition
!pip install -q git+https://github.com/openai/whisper.git
!pip install -q SpeechRecognition

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m55.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import whisper
import speech_recognition as sr
import warnings

# Whisper is noisy: mute every warning category from here on
warnings.filterwarnings("ignore")

audio_file_path = "/content/lab3sample.wav"

# --- Offline recognition with OpenAI's Whisper ---
print("Starting Offline Recognition with Whisper")
try:
    # Progress feedback for the user
    print("Recognizing with Whisper...")

    # "base" checkpoint (good balance of speed and accuracy)
    model = whisper.load_model("base")

    # Run the transcription on the audio file
    result = model.transcribe(audio_file_path, fp16=False)

    # Show the transcript with surrounding whitespace removed
    recognized_text = result["text"].strip()
    print(f"Speech recognized: '{recognized_text}'")

    # Confirm success
    print("Speech successfully converted to text!")

except Exception as err:
    # Any failure while loading the model or transcribing ends up here
    print(f"An error occurred with Whisper: {err}")

# Separator for clarity
print("\n" + "=" * 50 + "\n")

Starting Offline Recognition with Whisper
Recognizing with Whisper...


100%|███████████████████████████████████████| 139M/139M [00:03<00:00, 41.6MiB/s]


Speech recognized: 'I believe you're just talking nonsense.'
Speech successfully converted to text!




In [None]:
# Online Recognition with Google Speech API
print("--- Starting Online Recognition with Google Speech API ---")

# Initialize the recognizer
r = sr.Recognizer()

try:
    # Use the audio file as the source. The file is only needed while its
    # contents are read into memory, so it is closed before the network call.
    with sr.AudioFile(audio_file_path) as source:
        # Feedback: Let the user know what to do (though we are using a file)
        print("Speak something... (using audio file as source)")

        # Read the audio data from the file
        audio_data = r.record(source)

    # Feedback: Let the user know recognition is in progress
    print("Recognizing with Google API...")

    # Convert speech to text using Google's free Web Speech API
    recognized_text = r.recognize_google(audio_data)

    # Display the recognized text
    print(f"Speech recognized: '{recognized_text}'")

    # Display success message
    print("Speech successfully converted to text!")

except sr.UnknownValueError:
    # Handle unclear speech
    print("Speech Recognition could not understand audio. Please try speaking more clearly.")
except sr.RequestError as e:
    # Handle service unavailability
    print(f"Could not request results from Google Speech Recognition service; {e}")
except (OSError, ValueError) as e:
    # sr.AudioFile raises ValueError for audio it cannot parse; a missing file
    # surfaces as OSError. Report it instead of ending in an unhandled traceback.
    print(f"Could not read audio file '{audio_file_path}': {e}")

--- Starting Online Recognition with Google Speech API ---
Speak something... (using audio file as source)
Recognizing with Google API...
Speech recognized: 'I believe you are just talking nonsense'
Speech successfully converted to text!


In [None]:
!pip install -q git+https://github.com/openai/whisper.git SpeechRecognition vosk

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m58.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for srt (setup.py) ... [?25l[?25hdone


Female Voice

In [None]:
import whisper
import speech_recognition as sr
import warnings
import os
import vosk
import wave
import json

audio_file_path = "/content/voice.wav"

# --- Offline transcription with Whisper ---
print("Transcribing with Whisper (Offline Model)")
try:
    # 'base' Whisper checkpoint
    model = whisper.load_model("base")

    # Run the transcription on the audio file
    result = model.transcribe(audio_file_path, fp16=False)

    # Show the transcript with surrounding whitespace removed
    whisper_text = result["text"].strip()
    print(f"Whisper Output: '{whisper_text}'")

except Exception as err:
    # Any failure while loading the model or transcribing ends up here
    print(f"An error occurred with Whisper: {err}")

# Visual separator
print("\n" + "=" * 50 + "\n")

# ONLINE WITH GOOGLE SPEECH API
print(" Transcribing with Google Speech API (Online Model) ")

# Initialize the recognizer
r = sr.Recognizer()

try:
    # Read the whole file into memory, then close it before the network call
    with sr.AudioFile(audio_file_path) as source:
        audio_data = r.record(source)

    # Send audio to Google for transcription
    google_text = r.recognize_google(audio_data)

    # Print the recognized text
    print(f"Google API Output: '{google_text}'")

except sr.UnknownValueError:
    # This error happens if the API can't understand the audio
    print("Google Speech Recognition could not understand the audio.")
except sr.RequestError as e:
    # This error happens if there's a problem with the network or the API service
    print(f"Could not request results from Google service; {e}")
except (OSError, ValueError) as e:
    # sr.AudioFile raises ValueError for audio it cannot parse; a missing file
    # surfaces as OSError. Report it so the Vosk section below still runs.
    print(f"Could not read audio file '{audio_file_path}': {e}")

# OFFLINE WITH VOSK
print(" Transcribing with Vosk (Offline Model) ")
try:
    # Check for and download the Vosk model
    model_name = "vosk-model-small-en-us-0.15"
    model_path = model_name
    if not os.path.exists(model_path):
        print(f"Vosk model not found. Downloading '{model_name}'...")
        # Using shell commands which are simple and effective in Colab
        !wget -q https://alphacephei.com/vosk/models/{model_name}.zip
        !unzip -q {model_name}.zip
        print("Model downloaded and unzipped successfully.")

    # Load the Vosk model
    vosk_model = vosk.Model(model_path)

    # Open the audio file in the format Vosk requires
    wf = wave.open(audio_file_path, "rb")
    rec = vosk.KaldiRecognizer(vosk_model, wf.getframerate())
    rec.SetWords(True)

    # Process the audio data in chunks
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        rec.AcceptWaveform(data)

    # Get the final result and parse the JSON output
    result_json = rec.FinalResult()
    result_dict = json.loads(result_json)
    vosk_text = result_dict['text']

    print(f"Vosk Output: '{vosk_text}'")

except Exception as e:
    print(f"An error occurred with Vosk: {e}")

Transcribing with Whisper (Offline Model)
Whisper Output: 'Hi there. How are you doing? Take care. Be happy. Bye.'


 Transcribing with Google Speech API (Online Model) 
Google API Output: 'hi there how are you doing take care be happy bye'
 Transcribing with Vosk (Offline Model) 
Vosk model not found. Downloading 'vosk-model-small-en-us-0.15'...
Model downloaded and unzipped successfully.
Vosk Output: 'on him home one to to be oh boy'


Male Voice

In [None]:
!pip install -q git+https://github.com/openai/whisper.git SpeechRecognition vosk pydub
!apt-get install -y -qq ffmpeg

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
import whisper
import speech_recognition as sr
import vosk
import warnings
import os
import wave
import json
from pydub import AudioSegment


original_audio_path = "/content/male Voice.wav"
converted_audio_path = "converted_audio_standard.wav"

# AUDIO CONVERSION STEP (IMPROVED)
# Normalise the recording to mono / 16 kHz / 16-bit PCM WAV. The sample width
# is set explicitly because the Vosk section reads raw frames with the wave
# module and needs 16-bit samples; changing only channels and rate would leave
# an 8-, 24- or 32-bit source at its original width.
print("--- Converting audio file to a standard format for compatibility ---")
try:
    sound = AudioSegment.from_file(original_audio_path)

    sound = sound.set_channels(1) # Mono
    sound = sound.set_frame_rate(16000) # 16kHz sample rate
    sound = sound.set_sample_width(2) # 2 bytes per sample = 16-bit
    sound.export(converted_audio_path, format="wav")
    print("Conversion successful.")
except Exception as e:
    print(f"Audio conversion failed: {e}")
    # Fall back to the untouched recording so the later sections still have a path
    converted_audio_path = original_audio_path

print("\n" + "="*50 + "\n")

# 1) Offline transcription with Whisper, run on the original (unconverted) file
print("--- 1. Transcribing with Whisper (Offline Model) ---")
try:
    # "base" checkpoint; fp16=False is passed through to transcribe()
    model = whisper.load_model("base")
    result = model.transcribe(original_audio_path, fp16=False)

    # Show the transcript with surrounding whitespace removed
    whisper_text = result["text"].strip()
    print(f"Whisper Output: '{whisper_text}'")
except Exception as err:
    print(f"An error occurred with Whisper: {err}")

# Visual separator between recognizers
print("\n" + "=" * 50 + "\n")

#  ONLINE WITH GOOGLE SPEECH API
print("--- 2. Transcribing with Google Speech API (Online Model) ---")
r = sr.Recognizer()
try:
    # Read the whole file into memory, then close it before the network call
    with sr.AudioFile(converted_audio_path) as source:
        audio_data = r.record(source)

    # Send the audio to Google's Web Speech API
    google_text = r.recognize_google(audio_data)
    print(f"Google API Output: '{google_text}'")
except sr.UnknownValueError:
    print("Google Speech Recognition could not understand the audio.")
except sr.RequestError as e:
    print(f"Could not request results from Google service; {e}")
except (OSError, ValueError) as e:
    # sr.AudioFile raises ValueError for audio it cannot parse (possible when
    # conversion failed and the path fell back to the original file); a missing
    # file surfaces as OSError. Report it so the Vosk section below still runs.
    print(f"Could not read audio file '{converted_audio_path}': {e}")

print("\n" + "="*50 + "\n")

#  OFFLINE WITH VOSK
print("--- 3. Transcribing with Vosk (Offline Model) ---")
try:
    model_name = "vosk-model-small-en-us-0.15"
    model_path = model_name
    if not os.path.exists(model_path):
        print(f"Vosk model not found. Downloading '{model_name}'...")
        !wget -q https://alphacephei.com/vosk/models/{model_name}.zip
        !unzip -q {model_name}.zip
        print("Model downloaded and unzipped successfully.")

    vosk_model = vosk.Model(model_path)
    # Use the new, standardized audio file
    wf = wave.open(converted_audio_path, "rb")
    rec = vosk.KaldiRecognizer(vosk_model, wf.getframerate())
    rec.SetWords(True)

    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        rec.AcceptWaveform(data)

    result_json = rec.FinalResult()
    result_dict = json.loads(result_json)
    vosk_text = result_dict['text']
    print(f"Vosk Output: '{vosk_text}'")

except Exception as e:
    print(f"An error occurred with Vosk: {e}")

--- Converting audio file to a standard format for compatibility ---
Conversion successful.


--- 1. Transcribing with Whisper (Offline Model) ---
Whisper Output: 'Hi, hello, can I talk with Manasa? Is everything okay? Have a nice day, take care.'


--- 2. Transcribing with Google Speech API (Online Model) ---
Google API Output: 'hi hello can I talk with Mansa is everything okay have a nice day take care'


--- 3. Transcribing with Vosk (Offline Model) ---
Vosk Output: 'hi hello can it don't commit an answer is everything okay i'm in a day daycare'


Fast Speech

In [None]:
!pip install -q git+https://github.com/openai/whisper.git SpeechRecognition vosk pydub
!apt-get install -y -qq ffmpeg

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
import whisper
import speech_recognition as sr
import vosk
import warnings
import os
import wave
import json
from pydub import AudioSegment


original_audio_path = "/content/fast Voice.wav"
converted_audio_path = "converted_audio_standard.wav"

# --- AUDIO CONVERSION STEP
# Normalise the recording to mono / 16 kHz / 16-bit PCM WAV. The sample width
# is set explicitly because the Vosk section reads raw frames with the wave
# module and needs 16-bit samples; changing only channels and rate would leave
# an 8-, 24- or 32-bit source at its original width.
print("--- Converting audio file to a standard format for compatibility ---")
try:
    sound = AudioSegment.from_file(original_audio_path)

    sound = sound.set_channels(1) # Mono
    sound = sound.set_frame_rate(16000) # 16kHz sample rate
    sound = sound.set_sample_width(2) # 2 bytes per sample = 16-bit
    sound.export(converted_audio_path, format="wav")
    print("Conversion successful.")
except Exception as e:
    print(f"Audio conversion failed: {e}")
    # Fall back to the untouched recording so the later sections still have a path
    converted_audio_path = original_audio_path

print("\n" + "="*50 + "\n")

# 1) Offline transcription with Whisper, run on the original (unconverted) file
print("--- 1. Transcribing with Whisper (Offline Model) ---")
try:
    # "base" checkpoint; fp16=False is passed through to transcribe()
    model = whisper.load_model("base")
    result = model.transcribe(original_audio_path, fp16=False)

    # Show the transcript with surrounding whitespace removed
    whisper_text = result["text"].strip()
    print(f"Whisper Output: '{whisper_text}'")
except Exception as err:
    print(f"An error occurred with Whisper: {err}")

# Visual separator between recognizers
print("\n" + "=" * 50 + "\n")

# ONLINE WITH GOOGLE SPEECH API
print("--- 2. Transcribing with Google Speech API (Online Model) ---")
r = sr.Recognizer()
try:
    # Read the whole file into memory, then close it before the network call
    with sr.AudioFile(converted_audio_path) as source:
        audio_data = r.record(source)

    # Send the audio to Google's Web Speech API
    google_text = r.recognize_google(audio_data)
    print(f"Google API Output: '{google_text}'")
except sr.UnknownValueError:
    print("Google Speech Recognition could not understand the audio.")
except sr.RequestError as e:
    print(f"Could not request results from Google service; {e}")
except (OSError, ValueError) as e:
    # sr.AudioFile raises ValueError for audio it cannot parse (possible when
    # conversion failed and the path fell back to the original file); a missing
    # file surfaces as OSError. Report it so the Vosk section below still runs.
    print(f"Could not read audio file '{converted_audio_path}': {e}")

print("\n" + "="*50 + "\n")

#  OFFLINE WITH VOSK
print("--- 3. Transcribing with Vosk (Offline Model) ---")
try:
    model_name = "vosk-model-small-en-us-0.15"
    model_path = model_name
    if not os.path.exists(model_path):
        print(f"Vosk model not found. Downloading '{model_name}'...")
        !wget -q https://alphacephei.com/vosk/models/{model_name}.zip
        !unzip -q {model_name}.zip
        print("Model downloaded and unzipped successfully.")

    vosk_model = vosk.Model(model_path)

    wf = wave.open(converted_audio_path, "rb")
    rec = vosk.KaldiRecognizer(vosk_model, wf.getframerate())
    rec.SetWords(True)

    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        rec.AcceptWaveform(data)

    result_json = rec.FinalResult()
    result_dict = json.loads(result_json)
    vosk_text = result_dict['text']
    print(f"Vosk Output: '{vosk_text}'")

except Exception as e:
    print(f"An error occurred with Vosk: {e}")

--- Converting audio file to a standard format for compatibility ---
Conversion successful.


--- 1. Transcribing with Whisper (Offline Model) ---
Whisper Output: 'Hi everyone how are you doing? Take care. Bye.'


--- 2. Transcribing with Google Speech API (Online Model) ---
Google API Output: 'hi everyone how are you how are you doing take care bye'


--- 3. Transcribing with Vosk (Offline Model) ---
Vosk Output: 'tyree when how are you are you doing they can buy'


Background Noise

In [None]:
!pip install -q git+https://github.com/openai/whisper.git SpeechRecognition vosk pydub
!apt-get install -y -qq ffmpeg

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
import whisper
import speech_recognition as sr
import vosk
import warnings
import os
import wave
import json
from pydub import AudioSegment


original_audio_path = "/content/Noisy Voice.wav"
converted_audio_path = "converted_audio_standard.wav"

#  AUDIO CONVERSION STEP (IMPROVED)
# Normalise the recording to mono / 16 kHz / 16-bit PCM WAV. The sample width
# is set explicitly because the Vosk section reads raw frames with the wave
# module and needs 16-bit samples; changing only channels and rate would leave
# an 8-, 24- or 32-bit source at its original width.
print("--- Converting audio file to a standard format for compatibility ---")
try:
    sound = AudioSegment.from_file(original_audio_path)

    sound = sound.set_channels(1) # Mono
    sound = sound.set_frame_rate(16000) # 16kHz sample rate
    sound = sound.set_sample_width(2) # 2 bytes per sample = 16-bit
    sound.export(converted_audio_path, format="wav")
    print("Conversion successful.")
except Exception as e:
    print(f"Audio conversion failed: {e}")
    # Fall back to the untouched recording so the later sections still have a path
    converted_audio_path = original_audio_path

print("\n" + "="*50 + "\n")

# 1) Offline transcription with Whisper, run on the original (unconverted) file
print("--- 1. Transcribing with Whisper (Offline Model) ---")
try:
    # "base" checkpoint; fp16=False is passed through to transcribe()
    model = whisper.load_model("base")
    result = model.transcribe(original_audio_path, fp16=False)

    # Show the transcript with surrounding whitespace removed
    whisper_text = result["text"].strip()
    print(f"Whisper Output: '{whisper_text}'")
except Exception as err:
    print(f"An error occurred with Whisper: {err}")

# Visual separator between recognizers
print("\n" + "=" * 50 + "\n")

#  ONLINE WITH GOOGLE SPEECH API
print("--- 2. Transcribing with Google Speech API (Online Model) ---")
r = sr.Recognizer()
try:
    # Read the whole file into memory, then close it before the network call
    with sr.AudioFile(converted_audio_path) as source:
        audio_data = r.record(source)

    # Send the audio to Google's Web Speech API
    google_text = r.recognize_google(audio_data)
    print(f"Google API Output: '{google_text}'")
except sr.UnknownValueError:
    print("Google Speech Recognition could not understand the audio.")
except sr.RequestError as e:
    print(f"Could not request results from Google service; {e}")
except (OSError, ValueError) as e:
    # sr.AudioFile raises ValueError for audio it cannot parse (possible when
    # conversion failed and the path fell back to the original file); a missing
    # file surfaces as OSError. Report it so the Vosk section below still runs.
    print(f"Could not read audio file '{converted_audio_path}': {e}")

print("\n" + "="*50 + "\n")

#  OFFLINE WITH VOSK
print("--- 3. Transcribing with Vosk (Offline Model) ---")
try:
    model_name = "vosk-model-small-en-us-0.15"
    model_path = model_name
    if not os.path.exists(model_path):
        print(f"Vosk model not found. Downloading '{model_name}'...")
        !wget -q https://alphacephei.com/vosk/models/{model_name}.zip
        !unzip -q {model_name}.zip
        print("Model downloaded and unzipped successfully.")

    vosk_model = vosk.Model(model_path)
    # Use the new, standardized audio file
    wf = wave.open(converted_audio_path, "rb")
    rec = vosk.KaldiRecognizer(vosk_model, wf.getframerate())
    rec.SetWords(True)

    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        rec.AcceptWaveform(data)

    result_json = rec.FinalResult()
    result_dict = json.loads(result_json)
    vosk_text = result_dict['text']
    print(f"Vosk Output: '{vosk_text}'")

except Exception as e:
    print(f"An error occurred with Vosk: {e}")

--- Converting audio file to a standard format for compatibility ---
Conversion successful.


--- 1. Transcribing with Whisper (Offline Model) ---
Whisper Output: 'Hello everyone, good morning, welcome to my YouTube channel. Now I am doing the lab 3x series of SPR that speaks to text application. Thank you.'


--- 2. Transcribing with Google Speech API (Online Model) ---
Google API Output: 'tell everyone good morning welcome to my YouTube channel now I'm doing the lab 3x images of SBR that speech to text application thank you'


--- 3. Transcribing with Vosk (Offline Model) ---
Vosk Output: 'hello everyone that money will come to my you tube channel know i'm doing the lab three acres of and be that it's been to fix to application thank you'


Inference

Accuracy:

Google Speech API generally outperforms Whisper/Vosk on clear speech due to cloud-based large models.

Whisper (offline) is robust in noisy environments but requires more computation.

Vosk is lightweight and works offline but struggles with fast or soft speech.

Error Handling:

Meaningful error messages like “Could not understand audio, please try again” improve usability.

Offline models (Whisper/Vosk) avoid internet issues, whereas the Google API may fail when the network is unavailable.

Best Performing Method:

Google Speech API → Most accurate for clear voices.

Whisper → Best for noisy background or offline usage.

Vosk → Lightweight but least accurate in challenging conditions.

Conclusion

This project demonstrates that speech-to-text systems can significantly enhance accessibility by converting spoken commands into text in real time. Among the tested methods, Google Speech API delivered the highest accuracy for clear speech, while Whisper showed resilience in noisy environments and Vosk provided a lightweight offline option. However, challenges remain with soft voices and very fast speech.

Future Improvements:

Integrate noise reduction preprocessing before recognition.

Add multi-language support.

Extend to real-time device control (smart home / accessibility apps).

Improve user interface with visual + audio feedback.