In [2]:
# ============================
# 🧠 SPEECH TO TEXT SYSTEM (WITH SAMPLE AUDIO)
# ============================

# Step 1: Install dependencies
!pip install SpeechRecognition pydub transformers torchaudio librosa soundfile wget requests --quiet

# Step 2: Import libraries
import speech_recognition as sr
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torchaudio
import wget
import requests # Import requests library
from pydub import AudioSegment # Import AudioSegment from pydub


# ============================
# STEP 3: Download a sample audio file
# ============================
# The previous URL resulted in a Forbidden error. Using an alternative URL.
# NOTE(review): the file name in this URL ("SoundHelix-Song-1") suggests a music
# track rather than speech, and the Wav2Vec2 output recorded for this cell is
# gibberish. Point sample_url at a short, clean English speech clip to get a
# meaningful transcription.
sample_url = "https://www.soundhelix.com/examples/mp3/SoundHelix-Song-1.mp3"
audio_path_mp3 = "sample_audio.mp3"
audio_path_wav = "sample_audio.wav"


# Stream the download to disk in 8 KiB chunks. The timeout keeps a stalled
# server from hanging the cell indefinitely, and the context manager releases
# the connection once the body has been consumed.
try:
    with requests.get(sample_url, stream=True, timeout=30) as response:
        response.raise_for_status()  # Raise an exception for bad status codes
        with open(audio_path_mp3, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
except requests.exceptions.RequestException as e:
    print(f"\n‚ùå Error downloading sample audio: {e}")
    # Stop right here. Under IPython/Jupyter, exit() only asks the kernel to
    # shut down and lets the rest of the cell keep running against a file that
    # was never downloaded; raising SystemExit halts execution immediately in
    # both notebooks and plain scripts.
    raise SystemExit(1) from e

print(f"\n‚úÖ Sample audio downloaded: {audio_path_mp3}")

# Convert MP3 to WAV (kept outside the try block: failures here are not
# network errors and were never handled by the except clause above).
audio = AudioSegment.from_mp3(audio_path_mp3)
audio.export(audio_path_wav, format="wav")
print(f"‚úÖ Converted MP3 to WAV: {audio_path_wav}")


# ============================
# METHOD 1: SpeechRecognition
# ============================

def transcribe_speech_recognition(audio_path):
    """Transcribe an audio file with SpeechRecognition's ``recognize_google``.

    The whole file is read into memory and sent in a single request. The
    transcription (or a failure message) is printed.

    Args:
        audio_path: Path to an audio file that ``sr.AudioFile`` can open.

    Returns:
        The recognized text, or ``None`` when the audio could not be
        understood or the recognition request failed.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        audio_data = recognizer.record(source)

    try:
        text = recognizer.recognize_google(audio_data)
    except sr.UnknownValueError:
        print("Could not understand audio")
        return None
    except sr.RequestError as err:
        # RequestError is not only raised for connectivity problems (it also
        # covers failed or rejected requests), so surface the underlying
        # detail instead of hiding it behind a generic message.
        print(f"Could not connect to Google API: {err}")
        return None

    print("\nüé§ [SpeechRecognition Output]:")
    print(text)
    return text

# Run SpeechRecognition on the converted WAV file (the function prints its
# transcription or a failure message).
transcribe_speech_recognition(audio_path_wav)

# ============================
# METHOD 2: Wav2Vec2 (Deep Learning)
# ============================

# Load the Wav2Vec2 processor and CTC model from a single checkpoint id so the
# two can never drift apart.
WAV2VEC2_CHECKPOINT = "facebook/wav2vec2-base-960h"

print("\n‚è≥ Loading Wav2Vec2 model (may take a moment)...")
processor = Wav2Vec2Processor.from_pretrained(WAV2VEC2_CHECKPOINT)
model = Wav2Vec2ForCTC.from_pretrained(WAV2VEC2_CHECKPOINT)

def transcribe_wav2vec2(audio_path):
    """Transcribe an audio file with the module-level ``processor`` and ``model``.

    The waveform is down-mixed to mono, resampled to 16 kHz when its sample
    rate differs, passed through the model, and decoded by taking the argmax
    over the logits of each frame. The transcription is printed.

    Args:
        audio_path: Path to an audio file readable by ``torchaudio.load``.

    Returns:
        The lower-cased transcription string.
    """
    target_rate = 16000  # Wav2Vec2 expects 16kHz audio

    # Load audio file
    speech, rate = torchaudio.load(audio_path)

    # Collapse to a single channel first so that only one channel has to be
    # resampled afterwards.
    if speech.shape[0] > 1:
        speech = torch.mean(speech, dim=0, keepdim=True)
    if rate != target_rate:
        resampler = torchaudio.transforms.Resample(rate, target_rate)
        speech = resampler(speech)

    # squeeze(0) drops only the channel axis; a bare squeeze() would also
    # collapse a one-sample waveform to a 0-d array.
    input_values = processor(
        speech.squeeze(0).numpy(),
        return_tensors="pt",
        sampling_rate=target_rate,
    ).input_values

    # Inference only: no gradients needed
    with torch.no_grad():
        logits = model(input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0]).lower()
    print("\nüß† [Wav2Vec2 Output]:")
    print(transcription)
    return transcription

# Run the Wav2Vec2 recognizer on the same converted WAV file
transcribe_wav2vec2(audio_path_wav)

# Final status line, printed after both transcription methods have been run
print("\n‚úÖ Transcription complete!")

  m = re.match('([su]([0-9]{1,2})p?) \(([0-9]{1,2}) bit\)$', token)
  m2 = re.match('([su]([0-9]{1,2})p?)( \(default\))?$', token)
  elif re.match('(flt)p?( \(default\))?$', token):
  elif re.match('(dbl)p?( \(default\))?$', token):



‚úÖ Sample audio downloaded: sample_audio.mp3
‚úÖ Converted MP3 to WAV: sample_audio.wav
Could not connect to Google API

‚è≥ Loading Wav2Vec2 model (may take a moment)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)



üß† [Wav2Vec2 Output]:
aeaaaaaaaar mara ma mama m  a mama a me  me  emaa m  r m me r r  ma m meme mem

‚úÖ Transcription complete!
