In [2]:
# -------------------------
# STEP 1 — Install libs (run once)
# -------------------------
# On Kaggle, transformers/datasets are usually preinstalled.
# To avoid dependency conflicts, install only missing extras:
!pip install -q soundfile librosa gradio

# If you truly need to force-update transformers/torch (not recommended on Kaggle),
# uncomment the next line (will be large and may cause CUDA/version warnings).
# !pip install -q --upgrade transformers datasets torch



In [3]:
# -------------------------
# STEP 2 — Imports & logging (run after STEP 1)
# -------------------------
# Import necessary packages and silence verbose logs so output is clean.
from transformers.utils import logging
logging.set_verbosity_error()    # reduce noise

import os
import numpy as np
import librosa
import soundfile as sf           # used for saving or reading certain formats
from transformers import pipeline


2025-08-28 18:39:22.935459: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756406363.159001      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756406363.222583      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [5]:
# -------------------------
# STEP 3 — Confirm your uploaded file path
# -------------------------
# List files inside /kaggle/input so you know exact filenames/paths.
root = "/kaggle/input"
for path, dirs, files in os.walk(root):
    if files:
        print("Folder:", path)
        for f in files:
            print("  -", f)

# Now set the exact file path you saw printed above:
file_path = "/kaggle/input/my-voice-input/voice_input.m4a"   # <-- update if different
print("\nUsing file_path =", file_path)


Folder: /kaggle/input/my-voice-input
  - voice_input.m4a

Using file_path = /kaggle/input/my-voice-input/voice_input.m4a


In [6]:
# -------------------------
# STEP 4 — Load the ASR model (this downloads from Hugging Face)
# -------------------------
# Creates a pipeline object `asr` that you will call with audio arrays.
# Note: the first run downloads model weights and may take a minute.
asr = pipeline("automatic-speech-recognition", model="openai/whisper-base")

# Inspect what sampling rate the model expects (usually 16000 for Whisper)
print("Model expected sampling rate:", asr.feature_extractor.sampling_rate)


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/290M [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

preprocessor_config.json: 0.00B [00:00, ?B/s]

Model expected sampling rate: 16000


In [7]:
# -------------------------
# STEP 5 — Load audio (handles .m4a, mp3, wav)
# -------------------------
# librosa.load uses audioread as a fallback for formats libsndfile can't read.
# sr=None keeps the original sample rate (we will resample to model_sr later).
audio, sr = librosa.load(file_path, sr=None, mono=True)  # mono=True -> combine channels
print("Loaded audio: shape =", getattr(audio, "shape", None), "sr =", sr)

# If librosa couldn't read the format, you may need ffmpeg or convert to WAV offline.



  audio, sr = librosa.load(file_path, sr=None, mono=True)  # mono=True -> combine channels
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Loaded audio: shape = (797696,) sr = 48000


In [8]:
# -------------------------
# STEP 6 — Preprocess: ensure mono, resample, dtype
# -------------------------
# Make sure audio is 1D numpy array (mono). If not, convert.
if getattr(audio, "ndim", 1) > 1:
    # librosa.load with mono=True normally avoids this, but check defensively
    audio = librosa.to_mono(audio.T)

model_sr = asr.feature_extractor.sampling_rate
if sr != model_sr:
    audio = librosa.resample(audio, orig_sr=sr, target_sr=model_sr)
    sr = model_sr
    print(f"Resampled to {sr} Hz")

# Convert to float32 which most pipelines expect and keep values in [-1,1]
audio = audio.astype(np.float32)

# Quick safety checks
duration_s = len(audio) / sr
print(f"Final audio shape: {audio.shape}, sr: {sr}, duration: {duration_s:.2f}s")


Resampled to 16000 Hz
Final audio shape: (265899,), sr: 16000, duration: 16.62s


In [9]:
# -------------------------
# STEP 7 — Transcribe (safe handling for short & long audio)
# -------------------------
# Choose chunk_length_s and batch_size depending on audio length and memory.
# chunk_length_s splits long audio so model processes in manageable pieces.
chunk_length_s = 30
batch_size = 4

# Run the ASR. For short audios this returns {'text': ...}.
# For longer audio or when return_timestamps=True you may also get 'chunks'.
result = asr(audio, chunk_length_s=chunk_length_s, batch_size=batch_size, return_timestamps=True)

# Print raw result for inspection
print("Raw pipeline output:\n", result)

# Extract final transcript:
if isinstance(result, dict) and "chunks" in result:
    # Join chunk texts (keeps timestamps if you need them)
    transcript = " ".join(chunk["text"].strip() for chunk in result["chunks"])
else:
    transcript = result.get("text") if isinstance(result, dict) else str(result)

print("\n=== Final transcript ===\n", transcript)




Raw pipeline output:
 {'text': " well let's see what we are trying to do we are implementing automatic speech legalization and currently I am recording my voice for giving as an input well let's see what we can get with this model", 'chunks': [{'timestamp': (0.0, 5.26), 'text': " well let's see what we are trying to do we are implementing automatic speech"}, {'timestamp': (5.26, 12.6), 'text': ' legalization and currently I am recording my voice for giving as an input'}, {'timestamp': (12.6, 17.38), 'text': " well let's see what we can get with this model"}]}

=== Final transcript ===
 well let's see what we are trying to do we are implementing automatic speech legalization and currently I am recording my voice for giving as an input well let's see what we can get with this model


In [10]:
# -------------------------
# STEP 8 — (Optional) Save processed mono/resampled WAV for download
# -------------------------
# Save into /kaggle/working so you can download from the notebook output.
out_wav = "/kaggle/working/voice_input_processed.wav"
sf.write(out_wav, audio, sr)   # soundfile writes float32 or PCM automatically
print("Saved processed WAV to:", out_wav)
# You can download the file via the Notebook -> Output -> Files section.


Saved processed WAV to: /kaggle/working/voice_input_processed.wav


In [12]:
# -------------------------
# STEP 9 — (Optional) Quick Gradio UI to record or upload audio (run last)
# -------------------------
import gradio as gr

def transcribe_file(filepath):
    if filepath is None:
        return "No file provided."
    # Pass file path directly to the pipeline
    out = asr(filepath, chunk_length_s=30, batch_size=4, return_timestamps=False)
    return out.get("text", str(out))

# Gradio Audio component (no "source" argument in latest Gradio)
iface = gr.Interface(
    fn=transcribe_file,
    inputs=gr.Audio(type="filepath"),   # works for both microphone & upload
    outputs=gr.Textbox(lines=5, label="Transcription"),
    title="Record or Upload and Transcribe"
)

# Launch the UI
iface.launch(share=True)


* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://269fb5a6828b06630c.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [14]:
del asr
import gc, torch
gc.collect()
try:
    torch.cuda.empty_cache()
except Exception:
    pass

NameError: name 'asr' is not defined

In [15]:
# ------------------------------------------------------------
# PROBLEM STATEMENT:
# ------------------------------------------------------------
# We wanted to automatically convert speech (audio) into text 
# in Kaggle/Colab.
# In simple words: 
#   "Given an audio file (or recording), transcribe the spoken 
#    words into text using Python."

# ------------------------------------------------------------
# SOLUTION STEPS:
# ------------------------------------------------------------

# 1. MODEL SETUP
#    - Loaded a pretrained speech-to-text model (Whisper) from Hugging Face.
#    - This model is already trained to recognize speech and output text.

# 2. AUDIO INPUT HANDLING
#    - Allowed audio input in two ways:
#        a) Uploading a file (e.g., .wav, .mp3, .m4a).
#        b) Recording directly from the microphone (via Gradio UI).

# 3. TRANSCRIPTION PIPELINE
#    - Built a function that:
#        -> Takes the audio file as input.
#        -> Feeds it to the Whisper ASR pipeline.
#        -> Returns the recognized text.

# 4. OPTIMIZATION FOR LONG AUDIO
#    - Used chunking (chunk_length_s=30, batch_size=4).
#    - This breaks long audio into smaller pieces so the model
#      processes them safely without running out of memory.

# 5. MEMORY MANAGEMENT
#    - Freed up memory after usage with:
#        -> gc.collect()
#        -> torch.cuda.empty_cache()
#    - Useful if reloading or switching models.

# 6. GRADIO UI (Optional but User-Friendly)
#    - Built a small interface with Gradio where:
#        -> Users can record audio or upload a file.
#        -> The transcribed text is shown in a textbox.
#    - Makes the solution easier for non-programmers to use.
