In [None]:
import gradio as gr
from transformers import pipeline
import numpy as np
import os


In [None]:
# 1. Load the Whisper ASR Model
# A pre-trained Whisper checkpoint is downloaded from the Hugging Face Hub.
# 'openai/whisper-small' is a good balance between accuracy and size; swap
# ASR_MODEL_NAME for 'openai/whisper-tiny', 'openai/whisper-base',
# 'openai/whisper-medium' or 'openai/whisper-large' depending on the accuracy
# you need and the compute you have. For English-only transcription,
# 'openai/whisper-small.en' is generally faster and slightly more accurate.
ASR_MODEL_NAME = "openai/whisper-small"

# Whisper operates on 30-second windows of audio. With chunking disabled (the
# pipeline default, chunk_length_s=0) longer recordings are not transcribed in
# full (depending on the transformers version they may be cut off or rejected).
# Setting chunk_length_s makes the pipeline split long audio into windows and
# stitch the partial transcripts back together.
ASR_CHUNK_LENGTH_S = 30

try:
    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model=ASR_MODEL_NAME,
        chunk_length_s=ASR_CHUNK_LENGTH_S,
    )
    print("Whisper model loaded successfully!")
except Exception as e:
    # Deliberately broad: any failure here (network, disk, missing backend)
    # should leave the notebook usable, with the UI reporting the problem
    # instead of the cell crashing.
    print(f"Error loading Whisper model: {e}")
    print("Please ensure you have an active internet connection and sufficient disk space.")
    # Sentinel checked by transcribe_audio() before it tries to run the model.
    asr_pipeline = None


In [None]:
def transcribe_audio(audio_input) -> str:
    """
    Transcribe an audio file to text with the module-level Whisper pipeline.

    Args:
        audio_input: Path to the recorded/uploaded audio file, as supplied by
                     Gradio's gr.Audio(type="filepath"), or None when the user
                     submitted nothing.

    Returns:
        The transcribed text on success, otherwise a human-readable message
        describing why no transcription is available. This function never
        raises; every failure is reported through the returned string so it
        can be shown directly in the output textbox.

    Side effects:
        Once a transcription has been attempted, the file at ``audio_input``
        is deleted whether the attempt succeeded or failed. Deletion is
        best-effort: a failure to delete is logged and never replaces the
        transcription result.
    """
    if asr_pipeline is None:
        return "Error: ASR model failed to load. Cannot transcribe."

    if audio_input is None:
        return "No audio input provided. Please record or upload an audio file."

    try:
        # The pipeline accepts a file path and takes care of decoding and
        # resampling. generate_kwargs such as task="transcribe" or
        # language="english" can be passed here for multilingual models.
        transcription_result = asr_pipeline(audio_input)

        # The result is a dictionary with the transcript under the 'text' key.
        return transcription_result.get("text", "Transcription failed.")

    except Exception as e:
        # Corrupted file, unsupported format, model/runtime errors, ...
        print(f"Error during transcription: {e}")
        return f"An error occurred during transcription: {e}"

    finally:
        # Runs on both the success and the failure path so the temporary file
        # is not leaked when transcription raises.
        _remove_temp_audio(audio_input)


def _remove_temp_audio(path) -> None:
    """Best-effort removal of the temporary audio file; never raises on I/O errors."""
    try:
        if os.path.exists(path):
            os.remove(path)
            print(f"Removed temporary audio file: {path}")
    except (OSError, TypeError, ValueError) as cleanup_error:
        # A cleanup problem (permissions, path is a directory, odd input type)
        # must not turn a successful transcription into an error message.
        print(f"Warning: could not remove temporary audio file {path}: {cleanup_error}")


In [None]:
# 2. Create the Gradio Interface
# Wire the transcription function to one input and one output component.
# gr.Audio:   lets the user record from the microphone or upload a file.
#             type="filepath" means the function receives a path to an audio
#             file rather than raw samples.
# gr.Textbox: shows the transcribed text (or the error message returned by
#             the function).

iface = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(
        sources=["microphone", "upload"], # Allow both microphone recording and file upload
        type="filepath",                 # Input will be a path to a temporary audio file
        label="Input Audio (Record or Upload)"
    ),
    outputs=gr.Textbox(label="Transcribed Text"),
    # The leading emoji was previously stored as mojibake (UTF-8 bytes decoded
    # with the wrong charset); it is the studio-microphone emoji.
    title="🎙️ Audio-to-Text Transcriber with Whisper",
    description="Record audio using your microphone or upload an audio file to get it transcribed into text using an open-source Whisper ASR model.",
    live=False,  # Run only when the user submits; True would re-run on every input change.
    # "auto" flags every submission automatically and shows no flag button to
    # the user; use "manual" for a user-facing Flag button or "never" to
    # disable flagging entirely.
    allow_flagging="auto",
    examples=[
        # No bundled examples. Add paths or URLs to audio files here to show
        # clickable samples under the input component.
    ],
    # Custom CSS for the page and the main components.
    css="""
    body { font-family: 'Inter', sans-serif; background-color: #f0f4f8; }
    .gradio-container { max-width: 800px; margin: 30px auto; padding: 20px; border-radius: 12px; box-shadow: 0 4px 20px rgba(0, 0, 0, 0.1); background-color: #ffffff; }
    h1 { color: #2c3e50; text-align: center; margin-bottom: 20px; }
    p { color: #34495e; text-align: center; margin-bottom: 30px; }
    .gr-button { background-color: #3498db; color: white; border-radius: 8px; padding: 10px 20px; font-size: 16px; transition: background-color 0.3s ease; }
    .gr-button:hover { background-color: #2980b9; }
    .gr-audio-input, .gr-textbox { border-radius: 8px; border: 1px solid #ccc; padding: 10px; margin-bottom: 15px; }
    .gr-audio-input audio { border-radius: 8px; }
    """
)


In [None]:
# 3. Launch the Gradio App
# Start serving the interface, but only when this code runs as the main
# program (which includes running the cell in a notebook), not on import.
# share=True also asks Gradio for a temporary public link, which is handy
# for quick testing and sharing; the link expires on its own.
if __name__ == "__main__":
    launch_options = dict(share=True)
    iface.launch(**launch_options)
