# Create a notebook for audio-to-text transcription

The first step is to create an interface for uploading files. For this, we’ll use ipywidgets, a Python library that provides interactive UI components beyond plain text.

In [None]:
# ipywidgets provides interactive controls such as sliders, dropdowns, checkboxes, text boxes, and file upload buttons
import ipywidgets as widgets
# os is used for working with files, directories, and environment variables.
import os
# display renders rich objects (here, the upload widget) in the cell output instead of plain text
from IPython.display import display

# File picker limited to .mp3/.wav; multiple=True allows selecting several files at once.
uploader = widgets.FileUpload(accept='.mp3,.wav', multiple=True)
display(uploader)


The next step is to save the uploaded files into a local folder within our project so they can be accessed whenever needed.

In [None]:
def _iter_uploaded_files(value):
    """Yield ``(name, content)`` pairs from a ``FileUpload.value``.

    Two layouts are handled:

    * a sequence of dict-like records with ``"name"`` and ``"content"`` keys;
    * a dict mapping each filename to a record with a ``"content"`` key
      (the layout used by ipywidgets 7.x, per the ipywidgets documentation).
    """
    if isinstance(value, dict):
        for name, info in value.items():
            yield name, info["content"]
    else:
        for info in value:
            yield info["name"], info["content"]


def save_uploaded_mp3s(uploader, dest_dir="uploads"):
    """Write every file held by an upload widget into ``dest_dir``.

    Args:
        uploader: Object whose ``value`` attribute holds the uploaded files
            (see ``_iter_uploaded_files`` for the accepted layouts).
        dest_dir: Directory that receives the files; created if missing.

    Returns:
        List of the paths that were written, in upload order.
    """
    os.makedirs(dest_dir, exist_ok=True)
    saved_paths = []

    for raw_name, content in _iter_uploaded_files(uploader.value):
        # The name comes from the client, so keep only its last path component
        # (treating "\\" as a separator too); otherwise a name like
        # "../x.mp3" would be written outside dest_dir.
        filename = os.path.basename(raw_name.replace("\\", "/"))
        if filename in ("", ".", ".."):
            print("Skipped (invalid filename):", raw_name)
            continue

        # The content may arrive as a memoryview (a view onto the original
        # buffer) rather than bytes; copy it into bytes before writing.
        if isinstance(content, memoryview):
            content = content.tobytes()

        out_path = os.path.join(dest_dir, filename)
        with open(out_path, "wb") as f:
            f.write(content)

        print("Saved:", out_path)
        saved_paths.append(out_path)

    return saved_paths

# Save everything currently held by the upload widget; the resulting paths are
# kept in `paths` so the transcription cell can read them.
paths = save_uploaded_mp3s(uploader)
print(paths)



Now that our audio files are uploaded and saved, the next step is to transcribe them into text. We’ll use the Transformers library, which provides a simple way to load and run pretrained models like OpenAI’s Whisper for speech recognition.

In [None]:
import os
from transformers import pipeline

# Initialize the transcriber.
# The small Whisper model is used for faster processing; other models can be
# chosen if needed.
# Chunking with overlap is used to handle long audio files, because the model
# has an input-length limit of 30 s.
# .mp3, .wav, and .flac files are supported. To use another format, convert it
# to one of these first and change the first cell to accept that format.

transcriber = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",
    chunk_length_s=30,   # 30s chunks
    stride_length_s=5    # 5s overlap
)
# Function to transcribe audio files and save them as text files next to the audio files
def transcribe_and_save(paths, asr_pipeline=None):
    """Transcribe each audio file and write the text next to it.

    For every path, the transcript is saved under the same name with the
    extension replaced by ``.txt``.

    Args:
        paths: Iterable of audio file paths.
        asr_pipeline: Callable that takes an audio path and returns a mapping
            with a ``"text"`` entry. Defaults to the module-level
            ``transcriber``.

    Returns:
        List of the transcript file paths that were written, in input order.
    """
    if asr_pipeline is None:
        # Fall back to the pipeline created at notebook level.
        asr_pipeline = transcriber

    text_paths = []
    for audio_path in paths:
        # Transcribe the audio file
        transcription = asr_pipeline(audio_path)["text"]

        # Create a text file path by replacing the audio file extension with .txt
        base, _ = os.path.splitext(audio_path)
        text_file_path = f"{base}.txt"

        # Write as UTF-8 explicitly: transcripts may contain non-ASCII text,
        # which the platform default encoding cannot always represent.
        with open(text_file_path, "w", encoding="utf-8") as text_file:
            text_file.write(transcription)

        text_paths.append(text_file_path)

    return text_paths

# Transcribe every file whose path was collected in `paths`.
transcribe_and_save(paths)

