# Create a notebook for audio-to-text transcription

The first step is to create an interface for uploading files. For this, we’ll use ipywidgets, a Python library that provides interactive UI components beyond plain text.

In [1]:
# ipywidgets provides interactive controls: sliders, dropdowns, checkboxes, text boxes, and file-upload buttons
import ipywidgets as widgets
# os is used for working with files, directories, and environment variables.
import os
# display renders rich objects (such as widgets) in the notebook instead of plain text
from IPython.display import display

# Upload button limited to MP3/WAV audio; several files can be picked at once.
uploader = widgets.FileUpload(
    accept='.mp3,.wav',
    multiple=True,
)
display(uploader)

FileUpload(value=(), accept='.mp3,.wav', description='Upload', multiple=True)

The next step is to save the uploaded files into a local folder within our project so they can be accessed whenever needed.

In [None]:
def save_uploaded_mp3s(uploader, dest_dir="uploads"):
    """Write every file held by a ``FileUpload`` widget into ``dest_dir``.

    Args:
        uploader: Object whose ``value`` attribute holds the uploads. Two
            layouts are accepted: a sequence of dicts with ``"name"`` and
            ``"content"`` keys (ipywidgets 8.x), or a dict mapping each file
            name to a dict with a ``"content"`` key (ipywidgets 7.x).
        dest_dir: Folder the files are written to; created if missing.

    Returns:
        List of the paths that were written, in upload order.

    Raises:
        ValueError: If an uploaded file name has no usable base name.
    """
    # Creating destination directory if it doesn't exist
    os.makedirs(dest_dir, exist_ok=True)

    uploads = uploader.value
    if isinstance(uploads, dict):
        # Older ipywidgets layout: normalise to the list-of-dicts shape.
        uploads = [
            {"name": file_name, "content": info["content"]}
            for file_name, info in uploads.items()
        ]

    saved_paths = []
    for file_info in uploads:
        raw_name = file_info["name"]
        # The name comes from the browser, so keep only its final component:
        # "../x" or an absolute path must not escape dest_dir.
        name = os.path.basename(raw_name)
        if name in ("", ".", ".."):
            raise ValueError(f"Invalid upload file name: {raw_name!r}")

        out_path = os.path.join(dest_dir, name)
        # Content is bytes-like (bytes or memoryview), hence binary mode.
        with open(out_path, "wb") as f:
            f.write(file_info["content"])
        saved_paths.append(out_path)

    return saved_paths

In [None]:
# Sanity check: show what the upload widget currently holds.
# An empty tuple `()` means no files have been uploaded yet.
print(uploader.value)


()


Now that we can upload audio files and have a helper to save them, the next step is to transcribe them into text. We’ll use the Hugging Face Transformers library, which provides a simple way to load and run pretrained models like OpenAI’s Whisper for speech recognition. The cell below saves the uploaded files and then transcribes each one.

In [None]:
# using the transformers library from huggingface to use the whisper model
from transformers import pipeline
# Load the Whisper speech-recognition model.
# The "small" checkpoint does not need a huge amount of memory or processing
# power; switch to a larger checkpoint if better accuracy is wanted.
transcriber = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-small",
)

# function to transcribe the saved audio files
def transcribe_mp3s(saved_paths, *, asr_pipeline=None):
    """Transcribe each audio file and write the text next to it.

    For every path, the recogniser is called with the path and the ``"text"``
    entry of its result is stored. The text is also written to
    ``<path>.txt`` (e.g. ``uploads/a.mp3.txt``).

    Args:
        saved_paths: Iterable of audio file paths.
        asr_pipeline: Optional callable taking a path and returning a mapping
            with a ``"text"`` key. Defaults to the module-level
            ``transcriber``.

    Returns:
        Dict mapping each file's base name to its transcription. Files that
        share a base name overwrite one another in this dict.
    """
    recognise = transcriber if asr_pipeline is None else asr_pipeline

    transcriptions = {}
    for path in saved_paths:
        text = recognise(path)["text"]
        transcriptions[os.path.basename(path)] = text
        # Save each transcription to a text file. UTF-8 is explicit because
        # transcripts can contain non-ASCII characters and the platform
        # default encoding may not be able to represent them.
        with open(path + ".txt", "w", encoding="utf-8") as f:
            f.write(text)
    return transcriptions
# Save whatever the upload widget currently holds, then transcribe it.
saved_paths = save_uploaded_mp3s(uploader)
if not saved_paths:
    # Without this hint the cell finishes silently when nothing was uploaded.
    print("No audio files to transcribe - upload files with the widget above, then re-run this cell.")
transcriptions = transcribe_mp3s(saved_paths)

# show the transcriptions
for file_name, text in transcriptions.items():
    print(f"Transcription for {file_name}:\n{text}\n")
    



Device set to use mps:0
