In [1]:
import math
import os
import re

from openai import OpenAI
from pydub import AudioSegment
from pydub.utils import mediainfo

# Shared OpenAI client; transcribe_audio uses it to send transcription requests.
client = OpenAI()

In [2]:
# Directory that transcribe_audio reads the source .wav files from.
FILEPATH = "audio"


def standardize_file_name(file_name):
    """Map an audio file name to its canonical transcript file name.

    Names that already start with ``TFL_S<season>E<episode>`` (case-insensitive)
    keep their season and episode digits verbatim. For any other name the season
    and episode are searched for anywhere in the base name:

    * the season is "S2" only when the first ``S<digits>`` token is exactly "2";
      any other value, or no token at all, yields "S1";
    * the episode is taken from ``Ep<n>``, ``Episode<n>``, ``S<n>E<n>`` or
      ``TFL_<n>`` and is normalized through ``int`` (leading zeros dropped).

    Args:
        file_name: Audio file name, with or without an extension.

    Returns:
        A string of the form ``TFL_S<season>E<episode>_transcript.txt``.

    Raises:
        ValueError: If no episode number can be found in ``file_name``.
    """
    base_name = os.path.splitext(file_name)[0]
    # check if the file name matches the format "TFL_S2E22.wav"
    match = re.match(r"TFL_S(\d+)E(\d+)", base_name, re.IGNORECASE)
    if match:
        return f"TFL_S{match.group(1)}E{match.group(2)}_transcript.txt"

    season_match = re.search(r"S(\d+)", base_name, re.IGNORECASE)
    episode_match = re.search(
        r"Ep(\d+)|Episode(\d+)|S\d+E(\d+)|TFL_(\d+)", base_name, re.IGNORECASE
    )

    # re.search returns None when nothing matches, so the match object must be
    # checked before .groups() is called; otherwise the failure surfaces as an
    # AttributeError instead of the ValueError below.
    episode_digits = None
    if episode_match is not None:
        # Only the alternative that matched has a populated group.
        episode_digits = next(
            (group for group in episode_match.groups() if group is not None),
            None,
        )
    if episode_digits is None:
        raise ValueError(f"Episode number not found in file name: {file_name}")

    season = "S2" if season_match and season_match.group(1) == "2" else "S1"
    return f"TFL_{season}E{int(episode_digits)}_transcript.txt"

def transcribe_audio(file_name):
    """Transcribe a .wav file using OpenAI's Whisper API and save the transcript.

    The audio is cut into consecutive segments. Each segment is exported to a
    temporary ``temp_<i>.wav`` file in the working directory (mono, 16 kHz),
    sent to the "whisper-1" model, and the temporary file is removed again.
    The returned texts are concatenated in order and written as UTF-8 to
    ``text/<standardized name>``; the ``text`` directory is created if missing.

    Args:
        file_name: Name of the .wav file to transcribe.

    Raises:
        Propagates any exception raised by ``standardize_file_name`` for a file
        name it cannot parse; this happens before any audio is loaded or sent.
    """
    # Resolve the output name first so that an unparseable file name fails
    # before any segment has been uploaded, not after the whole transcription.
    standardized_name = standardize_file_name(file_name)
    audio_path = f"{FILEPATH}/{file_name}"

    # Open the audio file
    audio = AudioSegment.from_wav(audio_path)

    # Get information about the audio file
    info = mediainfo(audio_path)
    audio_size_mb = int(info["size"]) / (1024 * 1024)

    # Calculate the segment length in milliseconds to be less than 25MB.
    # NOTE(review): this estimate uses the source file's bit rate, while the
    # segments are exported below as 16 kHz mono, so the uploaded file size will
    # differ from the estimate - confirm the margin is acceptable.
    segment_length_ms = int((25 * 1024 * 1024 * 8 / int(info["bit_rate"])) * 1000)

    # Split the audio into segments
    segments = [
        audio[start : start + segment_length_ms]
        for start in range(0, len(audio), segment_length_ms)
    ]

    print(
        f"Transcribing {file_name} ({audio_size_mb:.2f} MB), with {len(segments)} segments"
    )

    guidelines = """The transcript is an episode of a podcast called the Feelings Lab,
    which explores emotion science and its lessons for creating a more empathic future.
    It is produced by Hume AI, a research lab and technology company. Its goal is to build equity,
    empathy, and compassion into ubiquitous artificial intelligence systems."""

    # Transcribe each segment and combine the transcripts
    transcript_parts = []
    for i, segment in enumerate(segments):
        segment_file_name = f"temp_{i}.wav"
        try:
            # Export the segment to a temporary .wav file
            segment.export(
                segment_file_name, format="wav", parameters=["-ac", "1", "-ar", "16000"]
            )

            # Transcribe the segment
            print(f"Transcribing segment {i+1} of {len(segments)} from {file_name}")
            with open(segment_file_name, "rb") as segment_file:
                response = client.audio.transcriptions.create(
                    model="whisper-1",
                    file=segment_file,
                    response_format="text",
                    language="en",
                    prompt=guidelines,
                )
        finally:
            # Delete the temporary .wav file even when the export or the API
            # call fails, so failed runs do not leave temp files behind.
            if os.path.exists(segment_file_name):
                os.remove(segment_file_name)

        print(f"...Transcription starts with '{response[:50]}'")
        transcript_parts.append(response)

    transcript = "".join(transcript_parts)

    # Save the transcribed text; create the output directory on first use and
    # write UTF-8 explicitly so the result does not depend on the platform's
    # default encoding.
    os.makedirs("text", exist_ok=True)
    with open(f"text/{standardized_name}", "w", encoding="utf-8") as text_file:
        text_file.write(transcript)

In [3]:
# Transcribe every .wav file in FILEPATH that does not have a transcript yet.
# FILEPATH is the same directory transcribe_audio reads from, and sorted()
# makes the processing order deterministic (os.listdir order is arbitrary).
audio_files = sorted(f for f in os.listdir(FILEPATH) if f.endswith(".wav"))

for episode in audio_files:
    transcript_file = f"text/{standardize_file_name(episode)}"
    if not os.path.exists(transcript_file):
        print(f"Transcribing {episode}")
        transcribe_audio(episode)
    else:
        # An existing transcript is kept as-is so finished episodes are not
        # sent to the API again on a re-run.
        print(f"Transcript for {episode} already exists")

Transcript for TFL_S1E10.wav already exists
Transcript for TFL_S1E8.wav already exists
Transcript for TFL_S1E9.wav already exists
Transcribing TFL_S2E20.wav
Transcribing TFL_S2E20.wav (708.13 MB), with 29 segments
Transcribing segment 1 of 29 from TFL_S2E20.wav
...Transcription starts with 'Hello, world. What is up? Welcome back to the Feel'
Transcribing segment 2 of 29 from TFL_S2E20.wav
...Transcription starts with 'We're going to leap forward in ways we'd only prev'
Transcribing segment 3 of 29 from TFL_S2E20.wav
...Transcription starts with 'We're thrilled to have you here. We're going to ha'
Transcribing segment 4 of 29 from TFL_S2E20.wav
...Transcription starts with 'Right. I'm no scientist, but my assumption is if i'
Transcribing segment 5 of 29 from TFL_S2E20.wav
...Transcription starts with 'Was that the kind of thing that you were just hopi'
Transcribing segment 6 of 29 from TFL_S2E20.wav
...Transcription starts with 'See your hype about it. But it's not until you see'
Transc