In [2]:
import assemblyai as aai
import os
from pathlib import Path
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Check for API key
if not os.getenv("ASSEMBLYAI_API_KEY"):
    print("Error: ASSEMBLYAI_API_KEY not found in environment variables.")
    print("Please create a .env file with your API key.")

In [12]:
import json
from datetime import datetime
import os
from pathlib import Path

# Select the file to transcribe
audio_file_path = "../podcast_downloads/S2E282 The SAT Isn’t About Intelligence… Here’s How to Win (Part 1).mp3"
output_dir = "../transcripts"

if not Path(audio_file_path).exists():
    print(f"File not found: {audio_file_path}")
else:
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    print(f"Transcribing: {audio_file_path}")

    # Configure transcription with speaker labels
    config = aai.TranscriptionConfig(speaker_labels=True)
    transcriber = aai.Transcriber()

    # Start the transcription
    transcript = transcriber.transcribe(audio_file_path, config=config)

    # Check for errors
    if transcript.status == aai.TranscriptStatus.error:
        print(f"Transcription failed: {transcript.error}")
    else:
        # Build the JSON structure
        json_output = {
            "meta_data": {
                "og_file_name": os.path.basename(audio_file_path),
                "file_path": audio_file_path,
                "date_transcribed": datetime.now().isoformat(),
                "duration_seconds": transcript.audio_duration
            },
            "transcript": []
        }

        # Populate transcript list with converted timestamps
        if transcript.utterances:
            for u in transcript.utterances:
                json_output["transcript"].append({
                    "speaker": u.speaker,
                    "text": u.text,
                    "start_time": u.start / 1000.0, # Convert ms to seconds
                    "end_time": u.end / 1000.0      # Convert ms to seconds
                })
        else:
            json_output["transcript"].append({
                "speaker": None,
                "text": transcript.text,
                "start_time": 0.0,
                "end_time": transcript.audio_duration
            })

        # Define output filename
        base_name = os.path.splitext(os.path.basename(audio_file_path))[0]
        output_path = os.path.join(output_dir, f"{base_name}.json")

        # Save to file
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(json_output, f, indent=2, ensure_ascii=False)
            
        print(f"\n✅ Transcript saved to: {output_path}")

Transcribing: ../podcast_downloads/S2E282 The SAT Isn’t About Intelligence… Here’s How to Win (Part 1).mp3

✅ Transcript saved to: ../transcripts/S2E282 The SAT Isn’t About Intelligence… Here’s How to Win (Part 1).json


In [7]:
import pprint

In [9]:
pprint.pprint(transcript.utterances)

[Utterance(text="We are about to dive into one of the most talked about and sometimes misunderstood parts of the college admissions process. It's the dreaded sat. And joining me today is my special co host, Elle. Elle is a longtime veteran Alpha student. She is going to talk all about our Alpha philosophy when it comes to the SAT and why we think the SAT isn't really about how smart you are. It's not like an IQ test. We're going to explain why and how anyone can can be in the top 10%, not just top students. Top 10%, for your knowledge, is 1350. And that's something that we say at Alpha. We can help deliver our kids. So, Elle, you've got some wild claims about the sat. Let's get into it.", start=640, end=44970, confidence=0.9769843, speaker='A', channel=None, words=[UtteranceWord(text='We', start=640, end=760, confidence=0.9946289, speaker='A', channel=None), UtteranceWord(text='are', start=760, end=960, confidence=0.47192383, speaker='A', channel=None), UtteranceWord(text='about', star

In [3]:
import os
import re
import json
import feedparser

def sanitize_filename(filename):
    clean_name = re.sub(r'[\\/*?:"<>|]', "", filename)
    clean_name = "".join(ch for ch in clean_name if ord(ch) >= 32)
    return clean_name.strip()

RSS_FEED_URL = "https://rss.art19.com/future-of-education"
METADATA_DIR = "test_metadata_output"

# Create output directory
if not os.path.exists(METADATA_DIR):
    os.makedirs(METADATA_DIR)
    print(f"Created directory: {METADATA_DIR}")

print(f"Parsing RSS feed: {RSS_FEED_URL}")
feed = feedparser.parse(RSS_FEED_URL)
print(f"Found {len(feed.entries)} episodes.\n")

# Process the first 5 episodes for testing
for entry in feed.entries[:5]:
    title = entry.get('title', 'Unknown Title')
    clean_title = sanitize_filename(title)
    
    # Extract specific fields requested
    metadata = {
        "title": title,
        "summary": entry.get("summary"),
        "published": entry.get("published"),
        # Extract href from image dict if available
        "image": entry.get("image", {}).get("href") if isinstance(entry.get("image"), dict) else entry.get("image"),
        "links": entry.get("links", []),
    }

    # Handle tags (list of objects -> list of strings)
    if "tags" in entry:
        metadata["tags"] = [t.get("term") for t in entry.tags]
    
    # Handle content (list of objects -> list of values)
    if "content" in entry:
        metadata["content"] = [c.get("value") for c in entry.content]
        
    # Additional useful iTunes fields
    if "itunes_episode" in entry:
        metadata["itunes_episode"] = entry.get("itunes_episode")
    if "itunes_season" in entry:
        metadata["itunes_season"] = entry.get("itunes_season")
    if "itunes_duration" in entry:
        metadata["itunes_duration"] = entry.get("itunes_duration")

    # Save to JSON
    filename = f"{clean_title}.json"
    filepath = os.path.join(METADATA_DIR, filename)
    
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2, ensure_ascii=False)
    
    print(f"Saved metadata for: {title}")

print(f"\nCheck the '{METADATA_DIR}' directory for results.")

Parsing RSS feed: https://rss.art19.com/future-of-education
Found 259 episodes.

Saved metadata for: S2E283: The Truth About the SAT: Mindset Matters More Than IQ (Part 2)
Saved metadata for: S2E282: The SAT Isn’t About Intelligence… Here’s How to Win (Part 1)
Saved metadata for: S2E281: (Part 2) The Skill Kids Need but Rarely Learn… Feedback 101 with Guide Emily
Saved metadata for: S2E280: Building Capable Kids Through The Feedback Formula - With Guide Emily (Part 1)
Saved metadata for: S2E279: Does Alpha Go Too Far? We Tackle Your Harshest Criticisms (Part 2)

Check the 'test_metadata_output' directory for results.
