# Download Audio from CSV and Transcribe

In [4]:
import csv
import math
import os
import re
import warnings
from datetime import timedelta, datetime

import whisper
import yt_dlp
from pydub import AudioSegment

# Suppress specific warnings: hide the FP16-on-CPU fallback notice so it does
# not clutter the per-episode progress output. The `message` argument is a
# regular expression matched against the start of the warning text.
# NOTE(review): presumably raised by whisper when no GPU is available - confirm.
warnings.filterwarnings("ignore", message="FP16 is not supported on CPU; using FP32 instead")

def download_audio(url, filename):
    """Download the audio at *url* as MP3 and return its length in minutes.

    The download is directed at a staging name derived from *filename*
    (``<name>.temp``); if a ``<name>.temp.mp3`` file exists afterwards it is
    renamed to *filename*. The finished file is then opened with pydub to
    measure its duration.

    NOTE(review): which path yt-dlp's audio extractor actually writes to
    depends on how it derives the output extension - confirm against yt-dlp.

    Args:
        url: Address of the media to download.
        filename: Destination path of the final ``.mp3`` file.

    Returns:
        Duration of the downloaded audio in minutes (float).
    """
    staging_path = filename.replace('.mp3', '') + ".temp.mp3"
    # yt-dlp receives the template without the trailing ".mp3".
    output_template = staging_path.replace('.mp3', '')

    extract_mp3 = {
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        'preferredquality': '192',
    }
    options = {
        'format': 'bestaudio/best',
        'postprocessors': [extract_mp3],
        'outtmpl': output_template,
        'ffmpeg_location': '/usr/local/bin',  # Location of the ffmpeg binaries
    }

    with yt_dlp.YoutubeDL(options) as downloader:
        downloader.download([url])

    # Move the staged file into place when the download produced one.
    if os.path.exists(staging_path):
        os.rename(staging_path, filename)

    # len() of an AudioSegment is in milliseconds; 60000 ms per minute.
    return len(AudioSegment.from_file(filename, format="mp3")) / 60000

def split_audio(file_path, chunk_length_ms=300000, output_dir='data/audio/split'):
    """Split an MP3 file into consecutive fixed-length chunks.

    Chunks are written to *output_dir* as ``<stem>_chunk<N>.mp3`` where
    ``<stem>`` is the input file name without its extension and ``N`` counts
    from 0. The last chunk may be shorter than *chunk_length_ms*.

    Args:
        file_path: Path of the MP3 file to split.
        chunk_length_ms: Length of each chunk in milliseconds
            (default 300,000 ms = 5 minutes). Code that derives timestamp
            offsets from the chunk index must use the same length.
        output_dir: Directory receiving the chunk files; created if missing.

    Returns:
        List of chunk file paths in playback order.

    Raises:
        FileNotFoundError: If *file_path* does not exist.
        ValueError: If *chunk_length_ms* is not positive.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")
    if chunk_length_ms <= 0:
        raise ValueError(f"chunk_length_ms must be positive, got {chunk_length_ms}")

    audio = AudioSegment.from_mp3(file_path)
    os.makedirs(output_dir, exist_ok=True)

    # splitext instead of [:-4] so the stem is right for any extension length.
    stem = os.path.splitext(os.path.basename(file_path))[0]

    chunk_files = []
    # Slice and export one chunk at a time rather than holding every slice in
    # memory before the first export.
    for index, start in enumerate(range(0, len(audio), chunk_length_ms)):
        chunk_filename = os.path.join(output_dir, f"{stem}_chunk{index}.mp3")
        audio[start:start + chunk_length_ms].export(chunk_filename, format="mp3")
        chunk_files.append(chunk_filename)
    return chunk_files

def clean_title(title):
    """Turn an episode title into a lowercase, filename-safe slug.

    Steps, in order: drop ``[...]`` groups, keep only the text before the
    first ``|``, map spaces and hyphens to underscores, discard every
    character that is not an ASCII letter, digit or underscore, strip
    trailing underscores, and lowercase the result.
    """
    without_brackets = re.sub(r'\[.*?\]', '', title)
    # Everything from the first pipe onwards is discarded.
    before_pipe = without_brackets.partition('|')[0]
    underscored = before_pipe.translate(str.maketrans({' ': '_', '-': '_'}))
    # Keep only the characters that are safe in a file name.
    safe_chars = re.findall(r'[a-zA-Z0-9_]', underscored)
    return ''.join(safe_chars).rstrip('_').lower()

def format_timestamp(seconds):
    """Render a number of seconds in ``timedelta``'s string form (e.g. ``0:01:05``)."""
    elapsed = timedelta(seconds=seconds)
    return str(elapsed)

# Whisper models already loaded in this process, keyed by model name.
_WHISPER_MODELS = {}


def _get_whisper_model(name="base"):
    """Return the Whisper model *name*, loading it only on first use."""
    model = _WHISPER_MODELS.get(name)
    if model is None:
        model = _WHISPER_MODELS[name] = whisper.load_model(name)
    return model


def transcribe_audio(file_path):
    """Transcribe an audio file with Whisper's "base" model.

    The model is loaded once per process and reused, because this function is
    called once per audio chunk and reloading the model each time is wasted
    work.

    Args:
        file_path: Path of the audio file to transcribe.

    Returns:
        The transcript as text, one line per segment in the form
        ``<start> - <end>::<text>`` with timestamps produced by
        ``format_timestamp``; every line ends with a newline.
    """
    result = _get_whisper_model("base").transcribe(file_path)
    return "".join(
        f"{format_timestamp(segment['start'])} - "
        f"{format_timestamp(segment['end'])}::{segment['text']}\n"
        for segment in result['segments']
    )

def save_chunk_transcript(transcript, chunk_filename):
    """Write a chunk's transcript under ``data/transcripts/chunks``.

    The output file is named after the chunk file with its last four
    characters (the ``.mp3`` suffix) replaced by ``.txt``.

    Returns:
        Path of the transcript file that was written.
    """
    stem = os.path.basename(chunk_filename)[:-4]
    destination = os.path.join('data/transcripts/chunks', stem + ".txt")
    with open(destination, "w") as handle:
        handle.write(transcript)
    return destination

def transcribe_chunks(chunk_files, formatted_title, chunk_length_seconds=300):
    """Transcribe every chunk and stitch the results into one transcript.

    Chunks are processed in order of the ``_chunk<N>`` number in their file
    name. A chunk whose transcript already exists under
    ``data/transcripts/chunks`` is read from disk instead of being transcribed
    again. Each chunk's timestamps are shifted by
    ``N * chunk_length_seconds`` so they are relative to the whole episode.

    Args:
        chunk_files: Paths of the chunk audio files.
        formatted_title: Episode title; currently unused, kept for interface
            compatibility.
        chunk_length_seconds: Length of each chunk in seconds (default 300,
            i.e. 5 minutes); should match the length used when splitting.

    Returns:
        The combined transcript text; each chunk's lines are followed by one
        blank line.

    Raises:
        ValueError: If a chunk file name carries no ``_chunk<N>`` number.
    """
    def chunk_number(path):
        match = re.search(r'_chunk(\d+)', path)
        if match is None:
            raise ValueError(f"Chunk file name has no '_chunk<N>' index: {path}")
        return int(match.group(1))

    pieces = []
    for chunk_file in sorted(chunk_files, key=chunk_number):
        chunk_transcript_path = os.path.join(
            'data/transcripts/chunks', f"{os.path.basename(chunk_file)[:-4]}.txt"
        )
        if os.path.exists(chunk_transcript_path):
            print(f"Transcript for {chunk_file} already exists. Skipping transcription...")
            with open(chunk_transcript_path, "r") as f:
                transcript = f.read()
        else:
            transcript = transcribe_audio(chunk_file)
            save_chunk_transcript(transcript, chunk_file)

        # Offset by the chunk's own number, not its position in the list, so a
        # missing chunk cannot shift the timestamps of the chunks after it.
        offset_seconds = chunk_number(chunk_file) * chunk_length_seconds
        pieces.append(adjust_timestamps(transcript, offset_seconds) + "\n")
    return "".join(pieces)

def adjust_timestamps(transcript, offset_seconds):
    """Shift every ``<start> - <end>::<text>`` line by *offset_seconds*.

    Blank lines are dropped. A line that cannot be parsed (a ``ValueError``
    while splitting or converting) is reported on stdout and copied through
    unchanged. Every emitted line ends with a newline.
    """
    shifted_lines = []
    for raw_line in transcript.split("\n"):
        if not raw_line.strip():
            continue
        try:
            span, spoken = raw_line.split("::", 1)
            begin, finish = span.split(" - ")
            new_begin = format_timestamp(
                convert_timestamp(begin).total_seconds() + offset_seconds
            )
            new_finish = format_timestamp(
                convert_timestamp(finish).total_seconds() + offset_seconds
            )
        except ValueError as e:
            print(f"Skipping line due to unexpected format: {raw_line}. Error: {e}")  # Debug print
            shifted_lines.append(raw_line + "\n")
        else:
            shifted_lines.append(f"{new_begin} - {new_finish}::{spoken.strip()}\n")
    return "".join(shifted_lines)

def convert_timestamp(timestamp):
    """Parse a timestamp string into a ``timedelta``.

    Accepted forms are ``MM:SS``, ``HH:MM:SS`` and the ``str(timedelta)``
    form with a day prefix, e.g. ``"1 day, 0:00:05"``. Supporting the day
    prefix keeps timestamps of 24 hours or more round-trippable through
    ``str(timedelta(...))``. Each field may carry a fractional part.

    Args:
        timestamp: The timestamp text.

    Returns:
        The equivalent ``timedelta``.

    Raises:
        ValueError: If the text matches none of the accepted forms or a field
            is not numeric.
    """
    day_text, comma, clock = timestamp.rpartition(",")
    days = 0
    if comma:
        # Expect exactly "<N> day" or "<N> days" before the comma.
        day_fields = day_text.split()
        if len(day_fields) != 2 or day_fields[1] not in ("day", "days"):
            raise ValueError(f"Invalid timestamp format: {timestamp}")
        days = int(day_fields[0])

    parts = clock.split(":")
    if len(parts) == 2:  # MM:SS format
        h = 0
        m, s = map(float, parts)
    elif len(parts) == 3:  # HH:MM:SS format
        h, m, s = map(float, parts)
    else:
        raise ValueError(f"Invalid timestamp format: {timestamp}")
    return timedelta(days=days, hours=h, minutes=m, seconds=s)

def extract_date_from_posted_date(posted_date):
    """Convert ``'YYYY-MM-DD HH:MM:SS'`` (or just ``'YYYY-MM-DD'``) to ``'YYYYMMDD'``."""
    # Only the first whitespace-separated token (the date) is parsed.
    date_text = posted_date.split()[0]
    parsed = datetime.strptime(date_text, '%Y-%m-%d')
    return parsed.strftime('%Y%m%d')

def update_csv_with_duration(csv_path, formatted_title, duration):
    """Record *duration* in the CSV row belonging to *formatted_title*.

    A row matches when ``<YYYYMMDD of date_posted>_<cleaned formatted_title>``
    equals *formatted_title*. A ``duration`` column is appended when the file
    does not have one yet. The CSV is rewritten through ``<csv_path>.tmp``,
    which then replaces the original file.
    """
    scratch_path = csv_path + '.tmp'
    with open(csv_path, 'r', newline='') as source, open(scratch_path, 'w', newline='') as sink:
        rows = csv.DictReader(source)
        columns = rows.fieldnames
        if 'duration' not in columns:
            columns = columns + ['duration']

        out = csv.DictWriter(sink, fieldnames=columns)
        out.writeheader()
        for record in rows:
            date_prefix = extract_date_from_posted_date(record['date_posted'])
            title_slug = clean_title(record['formatted_title'])
            if f"{date_prefix}_{title_slug}" == formatted_title:
                record['duration'] = duration
            out.writerow(record)
    # Swap the rewritten file into place only after both files are closed.
    os.replace(scratch_path, csv_path)

def process_episode(url, episode_title, posted_date, csv_path):
    """Download, split and transcribe one episode, reusing earlier work.

    Files are named ``<YYYYMMDD>_<cleaned title>``. Each stage is skipped when
    its output already exists: the whole episode if the full transcript is
    present, the download if the audio file is present, and the split if the
    expected number of chunk files is present.

    Args:
        url: Address of the episode's media.
        episode_title: Raw episode title; cleaned to build file names.
        posted_date: Posting date as ``'YYYY-MM-DD[ HH:MM:SS]'``.
        csv_path: Episode CSV to update with the duration after a download.

    Returns:
        The list of chunk audio files that were transcribed, or ``None`` when
        the full transcript already existed.
    """
    # Clean the episode title to create a safe filename
    formatted_title = clean_title(episode_title)
    date_prefix = extract_date_from_posted_date(posted_date)
    full_title = f"{date_prefix}_{formatted_title}"
    filename = os.path.join('data/audio/full', f"{full_title}.mp3")
    transcript_filename = os.path.join('data/transcripts/full', f"{full_title}.txt")

    # Skip if transcript already exists
    if os.path.exists(transcript_filename):
        print(f"Transcript for {full_title} already exists. Skipping...")
        return

    # Check if audio file already exists
    if os.path.exists(filename):
        print(f"Audio for {full_title} already exists. Skipping download...")
        audio = AudioSegment.from_file(filename, format="mp3")
        duration_minutes = len(audio) / 60000
    else:
        # Download the audio and get duration
        duration_minutes = download_audio(url, filename)
        update_csv_with_duration(csv_path, full_title, duration_minutes)

    # Splitting yields ceil(duration / 5 min) chunks. The previous
    # "(int(minutes) // 5) + 1" over-counted by one whenever the duration was
    # an exact multiple of five minutes, forcing a needless re-split.
    expected_chunks = math.ceil(duration_minutes / 5)

    # Check if split files already exist
    split_dir = 'data/audio/split'
    expected_chunk_filename = os.path.join(split_dir, f"{full_title}_chunk0.mp3")
    if os.path.exists(expected_chunk_filename):
        print(f"Split files for {full_title} already exist. Skipping splitting...")
        # Match "<full_title>_chunk<N>.mp3" exactly so chunks of another
        # episode whose title merely starts with this one are not picked up.
        chunk_pattern = re.compile(rf"{re.escape(full_title)}_chunk(\d+)\.mp3")
        numbered = []
        for name in os.listdir(split_dir):
            match = chunk_pattern.fullmatch(name)
            if match:
                numbered.append((int(match.group(1)), os.path.join(split_dir, name)))
        chunk_files = [path for _, path in sorted(numbered)]
        if len(chunk_files) != expected_chunks:
            print(f"Incorrect number of chunk files for {full_title}. Expected {expected_chunks}, but found {len(chunk_files)}.")
            chunk_files = split_audio(filename)
    else:
        # Split the audio into chunks
        chunk_files = split_audio(filename)

    # Transcribe the chunks
    full_transcript = transcribe_chunks(chunk_files, full_title)

    # Save the full transcript; make sure its directory exists so the
    # transcription work is not lost to a missing folder at the final write.
    os.makedirs(os.path.dirname(transcript_filename), exist_ok=True)
    with open(transcript_filename, "w") as f:
        f.write(full_transcript)

    return chunk_files

if __name__ == "__main__":
    # Create every directory the pipeline writes to. 'data/transcripts/full'
    # was previously missing even though full transcripts are saved there.
    for directory in (
        'data/audio/full',
        'data/audio/split',
        'data/transcripts',
        'data/transcripts/full',
        'data/transcripts/chunks',
    ):
        os.makedirs(directory, exist_ok=True)

    # Read the CSV file and filter rows with non-empty youtube_url.
    # The rows are loaded up front and the file closed, because processing an
    # episode may rewrite this same CSV to record its duration.
    csv_path = 'data/episodes.csv'
    with open(csv_path, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        filtered_rows = [row for row in reader if row['youtube_url']]

    for row in filtered_rows:
        process_episode(
            row['youtube_url'],
            row['formatted_title'],
            row['date_posted'],
            csv_path,
        )


Transcript for 20240729_78_cream already exists. Skipping...
Transcript for 20240725_reasons_to_live already exists. Skipping...
Transcript for 20240722_77_what_is_the_wilderness_ft_joe_pera already exists. Skipping...
Transcript for 20240718_the_notion_of_free_will already exists. Skipping...
Transcript for 20240715_76_political_violnce already exists. Skipping...
Transcript for 20240711_heated_interview already exists. Skipping...
Transcript for 20240708_75_auras already exists. Skipping...
Transcript for 20240704_manners already exists. Skipping...
Transcript for 20240701_74_patriotism_jingoistically_speaking already exists. Skipping...
Transcript for 20240627_heat already exists. Skipping...
Transcript for 20240624_73_do_video_games_make_you_better_lovers already exists. Skipping...
Transcript for 20240620_a_cure_for_thirst already exists. Skipping...
Transcript for 20240617_72_lunch_pt_2 already exists. Skipping...
Transcript for 20240613_the_notion_of_glass already exists. Skippi