In [None]:
import jsonlines
import pandas as pd
from datetime import datetime
from dateutil import parser
from dateutil.tz import gettz
import math
from tqdm.auto import tqdm
import re

# Map timezone abbreviations found in feed dates to tzinfo objects for
# dateutil's parser. US and Central-European abbreviations are mapped to
# their regional zones, so the DST rules of that region decide the offset.
# GMT/UT/UTC are fixed zero-offset zones and must NOT follow London's
# summer time: mapping 'GMT' to Europe/London would shift every summer
# timestamp labelled "GMT" by one hour.
tzinfos = {
    'EST': gettz('America/New_York'),
    'EDT': gettz('America/New_York'),
    'CST': gettz('America/Chicago'),
    'CDT': gettz('America/Chicago'),
    'MST': gettz('America/Denver'),
    'MDT': gettz('America/Denver'),
    'PST': gettz('America/Los_Angeles'),
    'PDT': gettz('America/Los_Angeles'),
    'BST': gettz('Europe/London'),
    'GMT': gettz('UTC'),
    'CEST': gettz('Europe/Berlin'),
    'CET': gettz('Europe/Berlin'),
    'UT': gettz('UTC'),
    'UTC': gettz('UTC'),
}

def parse_date(date_str):
    """Parse a date string into a naive datetime expressed in UTC.

    Args:
        date_str: Date text in any format dateutil's parser understands.
            Timezone abbreviations are resolved through the module-level
            ``tzinfos`` mapping.

    Returns:
        A timezone-naive ``datetime`` in UTC, or ``None`` when the input is
        empty or cannot be parsed.
    """
    if not date_str:
        return None
    try:
        dt = parser.parse(date_str, tzinfos=tzinfos)
        if dt.tzinfo is None:
            # No zone in the string: treat the value as UTC. Calling
            # astimezone() on a naive datetime would assume the local zone
            # of whatever machine runs this, making results host-dependent.
            return dt
        return dt.astimezone(gettz('UTC')).replace(tzinfo=None)
    except Exception:
        # Deliberate best-effort: one malformed date must not abort the
        # run; callers skip None results.
        return None

def parse_duration(duration_str):
    """Convert an episode duration into a whole number of seconds.

    Accepted forms (surrounding whitespace is ignored):
      * plain seconds, optionally fractional: ``"5675"``, ``5675``, ``"5675.0"``
      * ``MM:SS`` or ``HH:MM:SS``, optionally with fractional seconds:
        ``"58:40"``, ``"1:16:01"``, ``"00:58:40.5"``

    Fractional parts are truncated.

    Args:
        duration_str: The raw duration value; may be a string, a number,
            ``None`` or NaN.

    Returns:
        The duration in seconds as an ``int``. Missing or unrecognised
        values yield ``0`` so that one bad record never aborts the run.
    """
    if pd.isna(duration_str):
        return 0
    text = str(duration_str).strip()

    # Plain seconds. Only the integer part is converted, which avoids a
    # float round-trip and truncates any fraction.
    if re.fullmatch(r'\d+(\.\d+)?', text):
        return int(text.split('.')[0])

    # [HH:]MM:SS with an optional fractional-seconds suffix; whitespace
    # around the colons is tolerated.
    match = re.fullmatch(r'(?:(\d+)\s*:\s*)?(\d+)\s*:\s*(\d+)(?:\.\d+)?', text)
    if not match:
        # Unrecognized format (empty string, "1h 30m", garbage, ...)
        return 0

    hours, minutes, seconds = (int(part) if part else 0 for part in match.groups())
    return hours * 3600 + minutes * 60 + seconds

# Step 1: Load every record of final_podcasts.jsonl and wrap them in a DataFrame
with jsonlines.open('final_podcasts.jsonl') as reader:
    # Iterating the reader yields one decoded JSON object per line.
    podcasts_list = list(reader)

df_podcasts = pd.DataFrame(podcasts_list)

# Step 2: Derive per-podcast statistics (time span covered and total runtime)
output_data = []

print("Processing podcasts...")
for idx, podcast in tqdm(df_podcasts.iterrows(), total=df_podcasts.shape[0], desc="Podcasts"):
    podcast_id = podcast['podcast_id']
    title = podcast['title']
    episodes = podcast['episodes']

    # Keep only the publication dates that could be parsed (parse_date
    # returns None for missing or malformed values).
    parsed_dates = (parse_date(ep.get('published_at')) for ep in episodes)
    episode_dates = [ep_date for ep_date in parsed_dates if ep_date]

    # Sum of all episode durations; unparseable durations contribute 0.
    total_duration_seconds = sum(parse_duration(ep.get('duration')) for ep in episodes)

    if not episode_dates:
        first_date = None
        last_date = None
        months_diff = None
    else:
        first_date = min(episode_dates)
        last_date = max(episode_dates)
        # Span between first and last episode in months, rounded up,
        # using 30.44 as the average number of days per month.
        days_diff = (last_date - first_date).days
        months_diff = math.ceil(days_diff / 30.44)

    # Seconds -> hours
    total_duration_hours = total_duration_seconds / 3600

    # episode_ids is carried along for the guest count lookup in step 3.
    output_data.append({
        'podcast_id': podcast_id,
        'title': title,
        'months_in_dataset': months_diff,
        'total_duration_hours': total_duration_hours,
        'episode_ids': [ep['episode_id'] for ep in episodes]
    })

# Step 3: Calculate number of guests using guests-extract.jsonl
print("Processing guests data...")
# Create a mapping of episode_id to podcast_id for quick lookup
episode_to_podcast = {}
for data in output_data:
    for ep_id in data['episode_ids']:
        episode_to_podcast[ep_id] = data['podcast_id']

# Initialize guest count per podcast
podcast_guest_counts = {data['podcast_id']: 0 for data in output_data}

# Process guests-extract.jsonl
with jsonlines.open('guests-extract.jsonl') as reader:
    for obj in tqdm(reader, desc="Guest entries"):
        podcast_id = obj.get('podcast_id')
        episode_id = obj.get('episode_id')
        # `or []` also covers records where the key is present but null;
        # a plain .get(..., []) default would return None there and make
        # len() raise TypeError.
        guests = obj.get('guests') or []
        # Only count entries whose episode belongs to our dataset AND whose
        # podcast_id agrees with the podcast that episode was loaded under.
        if episode_id in episode_to_podcast and episode_to_podcast[episode_id] == podcast_id:
            # Increment guest count by number of guests in this episode
            podcast_guest_counts[podcast_id] += len(guests)

# Step 4: Build one output row per podcast, in the same order as output_data
print("Compiling final data...")
final_output = [
    {
        'Podcast Title': data['title'],
        'Months in Dataset': data['months_in_dataset'],
        # Podcasts without any matching guest entry report 0.
        'Number of Guests': podcast_guest_counts.get(data['podcast_id'], 0),
        'Total Duration Hours': round(data['total_duration_hours'], 2),
    }
    for data in output_data
]

# Step 5: Write the rows to CSV without the DataFrame index column
df_final = pd.DataFrame(final_output)
df_final.to_csv('podcasts_mail_merge.csv', index=False)

print("\nData has been saved to 'podcasts_mail_merge.csv'")



Publishing Interval  First Episode Date   Last Episode Date   
half-year            2024-04-03           2024-09-13          
half-year            2024-02-15           2024-09-12          
quarter              2024-06-17           2024-09-12          
quarter              2024-06-15           2024-09-08          
half-year            2024-04-09           2024-09-09          
half-year            2024-02-29           2024-09-05          
half-year            2024-02-19           2024-09-11          
half-year            2024-02-27           2024-09-10          
half-year            2024-04-11           2024-09-11          
quarter              2024-08-01           2024-09-13          
quarter              2024-08-06           2024-09-13          
half-year            2024-03-06           2024-09-03          
half-year            2024-03-10           2024-09-09          
quarter              2024-06-01           2024-09-12          
half-year            2024-01-10           2024-09-09   