In [2]:
import json
import csv
from collections import defaultdict

def safe_get(dictionary, key, default=''):
    """Look up *key* in *dictionary*, tolerating a missing (``None``) dictionary.

    Args:
        dictionary: A mapping supporting ``.get``, or ``None``.
        key: The key to look up.
        default: Value returned when *dictionary* is ``None`` or lacks *key*.

    Returns:
        For every key except ``'gender'``: the stored value, or *default*,
        unchanged. For ``'gender'``: the value converted to ``str``, stripped
        of surrounding whitespace and upper-cased, so callers can compare it
        against codes such as ``'F'``. A ``None`` gender yields ``''``.
    """
    value = default if dictionary is None else dictionary.get(key, default)
    if key != 'gender':
        return value
    # An explicit null gender would otherwise stringify to 'NONE'; report it
    # as "unknown" (empty string), the same as a missing key.
    if value is None:
        return ''
    # Strip so values like ' f' or 'F\n' still normalize to 'F'.
    return str(value).strip().upper()

def _tally_people(stats, people):
    """Add one episode's host or guest list to the running *stats* counters."""
    # `people` is None when the key is absent or present with a JSON null;
    # treat both as "nobody listed" instead of failing on iteration.
    for person in people or []:
        if person is None:
            continue
        stats['total'] += 1
        if safe_get(person, 'gender') == 'F':
            stats['female'] += 1
        # Any truthy value under 'African-American' counts as a match.
        if safe_get(person, 'African-American', False):
            stats['african_american'] += 1


def _percentage(part, total):
    """Return *part* as a percentage of *total*, or 0 when *total* is zero."""
    return (part / total * 100) if total > 0 else 0


def calculate_percentages(podcast_data):
    """Compute female / African-American shares of hosts and guests.

    Args:
        podcast_data: Iterable of episode dicts. Each may carry a ``'hosts'``
            and a ``'guests'`` list of person dicts; missing or null lists
            and null entries inside the lists are ignored.

    Returns:
        A dict with the keys ``'host_female_percentage'``,
        ``'host_african_american_percentage'``, ``'guest_female_percentage'``
        and ``'guest_african_american_percentage'``. Each value is a
        percentage in the range 0-100, computed over every person occurrence
        across all episodes (a person appearing in several episodes is counted
        each time). A group with no people yields 0.
    """
    host_stats = {'total': 0, 'female': 0, 'african_american': 0}
    guest_stats = {'total': 0, 'female': 0, 'african_american': 0}

    for episode in podcast_data:
        _tally_people(host_stats, episode.get('hosts'))
        _tally_people(guest_stats, episode.get('guests'))

    return {
        'host_female_percentage': _percentage(host_stats['female'], host_stats['total']),
        'host_african_american_percentage': _percentage(host_stats['african_american'], host_stats['total']),
        'guest_female_percentage': _percentage(guest_stats['female'], guest_stats['total']),
        'guest_african_american_percentage': _percentage(guest_stats['african_american'], guest_stats['total']),
    }

# Read the JSONL extract (one JSON object per line) and group the records
# by podcast.
podcast_data = defaultdict(list)
with open('guests-extract.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        # Blank lines (e.g. a trailing newline at end of file) are not valid
        # JSON and would make json.loads raise, so skip them.
        if not line.strip():
            continue
        data = json.loads(line)
        # Key by str so the ids compare equal to the CSV's ids (csv always
        # yields strings) even if the JSON stores them as numbers.
        podcast_data[str(data['podcast_id'])].append(data)

# Calculate percentages for each podcast
podcast_percentages = {}
for podcast_id, episodes in podcast_data.items():
    podcast_percentages[podcast_id] = calculate_percentages(episodes)

# Read the CSV of podcasts with genres and attach each matched podcast's
# percentages to every genre it is listed under.
genre_data = defaultdict(list)
total_podcasts = 0
# newline='' is what the csv module asks for so that quoted fields containing
# line breaks are parsed correctly. errors='ignore' silently drops bytes that
# are not valid UTF-8.
with open('filtered_podcasts_with_genres.csv', 'r', encoding='utf-8',
          errors='ignore', newline='') as f:
    reader = csv.DictReader(f)
    for row in reader:
        podcast_id = row['podcast_id']
        if podcast_id not in podcast_percentages:
            continue
        total_podcasts += 1
        # Split on bare commas and strip, so "A,B" and "A, B " both work;
        # an empty/missing genres cell contributes no genre at all rather
        # than a bogus '' genre.
        genres = [name.strip() for name in (row['genres'] or '').split(',')]
        for genre in genres:
            if genre:
                genre_data[genre].append(podcast_percentages[podcast_id])

# Average the per-podcast percentages within each genre. This is an
# unweighted mean: every podcast counts once, regardless of how many episodes,
# hosts or guests it has, and a podcast with no hosts (or guests) contributes
# 0 for that group's metrics.
_METRIC_KEYS = (
    'host_female_percentage',
    'host_african_american_percentage',
    'guest_female_percentage',
    'guest_african_american_percentage',
)
genre_averages = {}
for genre, podcasts in genre_data.items():
    total = len(podcasts)
    averages = {
        metric: sum(p[metric] for p in podcasts) / total
        for metric in _METRIC_KEYS
    }
    averages['podcast_count'] = total
    # Share of all matched podcasts; a podcast listed under several genres is
    # counted in each, so these shares can add up to more than 100%.
    averages['podcast_percentage'] = (total / total_podcasts) * 100
    genre_averages[genre] = averages

# Sort genres by podcast count in descending order
sorted_genres = sorted(genre_averages.items(), key=lambda x: x[1]['podcast_count'], reverse=True)

# Write results to CSV
with open('genre_breakdown.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow([
        'Genre',
        'Host Female %', 'Host African-American %',
        'Guest Female %', 'Guest African-American %',
        'Podcast Count', 'Podcast %'
    ])

    for genre, averages in sorted_genres:
        writer.writerow([
            genre,
            f"{averages['host_female_percentage']:.2f}%",
            f"{averages['host_african_american_percentage']:.2f}%",
            f"{averages['guest_female_percentage']:.2f}%",
            f"{averages['guest_african_american_percentage']:.2f}%",
            averages['podcast_count'],
            f"{averages['podcast_percentage']:.2f}%"
        ])

print("Genre breakdown has been written to 'genre_breakdown.csv'")


Genre breakdown has been written to 'genre_breakdown.csv'
