In [1]:
import pandas as pd
import numpy as np
import json
import random
import feedparser
import hashlib
from collections import Counter, defaultdict
import time
import socket
import re
from datetime import datetime
import concurrent.futures
import logging

# Logging: only ERROR and above is recorded, mirrored to a file and the console
_log_handlers = [
    logging.FileHandler('podcast_processing.log'),
    logging.StreamHandler(),
]
logging.basicConfig(
    handlers=_log_handlers,
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.ERROR,
)
logger = logging.getLogger(__name__)

# Seed NumPy's global RNG so np.random-based sampling is repeatable
np.random.seed(42)

# Constants
TIMEOUT = 10  # seconds for RSS feed requests

def clean_text(text):
    """
    Reduce arbitrary input to a single-line, ASCII-only, markup-free string.

    Steps, in order:
      1. ``None`` yields ``''``; any other non-string is converted with ``str()``.
      2. Every run of whitespace (including Unicode separators such as
         U+2028, U+2029, U+0085 and no-break space) becomes one space.
      3. All remaining non-ASCII characters are dropped.
      4. HTML/XML tags are replaced by a space.
      5. ``&amp;``, ``&lt;`` and ``&gt;`` are decoded; any other entity
         (``&name;`` / ``&#123;``) is replaced by a space.
      6. Whitespace is collapsed again and the result is stripped.

    Args:
        text: Value to clean; normally a string.

    Returns:
        The cleaned string, or ``''`` if ``text`` is ``None`` or cleaning
        raised an exception (the error is logged).
    """
    if text is None:
        return ''
    try:
        if not isinstance(text, str):
            text = str(text)
        # Round-trip through UTF-8 so unencodable code points (e.g. lone
        # surrogates) are replaced instead of raising later.
        text = text.encode('utf-8', 'replace').decode('utf-8', 'replace')
        # Normalise Unicode whitespace BEFORE stripping non-ASCII characters;
        # otherwise separators are deleted and the neighbouring words fuse.
        text = re.sub(r'\s+', ' ', text)
        text = text.encode('ascii', 'ignore').decode('ascii')
        text = re.sub(r'<[^>]+>', ' ', text)
        text = text.replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>')
        text = re.sub(r'&#?\w+;', ' ', text)
        # Collapse last: the tag/entity replacements above insert extra spaces.
        text = re.sub(r'\s+', ' ', text)
        return text.strip()
    except Exception as e:
        logger.error(f"Error cleaning text: {e}")
        return ''

def process_rss_feed(rss_url, podcast_id):
    """
    Parse an RSS feed and extract its channel-level description.

    Args:
        rss_url: Feed location passed to ``feedparser.parse``.
        podcast_id: Identifier echoed back in the result and used in log messages.

    Returns:
        ``{'podcast_id': podcast_id, 'podcast_description': <cleaned text>}``
        when a non-empty cleaned description was found, otherwise ``None``.
        Any exception is logged and also results in ``None``.
    """
    try:
        # NOTE(review): socket.setdefaulttimeout() is process-wide. When this
        # function runs on several threads at once, the save/restore below can
        # interleave, so a fetch may start with another thread's restored value.
        # Left as-is to keep fetching behaviour unchanged; consider setting the
        # timeout once before starting the worker threads.
        original_timeout = socket.getdefaulttimeout()
        socket.setdefaulttimeout(TIMEOUT)

        try:
            feed = feedparser.parse(rss_url)
        finally:
            socket.setdefaulttimeout(original_timeout)

        if not feed or not hasattr(feed, 'feed'):
            return None

        # Gather every channel-level field that may hold the description.
        channel = feed.feed
        candidate_keys = ('description', 'subtitle', 'summary', 'itunes_summary')

        # The same text can show up under more than one of these keys; keep
        # each distinct value once (first occurrence wins) so the combined
        # description does not repeat itself.
        unique_texts = []
        for key in candidate_keys:
            value = channel.get(key, '')
            if value and value not in unique_texts:
                unique_texts.append(value)

        clean_description = clean_text(' '.join(unique_texts))

        if clean_description:
            return {
                'podcast_id': podcast_id,
                'podcast_description': clean_description
            }
        return None
    except Exception as e:
        logger.error(f"Error processing RSS feed for podcast {podcast_id}: {str(e)}")
        return None

# Pipeline: load podcast metadata, find podcasts that have episodes in
# prod_db.jsonl, sample up to TARGET_SAMPLE_SIZE of them (target genres first),
# fetch their RSS descriptions in parallel, and write poli_sample.jsonl.

TARGET_SAMPLE_SIZE = 100       # number of podcasts wanted in the final sample
MAX_EPISODES_PER_PODCAST = 50  # episodes kept per podcast
MAX_FEED_WORKERS = 32          # upper bound on concurrent RSS fetches

print("Loading podcast metadata...")
# Load podcasts metadata
podcasts_df = pd.read_csv('podcasts_final_sample.csv', low_memory=False)

# Create a map of podcast_id to title and RSS URL for easy lookup
podcast_titles = dict(zip(podcasts_df['podcast_id'], podcasts_df['title']))
podcast_rss_urls = dict(zip(podcasts_df['podcast_id'], podcasts_df['rss']))

# Map podcast_id -> list of genres; rows without a primary genre are left out
podcast_genres = {
    podcast_id: primary_genre.split(', ')
    for podcast_id, primary_genre in zip(podcasts_df['podcast_id'], podcasts_df['primary_genre'])
    if pd.notna(primary_genre)
}

# First, scan prod_db.jsonl to find which podcasts actually have episodes
print("Scanning prod_db.jsonl to identify available podcasts...")
available_podcasts = {}
skipped_lines = 0

# Read the file line by line in binary mode so a badly encoded line cannot
# abort the whole scan
with open('prod_db.jsonl', 'rb') as f:
    for i, line in enumerate(f):
        if i % 1000 == 0 and i > 0:
            print(f"Processed {i} lines...")

        try:
            # Prefer UTF-8 so non-ASCII text is decoded correctly; fall back to
            # Latin-1 (which accepts any byte sequence) only for invalid lines.
            try:
                decoded_line = line.decode('utf-8')
            except UnicodeDecodeError:
                decoded_line = line.decode('latin-1')
            data = json.loads(decoded_line)

            # Only JSON objects carrying a podcast_id and an episode list count
            if not isinstance(data, dict):
                skipped_lines += 1
                continue

            episodes = data.get('episodes')
            if 'podcast_id' in data and isinstance(episodes, list) and episodes:
                podcast_id = data['podcast_id']
                available_podcasts[podcast_id] = {
                    'title': data.get('title', podcast_titles.get(podcast_id, "Unknown")),
                    'episodes': episodes[:MAX_EPISODES_PER_PODCAST],
                }
        except Exception:
            # Deliberate best-effort: a malformed line is skipped, but counted
            # so the loss is visible in the output below.
            skipped_lines += 1
            continue

print(f"Found {len(available_podcasts)} podcasts with episodes in the JSONL file")
if skipped_lines:
    print(f"Skipped {skipped_lines} unreadable or malformed lines")

# Target genres we're interested in
target_genres = ["Business", "Society & Culture", "News"]
print(f"Targeting genres: {', '.join(target_genres)}")

# Identify podcasts that are both in our target genres AND have episodes
target_genre_podcasts = [
    podcast_id
    for podcast_id, genres in podcast_genres.items()
    if podcast_id in available_podcasts
    and any(genre in target_genres for genre in genres)
]

print(f"Found {len(target_genre_podcasts)} podcasts in target genres with episodes")

# Sample from target genres with episodes. .tolist() converts NumPy scalars
# back to native Python values so they hash, compare and JSON-serialize normally.
sample_size = min(TARGET_SAMPLE_SIZE, len(target_genre_podcasts))
if sample_size > 0:
    selected_podcasts = np.random.choice(
        target_genre_podcasts, size=sample_size, replace=False
    ).tolist()
else:
    selected_podcasts = []

# If we need more podcasts to reach the target, add others with episodes
if len(selected_podcasts) < TARGET_SAMPLE_SIZE:
    remaining_needed = TARGET_SAMPLE_SIZE - len(selected_podcasts)
    already_selected = set(selected_podcasts)  # O(1) membership tests
    other_available = [pid for pid in available_podcasts if pid not in already_selected]

    additional_count = min(remaining_needed, len(other_available))
    if additional_count > 0:
        additional_podcasts = np.random.choice(
            other_available, size=additional_count, replace=False
        ).tolist()
        selected_podcasts.extend(additional_podcasts)

print(f"Selected {len(selected_podcasts)} podcasts total")

# Now that we have selected the podcasts, process only their RSS feeds
print("Processing RSS feeds for selected podcasts...")
podcast_descriptions = {}
feeds_to_process = []

for podcast_id in selected_podcasts:
    rss_url = podcast_rss_urls.get(podcast_id)
    # Skips both unknown podcast ids (None) and empty CSV cells (NaN)
    if pd.notna(rss_url):
        feeds_to_process.append((rss_url, podcast_id))

print(f"Processing {len(feeds_to_process)} RSS feeds (only for selected podcasts)...")

# Process feeds in parallel
if feeds_to_process:
    num_workers = min(MAX_FEED_WORKERS, len(feeds_to_process))

    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        future_to_pid = {
            executor.submit(process_rss_feed, url, pid): pid
            for url, pid in feeds_to_process
        }

        completed = concurrent.futures.as_completed(future_to_pid)
        for processed, future in enumerate(completed, start=1):
            pid = future_to_pid[future]

            if processed % 10 == 0:
                print(f"Processed {processed}/{len(feeds_to_process)} RSS feeds...")

            try:
                result = future.result()
                if result and result.get('podcast_description'):
                    podcast_descriptions[pid] = result['podcast_description']
            except Exception as e:
                logger.error(f"Error processing RSS feed for {pid}: {str(e)}")

    print(f"Successfully extracted descriptions for {len(podcast_descriptions)} podcasts")

# Create the final output
final_sample = []
for podcast_id in selected_podcasts:
    if podcast_id in available_podcasts:
        podcast_data = {
            'podcast_id': podcast_id,
            'podcast_title': available_podcasts[podcast_id]['title'],
            'podcast_description': podcast_descriptions.get(podcast_id, ""),
            'episodes': available_podcasts[podcast_id]['episodes']
        }
    else:
        # Fallback (shouldn't happen: every selected id comes from available_podcasts)
        podcast_data = {
            'podcast_id': podcast_id,
            'podcast_title': podcast_titles.get(podcast_id, "Unknown"),
            'podcast_description': podcast_descriptions.get(podcast_id, ""),
            'episodes': []
        }

    final_sample.append(podcast_data)

# Check genre distribution in the final sample
sampled_genres = []
for podcast_id in selected_podcasts:
    if podcast_id in podcast_genres:
        sampled_genres.extend(podcast_genres[podcast_id])

sampled_genre_counts = Counter(sampled_genres)
print("\nGenre distribution in final sample:")
for genre, count in sampled_genre_counts.most_common():
    print(f"{genre}: {count}")

# Count podcasts with descriptions
podcasts_with_descriptions = sum(1 for p in final_sample if p['podcast_description'])
print(f"\nPodcasts with descriptions: {podcasts_with_descriptions} out of {len(final_sample)}")

# Write the final sample to the output file, one JSON object per line
with open('poli_sample.jsonl', 'w', encoding='utf-8') as f:
    for podcast_data in final_sample:
        f.write(json.dumps(podcast_data, ensure_ascii=False) + '\n')

# Statistics about the final sample
podcasts_with_episodes = sum(1 for p in final_sample if len(p['episodes']) > 0)
total_episodes = sum(len(p['episodes']) for p in final_sample)

print(f"\nSaved {len(final_sample)} podcasts to poli_sample.jsonl")
print(f"Podcasts with episodes: {podcasts_with_episodes}")
print(f"Podcasts with no episodes: {len(final_sample) - podcasts_with_episodes}")
print(f"Total episodes in the sample: {total_episodes}")


Loading podcast metadata...
Scanning prod_db.jsonl to identify available podcasts...
Processed 1000 lines...
Processed 2000 lines...
Processed 3000 lines...
Processed 4000 lines...
Processed 5000 lines...
Processed 6000 lines...
Processed 7000 lines...
Processed 8000 lines...
Processed 9000 lines...
Processed 10000 lines...
Processed 11000 lines...
Processed 12000 lines...
Processed 13000 lines...
Processed 14000 lines...
Processed 15000 lines...
Processed 16000 lines...
Processed 17000 lines...
Processed 18000 lines...
Processed 19000 lines...
Processed 20000 lines...
Processed 21000 lines...
Processed 22000 lines...
Processed 23000 lines...
Processed 24000 lines...
Processed 25000 lines...
Processed 26000 lines...
Processed 27000 lines...
Processed 28000 lines...
Processed 29000 lines...
Processed 30000 lines...
Processed 31000 lines...
Processed 32000 lines...
Processed 33000 lines...
Processed 34000 lines...
Processed 35000 lines...
Processed 36000 lines...
Processed 37000 lines...

In [3]:
import pandas as pd
import json

def _count_non_empty(column):
    """Count entries of a Series that are neither missing nor blank strings."""
    has_text = column.astype(str).str.strip().ne('')
    return int((column.notna() & has_text).sum())


def create_podcast_csv(metadata_csv='podcasts_final_sample.csv',
                       sample_jsonl='poli_sample.jsonl',
                       output_filename='podcast_metadata.csv'):
    """
    Export one CSV row per sampled podcast: id, title, description and website.

    Args:
        metadata_csv: CSV with at least ``podcast_id`` and ``website`` columns.
        sample_jsonl: JSONL file with one podcast object per line; the keys
            ``podcast_id``, ``podcast_title`` and ``podcast_description`` are read.
        output_filename: Path of the CSV to write.

    Returns:
        None. The CSV is written to ``output_filename`` and a summary is printed.
    """
    print(f"Loading podcast metadata from {metadata_csv}...")
    podcasts_df = pd.read_csv(metadata_csv, low_memory=False)

    # Create a dictionary mapping podcast_id to website
    podcast_websites = dict(zip(podcasts_df['podcast_id'], podcasts_df['website']))

    print(f"Loading podcast data from {sample_jsonl}...")
    podcasts_data = []

    # Read the JSONL file; blank lines carry no record and are skipped
    with open(sample_jsonl, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                podcasts_data.append(json.loads(line))

    print(f"Found {len(podcasts_data)} podcasts in {sample_jsonl}")

    # Extract the required information for each podcast
    csv_data = []
    for podcast in podcasts_data:
        podcast_id = podcast.get('podcast_id', '')
        csv_data.append({
            # The 'name' column deliberately repeats the podcast id
            'name': podcast_id,
            'podcast_title': podcast.get('podcast_title', ''),
            'podcast_description': podcast.get('podcast_description', ''),
            # Website comes from the metadata CSV; '' when the id is unknown
            'podcast_link': podcast_websites.get(podcast_id, '')
        })

    # Fix the column set explicitly so an empty sample still yields a valid,
    # header-only CSV instead of failing in the summary below.
    output_columns = ['name', 'podcast_title', 'podcast_description', 'podcast_link']
    output_df = pd.DataFrame(csv_data, columns=output_columns)

    # Write to CSV
    output_df.to_csv(output_filename, index=False, encoding='utf-8')

    print(f"Successfully created {output_filename} with {len(output_df)} rows")

    # Summary: missing values are stored as '' (or NaN for links), so count
    # entries that actually contain text rather than merely non-null ones.
    print("\nSummary of extracted data:")
    print(f"Podcasts with titles: {_count_non_empty(output_df['podcast_title'])}")
    print(f"Podcasts with descriptions: {_count_non_empty(output_df['podcast_description'])}")
    print(f"Podcasts with links: {_count_non_empty(output_df['podcast_link'])}")

# Run the export only when this code is executed directly, not when imported.
if __name__ == "__main__":
    create_podcast_csv()


Loading podcast metadata from podcasts_final_sample.csv...
Loading podcast data from poli_sample.jsonl...
Found 100 podcasts in poli_sample.jsonl
Successfully created podcast_metadata.csv with 100 rows

Summary of extracted data:
Podcasts with titles: 100
Podcasts with descriptions: 100
Podcasts with links: 100
