In [None]:
"""
Podcast RSS Feed Processor
A script to process podcast RSS feeds and store episode data in PostgreSQL database.
"""

import feedparser
import pandas as pd
import threading
import queue
from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime, UniqueConstraint
from sqlalchemy.orm import declarative_base, sessionmaker
from sqlalchemy.dialects.postgresql import insert
import logging
import os
from datetime import datetime
import hashlib
from concurrent.futures import ThreadPoolExecutor
import time
from typing import List, Dict, Any
from contextlib import contextmanager

# Configure logging
# Root logger: INFO and above, formatted as "<time> - <level> - <message>".
# Records go both to the file 'podcast_processing.log' (opened relative to
# the current working directory) and to a StreamHandler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('podcast_processing.log'),
        logging.StreamHandler()  # Added for console output
    ]
)
# Module-level logger used by the classes and functions in this cell.
logger = logging.getLogger(__name__)

# Database Configuration
# Declarative base that every ORM model in this cell inherits from;
# Base.metadata is what DatabaseManager.init_db() passes to create_all().
Base = declarative_base()

class Episode(Base):
    """SQLAlchemy model for podcast episodes.

    One row per episode, stored in the ``episodes`` table.  The pair
    (podcast_id, episode_id) is declared unique; the bulk insert in
    ``PodcastProcessor.batch_insert_worker`` uses that pair as its
    ON CONFLICT target.
    """
    __tablename__ = 'episodes'
    __table_args__ = (
        # Uniqueness of (podcast_id, episode_id) is what makes re-inserting an
        # already-stored episode a no-op.
        UniqueConstraint('podcast_id', 'episode_id', name='_podcast_episode_uc'),
    )
    
    # Surrogate auto-incrementing primary key.
    id = Column(Integer, primary_key=True, autoincrement=True)
    # Identifier of the podcast, taken from the input CSV's podcast_id column.
    podcast_id = Column(String, index=True)
    # SHA-256 hex digest produced by PodcastProcessor.generate_episode_id().
    episode_id = Column(String)
    episode_title = Column(Text)
    episode_description = Column(Text)
    # Naive datetime built from the feed entry's published_parsed tuple
    # (None when the entry has no publication date).
    episode_published = Column(DateTime)
    episode_link = Column(String)
    # Stored as the raw itunes_duration string from the feed, not parsed.
    episode_duration = Column(String)

class DatabaseManager:
    """Manages database connections and operations.

    Connection settings come from the ``DB_USER``, ``DB_PASSWORD``,
    ``DB_HOST``, ``DB_PORT`` and ``DB_NAME`` environment variables, each with
    a development fallback.

    Attributes:
        engine: SQLAlchemy engine with a fixed pool of 30 connections
            (no overflow) and pre-ping enabled.
        Session: session factory bound to ``engine``.
    """

    def __init__(self):
        self.engine = create_engine(
            self._build_url(),
            pool_size=30,
            max_overflow=0,
            pool_pre_ping=True
        )
        self.Session = sessionmaker(bind=self.engine)

    @staticmethod
    def _build_url() -> str:
        """Return the PostgreSQL connection URL built from the environment.

        The user name and password are percent-encoded so that characters
        such as ``@``, ``:``, ``/`` or ``#`` in a credential cannot be
        mistaken for URL delimiters.
        """
        # Imported here so the fix does not depend on the file's import block.
        from urllib.parse import quote

        # NOTE(review): a real-looking password is hard-coded as the fallback.
        # It is kept for backward compatibility, but it should be rotated and
        # supplied through DB_PASSWORD only.
        user = quote(os.getenv("DB_USER", "jcervantez"), safe="")
        password = quote(os.getenv("DB_PASSWORD", "Cervantez12"), safe="")
        host = os.getenv("DB_HOST", "localhost")
        port = os.getenv("DB_PORT", "5432")
        name = os.getenv("DB_NAME", "podcast_episodes")
        return f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{name}"

    def init_db(self):
        """Initialize database tables."""
        Base.metadata.create_all(self.engine)

    @contextmanager
    def session_scope(self):
        """Provide a transactional scope around a series of operations.

        Yields:
            A new session.  It is committed when the ``with`` body finishes
            normally, rolled back (and the exception re-raised) when the body
            raises, and always closed afterwards.
        """
        session = self.Session()
        try:
            yield session
            session.commit()
        except Exception:
            session.rollback()
            raise
        finally:
            session.close()

class PodcastProcessor:
    """Handles the processing of podcast feeds.

    Feed threads call :meth:`process_feed`, which parses one RSS feed and
    pushes lists of episode dicts onto ``episode_queue``.  Writer threads run
    :meth:`batch_insert_worker`, which pops those lists and bulk-inserts them
    into the ``episodes`` table.
    """

    def __init__(self, db_manager: "DatabaseManager"):
        self.db_manager = db_manager
        # Maximum number of episode rows per queued batch / INSERT statement.
        self.batch_size = 100
        # Bounded so feed threads block in put() when the writers fall behind.
        self.episode_queue = queue.Queue(maxsize=1000)
        # Set by the coordinator to make idle writers leave their loop.
        self.should_stop = threading.Event()

    @staticmethod
    def generate_episode_id(podcast_id: str, entry: Dict[str, Any]) -> str:
        """Generate a unique hash for an episode.

        The id is the SHA-256 hex digest of the podcast id concatenated with
        the entry's stripped title, link, ``published_parsed`` (as text) and
        stripped description.  Characters that cannot be encoded as UTF-8 are
        dropped before hashing.
        """
        # NOTE: the recipe is deliberately unchanged; altering it would give
        # already-stored episodes new ids and defeat the unique constraint.
        components = [
            podcast_id,
            entry.get('title', '').strip(),
            entry.get('link', '').strip(),
            str(entry.get('published_parsed', '')),
            entry.get('description', '').strip()
        ]
        unique_str = ''.join(components).encode('utf-8', 'ignore')
        return hashlib.sha256(unique_str).hexdigest()

    def process_entry(self, entry: Dict[str, Any], podcast_id: str) -> Dict[str, Any]:
        """Process a single feed entry.

        Args:
            entry: one feed entry (dict-like, as produced by feedparser).
            podcast_id: id of the podcast the entry belongs to.

        Returns:
            A dict whose keys match the ``Episode`` columns, or ``None`` when
            the entry could not be converted (the error is logged).
        """
        try:
            published_parsed = entry.get('published_parsed')
            published_date = datetime(*published_parsed[:6]) if published_parsed else None

            # First non-empty text among the possible description fields.
            # ``or`` also skips fields that are present but None, which would
            # otherwise make .strip() fail and drop the whole episode.
            description = (
                entry.get('description')
                or entry.get('summary')
                or entry.get('itunes_summary')
                or ''
            )

            return {
                'podcast_id': podcast_id,
                'episode_id': self.generate_episode_id(podcast_id, entry),
                'episode_title': entry.get('title', '').strip(),
                'episode_description': description.strip(),
                'episode_published': published_date,
                'episode_link': entry.get('link', '').strip(),
                'episode_duration': entry.get('itunes_duration', '').strip()
            }
        except Exception as e:
            logger.error(f"Error processing entry: {e}")
            return None

    def process_feed(self, rss_url: str, podcast_id: str) -> None:
        """Process a single podcast feed.

        Parses ``rss_url``, converts every entry with :meth:`process_entry`
        and queues the resulting rows in lists of at most ``batch_size``.
        Any exception is logged and swallowed so one bad feed does not stop
        the run.
        """
        try:
            feed = feedparser.parse(rss_url)
            if not feed.entries:
                logger.info(f"No entries found for podcast {podcast_id}")
                return

            episodes_batch = []
            for entry in feed.entries:
                episode_data = self.process_entry(entry, podcast_id)
                if episode_data:
                    episodes_batch.append(episode_data)

                if len(episodes_batch) >= self.batch_size:
                    # put() blocks while the queue is full (back-pressure).
                    self.episode_queue.put(episodes_batch)
                    episodes_batch = []

            # Flush the final, partially filled batch.
            if episodes_batch:
                self.episode_queue.put(episodes_batch)

        except Exception as e:
            logger.error(f"Error processing feed {podcast_id}: {e}")

    def batch_insert_worker(self) -> None:
        """Worker to handle batch insertions to database.

        Loops until ``should_stop`` is set or a ``None`` sentinel is dequeued.
        Each dequeued batch is inserted with ``ON CONFLICT DO NOTHING`` on
        (podcast_id, episode_id).  A failed insert is logged and the batch is
        dropped; the worker keeps running.
        """
        while not self.should_stop.is_set():
            try:
                # Time out periodically so should_stop is re-checked.
                batch = self.episode_queue.get(timeout=5)
            except queue.Empty:
                continue

            # Exactly one task_done() per successful get(): calling it twice
            # lets Queue.join() return early and eventually raises ValueError,
            # which used to kill the worker after the first failed insert.
            try:
                if batch is None:
                    break

                with self.db_manager.session_scope() as session:
                    stmt = insert(Episode).values(batch)
                    stmt = stmt.on_conflict_do_nothing(
                        index_elements=['podcast_id', 'episode_id']
                    )
                    session.execute(stmt)
            except Exception as e:
                logger.error(f"Error in batch insert: {e}")
            finally:
                self.episode_queue.task_done()

def process_podcasts(csv_path: str = 'podcasts_sample.csv', max_workers: int = 20):
    """Main processing function.

    Reads the podcast list from ``csv_path`` (the code uses its ``rss`` and
    ``podcast_id`` columns), parses every feed on a thread pool and stores the
    episodes through a fixed set of database writer threads.

    Args:
        csv_path: path of the CSV file listing the podcasts.
        max_workers: number of threads used to download and parse feeds.

    Raises:
        Exception: any error outside per-feed processing (database setup,
            reading the CSV, ...) is logged as critical and re-raised.
    """
    start_time = time.time()
    logger.info("Starting podcast processing")

    try:
        # Initialize components
        db_manager = DatabaseManager()
        db_manager.init_db()
        processor = PodcastProcessor(db_manager)

        # Load podcast data
        podcasts_df = pd.read_csv(csv_path, low_memory=False)
        total_podcasts = len(podcasts_df)
        logger.info(f"Found {total_podcasts} podcasts to process")

        # Start batch insert workers
        num_insert_workers = 5
        insert_workers = []
        for _ in range(num_insert_workers):
            worker = threading.Thread(target=processor.batch_insert_worker)
            # Daemon threads cannot keep the interpreter alive on shutdown.
            worker.daemon = True
            worker.start()
            insert_workers.append(worker)

        try:
            # Process feeds
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                futures = [
                    executor.submit(processor.process_feed, row.rss, row.podcast_id)
                    for row in podcasts_df.itertuples()
                ]

                # Monitor progress
                completed = 0
                failed = 0
                for future in futures:
                    try:
                        # The timeout only bounds this wait; it does not
                        # cancel a feed that is still being fetched.
                        future.result(timeout=300)  # 5-minute timeout per feed
                        completed += 1
                        if completed % 100 == 0:
                            logger.info(f"Processed {completed}/{total_podcasts} feeds")
                    except Exception as e:
                        failed += 1
                        logger.error(f"Feed processing failed: {e}")

            if failed:
                logger.warning(f"{failed}/{total_podcasts} feeds failed or timed out")

            # Wait for all tasks to be processed
            processor.episode_queue.join()

        finally:
            # Signal workers to stop
            processor.should_stop.set()
            for _ in range(num_insert_workers):
                try:
                    # Never block here: on an abnormal exit the bounded queue
                    # may be full while the workers have already left their
                    # loop, and a blocking put() would hang forever.
                    processor.episode_queue.put_nowait(None)
                except queue.Full:
                    # should_stop is set, so workers exit without a sentinel.
                    break

            # Wait for workers to finish
            for worker in insert_workers:
                worker.join(timeout=5)

    except Exception as e:
        logger.error(f"Critical error: {e}")
        raise
    finally:
        processing_time = time.time() - start_time
        logger.info(f"Processing completed in {processing_time:.2f} seconds")

def main():
    """Run the pipeline with the default CSV file and worker count.

    A Ctrl-C is reported as an informational message; any other failure is
    logged as an error.  Neither is re-raised.
    """
    run_options = {'csv_path': 'podcasts_sample.csv', 'max_workers': 20}
    try:
        process_podcasts(**run_options)
    except KeyboardInterrupt:
        logger.info("Processing interrupted by user")
    except Exception as e:
        logger.error(f"Processing failed: {e}")


# Only start processing when executed as a script, not when imported.
if __name__ == "__main__":
    main()


2025-01-29 08:21:07,780 - INFO - Starting podcast processing
2025-01-29 08:21:09,511 - INFO - Found 117527 podcasts to process
2025-01-29 08:21:12,625 - INFO - No entries found for podcast 98e74c75ebb745a2ac94190e177c0bef
2025-01-29 08:21:18,811 - INFO - No entries found for podcast 6129d67770c544c994893fcb6423a19f
2025-01-29 08:21:27,445 - INFO - No entries found for podcast c044d3053396405d820893646f6a0604
2025-01-29 08:21:31,726 - INFO - No entries found for podcast f1597fcde1f5409f8907084552697c00
2025-01-29 08:21:32,250 - INFO - No entries found for podcast 818dc4fd7d154f049abfefb65fb5d4e0
2025-01-29 08:21:38,675 - INFO - No entries found for podcast b4d3983f995b48c7a9bb7c0d72391eae
2025-01-29 08:21:39,234 - INFO - No entries found for podcast 188de3b7bbcf45d089d310c960f3f81d
2025-01-29 08:21:40,192 - INFO - No entries found for podcast 9cec42394af64604899261c4134d9e19
2025-01-29 08:21:41,247 - INFO - No entries found for podcast 4cf1100f7d8d48818c1ea79c97a40999
2025-01-29 08:21:4

In [1]:
import os
import re
import pandas as pd
from sqlalchemy import create_engine, text
import logging
from sqlalchemy.exc import IntegrityError

# Configure logging: INFO and above, "<time> - <level> - <message>".
logging.basicConfig(
    level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s'
)

# Imported here (rather than in the cell's import block) so this block is
# self-contained.
from urllib.parse import quote

# Database credentials, taken from the environment with development fallbacks.
# NOTE(review): a real-looking password is hard-coded as the fallback; it is
# kept for backward compatibility but should be rotated and supplied through
# DB_PASSWORD only.
db_user = os.getenv('DB_USER', 'jcervantez')
db_password = os.getenv('DB_PASSWORD', 'Cervantez12')
db_host = os.getenv('DB_HOST', 'localhost')
db_port = os.getenv('DB_PORT', '5432')
db_name = os.getenv('DB_NAME', 'podcast_episodes')

# Create the PostgreSQL database engine.
# User name and password are percent-encoded so that characters such as
# '@', ':', '/' or '#' in a credential cannot be mistaken for URL delimiters.
engine = create_engine(
    f'postgresql+psycopg2://{quote(db_user, safe="")}:{quote(db_password, safe="")}'
    f'@{db_host}:{db_port}/{db_name}',
)

# Number of rows read from the episodes table per chunk; lower it if memory
# is tight.
chunksize = 100000  # Adjust as needed

def clean_date_string(date_str):
    """Strip a trailing upper-case time-zone abbreviation from a date string.

    Null values and non-string values (e.g. Timestamp/datetime objects) are
    returned unchanged.  For strings, surrounding whitespace is removed and a
    final word of 3 to 5 capital letters (such as ``EST``) is dropped.
    """
    # Anything that is missing or is not text passes straight through.
    if pd.isnull(date_str) or not isinstance(date_str, str):
        return date_str

    # Remove unrecognized time zone abbreviations at the very end.
    without_zone = re.sub(r'\b[A-Z]{3,5}$', '', date_str.strip())
    return without_zone.strip()

def process_chunk(chunk):
    """Return the rows of ``chunk`` that belong in ``episodes_recent``.

    A row is kept when its ``episode_published`` value parses as a date in
    2024 or 2025 and its ``episode_description`` is neither null nor blank.
    Rows repeating an earlier (podcast_id, episode_id) pair within the chunk
    are dropped.

    Args:
        chunk: DataFrame with at least the columns ``episode_published``,
            ``episode_description``, ``podcast_id`` and ``episode_id``.
            It is not modified.

    Returns:
        A DataFrame with the same columns as ``chunk`` holding the kept rows,
        or an empty ``pd.DataFrame()`` when nothing survives a filter stage.
    """
    # Derive the parsed dates as a separate Series instead of writing
    # temporary columns into the caller's DataFrame.
    cleaned_dates = chunk['episode_published'].apply(clean_date_string)
    parsed_dates = pd.to_datetime(cleaned_dates, errors='coerce', utc=True)

    # Unparseable dates become NaT; stop early if nothing parsed.
    if not parsed_dates.notna().any():
        return pd.DataFrame()

    # Filter episodes from 2024 and 2025 (NaT compares as out of range).
    recent = chunk[parsed_dates.dt.year.between(2024, 2025)]
    if recent.empty:
        return pd.DataFrame()

    # Remove episodes with empty descriptions (null, empty or whitespace).
    descriptions = recent['episode_description']
    filtered_chunk = recent[descriptions.notna() & (descriptions.str.strip() != '')]
    if filtered_chunk.empty:
        return pd.DataFrame()

    # Remove duplicates within the chunk
    return filtered_chunk.drop_duplicates(subset=['podcast_id', 'episode_id'])

# Rebuild episodes_recent from scratch and copy the filtered episodes into it.
try:
    # Enable autocommit mode so each DDL statement takes effect immediately.
    with engine.connect().execution_options(isolation_level="AUTOCOMMIT") as conn:
        # Drop the existing episodes_recent table to start fresh
        conn.execute(text("DROP TABLE IF EXISTS episodes_recent;"))
        logging.info("Dropped existing episodes_recent table.")

        # Create episodes_recent table with the same schema as episodes
        conn.execute(text("""
        CREATE TABLE episodes_recent AS TABLE episodes WITH NO DATA;
        """))
        logging.info("Created new episodes_recent table.")

        # Add unique constraint to episodes_recent table
        conn.execute(text("""
        ALTER TABLE episodes_recent
        ADD CONSTRAINT unique_podcast_episode UNIQUE (podcast_id, episode_id);
        """))
        logging.info("Added unique constraint to episodes_recent table.")

    # Use a generator to read data in chunks
    failed_chunks = 0
    for chunk in pd.read_sql_table('episodes', engine, chunksize=chunksize):
        try:
            filtered_chunk = process_chunk(chunk)
            if not filtered_chunk.empty:
                # Insert data into episodes_recent
                filtered_chunk.to_sql(
                    'episodes_recent', engine, if_exists='append', index=False, method='multi'
                )
        except Exception as e:
            # A failing chunk is skipped entirely; count it so the loss is
            # visible at the end instead of being buried in the log.
            failed_chunks += 1
            logging.error(f"Error processing a chunk: {e}")

    if failed_chunks:
        logging.warning(
            f"{failed_chunks} chunk(s) failed and were not copied to episodes_recent."
        )
except Exception as e:
    logging.error(f"Error in processing: {e}")
    # exit() is an interactive-shell helper (and is redefined by IPython);
    # raise SystemExit directly to stop with status 1.
    raise SystemExit(1) from e

# After processing all chunks, delete episodes from podcasts with fewer than 10 episodes
try:
    with engine.connect().execution_options(isolation_level="AUTOCOMMIT") as conn:
        # Delete episodes from podcasts with fewer than 10 episodes in episodes_recent.
        # The subquery lists the podcast ids to remove; DELETE ... USING joins
        # against it.
        delete_query = text("""
        DELETE FROM episodes_recent er
        USING (
            SELECT podcast_id
            FROM episodes_recent
            GROUP BY podcast_id
            HAVING COUNT(*) < 10
        ) pc
        WHERE er.podcast_id = pc.podcast_id;
        """)
        conn.execute(delete_query)
        logging.info("Deleted episodes from podcasts with fewer than 10 episodes.")
except Exception as e:
    logging.error(f"Error deleting episodes: {e}")
    # exit() is an interactive-shell helper (and is redefined by IPython);
    # raise SystemExit directly to stop with status 1.
    raise SystemExit(1) from e

# Get the total number of unique podcasts in episodes_recent
try:
    with engine.connect() as conn:
        result = conn.execute(text("""
        SELECT COUNT(DISTINCT podcast_id) FROM episodes_recent;
        """))
        unique_podcast_count = result.scalar()
        # Reported twice on purpose: once through logging, once on stdout.
        logging.info(
            f"Total number of unique podcasts in episodes_recent: {unique_podcast_count}"
        )
        print(f"Total number of unique podcasts in episodes_recent: {unique_podcast_count}")
except Exception as e:
    logging.error(f"Error retrieving unique podcast count: {e}")
    # exit() is an interactive-shell helper (and is redefined by IPython);
    # raise SystemExit directly to stop with status 1.
    raise SystemExit(1) from e

logging.info("episodes_recent table created successfully.")


2025-01-29 14:04:01,223 - INFO - Dropped existing episodes_recent table.
2025-01-29 14:04:01,231 - INFO - Created new episodes_recent table.
2025-01-29 14:04:01,234 - INFO - Added unique constraint to episodes_recent table.
2025-01-29 14:15:22,568 - INFO - Deleted episodes from podcasts with fewer than 10 episodes.
2025-01-29 14:15:30,585 - INFO - Total number of unique podcasts in episodes_recent: 76552
2025-01-29 14:15:30,587 - INFO - episodes_recent table created successfully.


Total number of unique podcasts in episodes_recent: 76552


In [1]:
import os
import subprocess
import gzip
from datetime import datetime
import getpass
import shlex

# Connection settings: environment first, development fallback second.
db_user = os.environ.get('DB_USER', 'jcervantez')
db_password = os.environ.get('DB_PASSWORD', 'Cervantez12')
db_host = os.environ.get('DB_HOST', 'localhost')
db_port = os.environ.get('DB_PORT', '5432')
db_name = os.environ.get('DB_NAME', 'podcast_episodes')

# Location of the pg_dump binary (Windows install of PostgreSQL 17).
pg_dump_path = r'C:\Program Files\PostgreSQL\17\bin\pg_dump.exe'

# Dump file names carry the start time so repeated runs never collide.
# NOTE(review): the file is named "episodes_..." while the table dumped below
# is 'podcasts_even' -- confirm which of the two is intended.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
output_sql_file = 'episodes_' + timestamp + '.sql'
compressed_file = output_sql_file + '.gz'

# pg_dump argument vector: executable, connection flags, dump flags, database.
pg_dump_cmd = (
    [pg_dump_path]
    + ['-h', db_host, '-p', db_port, '-U', db_user]
    + [
        '-t', 'podcasts_even',  # only this table
        '-F', 'p',              # plain SQL output
        '--no-owner',
        '--no-privileges',
    ]
    + [db_name]
)

# The command is passed to subprocess.run as an argument list (shell=False), so the
# spaces in pg_dump_path need no quoting: neither shell=True nor shlex.quote is required.

# Environment variable for PostgreSQL password: pg_dump reads PGPASSWORD, so
# the password never appears on the command line.
env = os.environ.copy()
env['PGPASSWORD'] = db_password

# Becomes True once pg_dump has written a complete dump file.
dump_completed = False

try:
    # Run pg_dump, writing the dump to the .sql file and capturing stderr so
    # the reason for a failure can be shown.
    with open(output_sql_file, 'wb') as out_sql_file:
        subprocess.run(
            pg_dump_cmd,
            env=env,
            stdout=out_sql_file,
            stderr=subprocess.PIPE,
            check=True,
        )
    dump_completed = True
    print(f"Successfully exported {output_sql_file}")

    # Compress the SQL file using gzip
    with open(output_sql_file, 'rb') as f_in, gzip.open(compressed_file, 'wb') as f_out:
        f_out.writelines(f_in)
    print(f"Successfully compressed to {compressed_file}")

    # Optionally, remove the uncompressed SQL file
    os.remove(output_sql_file)
    print(f"Removed uncompressed SQL file {output_sql_file}")

except subprocess.CalledProcessError as e:
    print(f"An error occurred while running pg_dump: {e}")
    if e.stderr:
        # pg_dump's own diagnostics (bad password, missing table, ...).
        print(e.stderr.decode(errors='replace').strip())
except Exception as e:
    print(f"An unexpected error occurred: {e}")
finally:
    # A dump that did not finish is truncated or empty; do not leave it
    # behind where it could be mistaken for a valid export.
    if not dump_completed and os.path.exists(output_sql_file):
        os.remove(output_sql_file)


Successfully exported episodes_20250131_065139.sql
Successfully compressed to episodes_20250131_065139.sql.gz
Removed uncompressed SQL file episodes_20250131_065139.sql


In [1]:
#!/usr/bin/env python3
"""
Script to debug podcast_episodes.sql for problematic encoding issues in descriptions.

This script will:
1. Open the podcast_episodes.sql file in binary mode.
2. Search for occurrences of problematic byte sequences (e.g., 0x8F)
   that might indicate problematic text in the description.
3. Print out the count and context (a snippet around the occurrence) for each.
"""

import re

def debug_sql_file(file_path):
    """Report every occurrence of a suspicious byte sequence in a file.

    The file is read in binary mode.  For each sequence in
    ``problematic_bytes`` the number of occurrences is printed, followed by
    one line per occurrence showing its byte offset and the surrounding
    bytes decoded as UTF-8 (undecodable bytes appear as U+FFFD).

    Args:
        file_path: path of the SQL dump to scan.

    Returns:
        None.  Results, and any error opening or reading the file, are
        printed to stdout.
    """
    problematic_bytes = {
        b'\x8f': "0x8F",
        # Add other byte sequences if you suspect additional ones
    }
    # Number of bytes shown on each side of a match.
    context = 20

    try:
        # NOTE: the whole file is loaded into memory.
        with open(file_path, "rb") as f:
            data = f.read()
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")
        return

    for byte_seq, desc in problematic_bytes.items():
        matches = [m.start() for m in re.finditer(re.escape(byte_seq), data)]
        print(f"Found {len(matches)} occurrences of byte {desc}.")
        for pos in matches:
            # Context bytes before the match and after its END, so the
            # window is symmetric (the old ``pos + 20`` counted the matched
            # byte itself and showed only 19 bytes after it).
            start = max(0, pos - context)
            end = pos + len(byte_seq) + context
            # errors="replace" equals strict decoding whenever the slice is
            # valid UTF-8, so no separate strict attempt is needed.
            snippet_decoded = data[start:end].decode("utf-8", errors="replace")
            print(f"At position {pos}: ...{snippet_decoded}...")

# Script entry point: scan one specific dump file and report what was found.
if __name__ == "__main__":
    # Dump to inspect; the path is relative to the current working directory.
    sql_file = "podcast_episodes_20250220_085140.sql"
    print(f"Scanning {sql_file} for problematic byte sequences...\n")
    debug_sql_file(sql_file)
    print("\nDone scanning!")


Scanning podcast_episodes_20250220_085140.sql for problematic byte sequences...

Found 0 occurrences of byte 0x8F.

Done scanning!
