In [None]:
import json
import csv
from tqdm import tqdm

# Mail-merge wording for each review label, kept in one table so the two
# public dictionaries below cannot drift apart:
#     review label -> (lookback phrase, interval phrase)
_REVIEW_LABEL_WORDING = {
    "1-Year in review": ("one year", "year"),
    "6-Month lookback": ("6 month", "6 months"),
    "Quarter in review": ("3 month", "3 months"),
    "2-Month lookback": ("2 month", "2 months"),
    "1-Month in review": ("1 month", "1 month"),
}

# Review label -> text for the "lookback" column.
lookback_mapping = {
    label: lookback for label, (lookback, _) in _REVIEW_LABEL_WORDING.items()
}

# Review label -> text for the "interval" column.
interval_mapping = {
    label: interval for label, (_, interval) in _REVIEW_LABEL_WORDING.items()
}

def safe_json_loads(line):
    """Decode one JSONL line without raising on malformed input.

    Leading and trailing whitespace is stripped before decoding. When the
    text is not valid JSON a warning is printed and ``None`` is returned.
    Note that a line containing the JSON literal ``null`` also yields
    ``None``.
    """
    text = line.strip()
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError as err:
        print(f"Warning: Could not parse JSON line: {err}")
        parsed = None
    return parsed

def create_guest_lookup(guest_jsonl_file):
    """
    Build a mapping of episode_id -> guest count from a guest JSONL file.

    Blank lines are ignored. A line that cannot be processed (invalid JSON,
    a non-numeric "guests" value, ...) is reported with a warning and
    skipped. Records without an "episode_id" are not stored. When the same
    episode_id occurs more than once, the last occurrence wins. A missing
    file produces a warning and an empty mapping.
    """
    counts_by_episode = {}
    try:
        with open(guest_jsonl_file, "r", encoding="utf-8") as source:
            for number, raw_line in enumerate(source, start=1):
                text = raw_line.strip()
                if text:
                    try:
                        entry = json.loads(text)
                        key = entry.get("episode_id")
                        # Converted before the key check so a bad count is
                        # reported even for records without an episode_id.
                        count = int(entry.get("guests", 0))
                        if key is None:
                            continue
                        counts_by_episode[key] = count
                    except Exception as err:
                        print(f"Warning: Error processing guest line {number}: {err}")
    except FileNotFoundError:
        print(f"Warning: Guest file {guest_jsonl_file} not found")
    return counts_by_episode

def parse_duration(duration_str):
    """Convert a duration value into a whole number of seconds.

    Accepted forms are ``HH:MM:SS``, ``MM:SS`` and a bare number of seconds,
    given either as a string or as a numeric value. Fields may carry a
    fractional part (``"00:01:30.5"``, ``"90.7"``, ``1800.0``); the total is
    truncated toward zero.

    Args:
        duration_str: The raw duration. Anything falsy (``None``, ``""``,
            ``0``) counts as zero.

    Returns:
        int: Total seconds, or 0 when the value is empty, has more than
        three colon-separated fields, or contains a field that is not a
        finite number.
    """
    if not duration_str:
        return 0

    fields = str(duration_str).strip().split(":")
    if len(fields) > 3:
        return 0

    def to_number(text):
        # Prefer int so whole values stay exact; fall back to float for
        # fractional values such as "30.5".
        try:
            return int(text)
        except ValueError:
            return float(text)

    try:
        total = 0
        # Each field is worth 60x the one to its right, so folding left to
        # right yields h*3600 + m*60 + s (or m*60 + s, or just s).
        for field in fields:
            total = total * 60 + to_number(field)
        return int(total)
    except (ValueError, OverflowError):
        # ValueError: non-numeric field or NaN; OverflowError: infinity.
        return 0

def process_mail_merge(main_jsonl, guest_jsonl, output_csv):
    """
    Build the mail-merge CSV from the podcast JSONL file and the guest JSONL file.

    One row is written per podcast record with the columns podcastID, email,
    podcastName, num_episodes, num_guests, duration, interval and lookback.
    ``num_guests`` sums the guest counts of the podcast's episodes that are
    present in the guest lookup; ``duration`` is the summed episode duration
    rendered as "<hours> hours of audio". After the podcast rows, the last
    written row is repeated once per hard-coded test address.

    Blank lines in the podcast file are ignored. Lines that fail to decode,
    or that decode to something other than a JSON object, are skipped with a
    warning. A missing or null "episodes" value is treated as no episodes, so
    a single incomplete record no longer aborts the run half-way through the
    CSV.

    Args:
        main_jsonl: Path of the podcast JSONL file (one record per line).
        guest_jsonl: Path of the guest JSONL file, read by create_guest_lookup.
        output_csv: Path of the CSV file to (over)write.

    Returns:
        None. Problems are reported with print() rather than raised.
    """
    # Create guest lookup dictionary
    guest_lookup = create_guest_lookup(guest_jsonl)

    try:
        with open(main_jsonl, "r", encoding="utf-8") as infile, \
             open(output_csv, "w", newline='', encoding="utf-8") as outfile:

            csvwriter = csv.writer(outfile)
            csvwriter.writerow(["podcastID", "email", "podcastName", "num_episodes",
                                "num_guests", "duration", "interval", "lookback"])

            # Read all podcast records up front so tqdm knows the total.
            podcasts = []
            for line_num, line in enumerate(infile, 1):
                if not line.strip():
                    # Same treatment as create_guest_lookup: blank lines are
                    # not an error.
                    continue
                podcast = safe_json_loads(line)
                # Only JSON objects can be read with .get() below.
                if isinstance(podcast, dict):
                    podcasts.append(podcast)
                else:
                    print(f"Warning: Skipping invalid JSON at line {line_num}")

            print(f"Processing {len(podcasts)} podcasts...")

            last_row = None
            for podcast in tqdm(podcasts, desc="Processing podcasts"):
                podcastID = podcast.get("podcast_id", "")
                email = podcast.get("email", "")
                podcastName = podcast.get("title", "")
                num_episodes = podcast.get("num_episodes", 0)
                original_interval = podcast.get("review_label", "")

                # Unknown labels fall through unchanged.
                interval = interval_mapping.get(original_interval, original_interval)
                lookback = lookback_mapping.get(original_interval, original_interval)

                # "or []" also covers an explicit null in the record.
                episodes = podcast.get("episodes") or []

                # Episodes missing from the guest lookup contribute 0 guests.
                total_guests = sum(
                    guest_lookup.get(episode.get("episode_id"), 0)
                    for episode in episodes
                )

                total_duration_seconds = sum(
                    parse_duration(episode.get("duration", "0"))
                    for episode in episodes
                )
                total_hours = total_duration_seconds / 3600.0
                duration_text = f"{total_hours:.2f} hours of audio"

                row_data = [
                    podcastID, email, podcastName, num_episodes,
                    total_guests, duration_text, interval, lookback
                ]
                csvwriter.writerow(row_data)
                last_row = row_data

            # Add test rows: copies of the last real row, re-addressed.
            if last_row:
                test_emails = [
                    'josecerv@wharton.upenn.edu',
                    'csclark@princeton.edu',
                    'mohsen.mosleh@gmail.com',
                    'kmilkman@wharton.upenn.edu'
                ]

                for test_email in test_emails:
                    test_row = last_row.copy()
                    test_row[1] = test_email
                    csvwriter.writerow(test_row)
                    print(f"Test row added with email '{test_email}'")

        print(f"CSV written successfully to {output_csv}")

    except FileNotFoundError:
        print(f"Error: Input file {main_jsonl} not found")
    except Exception as e:
        # Best-effort script: report and return instead of raising.
        print(f"Error processing files: {e}")

# Script entry point: run the mail merge with the default file names.
if __name__ == "__main__":
    main_jsonl_file, guest_jsonl_file, output_file = (
        "prod_db.jsonl",
        "guests-extract.jsonl",
        "podcast_mail_merge_final.csv",
    )
    process_mail_merge(
        main_jsonl=main_jsonl_file,
        guest_jsonl=guest_jsonl_file,
        output_csv=output_file,
    )


Processing 66042 podcasts...


Processing podcasts: 100%|██████████| 66042/66042 [00:00<00:00, 84583.04it/s]


Test row added with email 'josecerv@wharton.upenn.edu'
Test row added with email 'csclark@princeton.edu'
Test row added with email 'mohsen.mosleh@gmail.com'
Test row added with email 'kmilkman@wharton.upenn.edu'
CSV written successfully to podcast_mail_merge_final.csv


In [3]:
import json
import csv
import os
from tqdm import tqdm
import pandas as pd

# Mail-merge wording for each review label, kept in one table so the two
# public dictionaries below cannot drift apart:
#     review label -> (lookback phrase, interval phrase)
_REVIEW_LABEL_WORDING = {
    "1-Year in review": ("one year", "year"),
    "6-Month lookback": ("6 month", "6 months"),
    "Quarter in review": ("3 month", "3 months"),
    "2-Month lookback": ("2 month", "2 months"),
    "1-Month in review": ("1 month", "1 month"),
}

# Review label -> text for the "lookback" column.
lookback_mapping = {
    label: lookback for label, (lookback, _) in _REVIEW_LABEL_WORDING.items()
}

# Review label -> text for the "interval" column.
interval_mapping = {
    label: interval for label, (_, interval) in _REVIEW_LABEL_WORDING.items()
}

# Values of the distribution files' "Status" column whose addresses are
# left out of the reminder mailing.
EXCLUDE_STATUSES = [
    # Survey-related statuses
    "Session Expired",
    "Survey Started",
    "Survey Partially Finished",
    "Survey Finished",
    # Delivery-related statuses
    "Email Soft Bounce",
    "Email Hard Bounce",
    "Email Failed",
]

def safe_json_loads(line):
    """Parse a single line of JSON, returning None (with a warning) on bad input."""
    try:
        decoded = json.loads(line.strip())
    except json.JSONDecodeError as exc:
        print(f"Warning: Could not parse JSON line: {exc}")
        return None
    else:
        return decoded

def create_guest_lookup(guest_jsonl_file):
    """Return a dict mapping each episode_id in the guest JSONL file to its guest count.

    Blank lines are skipped silently; lines that cannot be processed are
    skipped with a warning. Records lacking an "episode_id" are ignored, and
    a repeated episode_id keeps the value from its last line. If the file
    does not exist, a warning is printed and an empty dict is returned.
    """
    episode_guest_counts = {}
    try:
        with open(guest_jsonl_file, "r", encoding="utf-8") as guest_file:
            for position, raw in enumerate(guest_file, 1):
                stripped = raw.strip()
                if not stripped:
                    continue
                try:
                    payload = json.loads(stripped)
                    # Evaluated left to right: id lookup, then count conversion.
                    episode_key, guest_count = (
                        payload.get("episode_id"),
                        int(payload.get("guests", 0)),
                    )
                    if episode_key is not None:
                        episode_guest_counts[episode_key] = guest_count
                except Exception as exc:
                    print(f"Warning: Error processing guest line {position}: {exc}")
    except FileNotFoundError:
        print(f"Warning: Guest file {guest_jsonl_file} not found")
    return episode_guest_counts

def parse_duration(duration_str):
    """Convert a duration value into a whole number of seconds.

    Accepted forms are ``HH:MM:SS``, ``MM:SS`` and a bare number of seconds,
    given either as a string or as a numeric value. Fields may carry a
    fractional part (``"00:01:30.5"``, ``"90.7"``, ``1800.0``); the total is
    truncated toward zero.

    Args:
        duration_str: The raw duration. Anything falsy (``None``, ``""``,
            ``0``) counts as zero.

    Returns:
        int: Total seconds, or 0 when the value is empty, has more than
        three colon-separated fields, or contains a field that is not a
        finite number.
    """
    if not duration_str:
        return 0

    fields = str(duration_str).strip().split(":")
    if len(fields) > 3:
        return 0

    def to_number(text):
        # Prefer int so whole values stay exact; fall back to float for
        # fractional values such as "30.5".
        try:
            return int(text)
        except ValueError:
            return float(text)

    try:
        total = 0
        # Each field is worth 60x the one to its right, so folding left to
        # right yields h*3600 + m*60 + s (or m*60 + s, or just s).
        for field in fields:
            total = total * 60 + to_number(field)
        return int(total)
    except (ValueError, OverflowError):
        # ValueError: non-numeric field or NaN; OverflowError: infinity.
        return 0

def get_email_response_exclusions(response_file_paths):
    """Collect sender addresses from email-response CSV files.

    Each file is read with csv.DictReader and the value of its
    'From: (Address)' column is lower-cased and stripped. Values that are
    empty or contain no "@" are ignored, as are rows too short to have the
    column at all.

    Args:
        response_file_paths: Iterable of CSV file paths. Paths that do not
            exist are reported with a warning and skipped.

    Returns:
        set[str]: The normalised sender addresses found across all files.
    """
    excluded_emails = set()

    for file_path in response_file_paths:
        if not os.path.exists(file_path):
            print(f"Warning: Email response file {file_path} not found.")
            continue

        print(f"Reading email responses from {file_path}...")

        try:
            # utf-8-sig reads plain UTF-8 too, but drops a leading BOM that
            # would otherwise be glued onto the first header name.
            # newline='' is what the csv module asks for so that quoted
            # fields containing line breaks are parsed correctly.
            with open(file_path, 'r', encoding='utf-8-sig', newline='') as file:
                for row in csv.DictReader(file):
                    # DictReader fills missing trailing fields with None, so
                    # a short row must not be allowed to raise and cut the
                    # rest of the file off.
                    email = (row.get('From: (Address)') or '').lower().strip()
                    if email and "@" in email:  # Basic email validation
                        excluded_emails.add(email)
        except Exception as e:
            # Best effort: report the file and carry on with the next one.
            print(f"Error reading email response file {file_path}: {e}")

    print(f"Found {len(excluded_emails)} emails to exclude from email response files.")
    return excluded_emails

def get_unsubscribed_emails(unsub_files):
    """Get emails from unsubscribe files where Unsubscribed==1.

    Each CSV must have 'Email' and 'Unsubscribed' columns; files lacking
    either are reported with a warning. Rows whose 'Email' cell is missing
    are dropped, so the result contains only strings (never NaN). Addresses
    are lower-cased and stripped.

    Args:
        unsub_files: Iterable of CSV file paths. Paths that do not exist are
            reported with a warning and skipped.

    Returns:
        set[str]: The normalised unsubscribed addresses across all files.
    """
    excluded_emails = set()

    for file_path in unsub_files:
        if not os.path.exists(file_path):
            print(f"Warning: Unsubscribe file {file_path} not found.")
            continue

        print(f"Reading unsubscribe data from {file_path}...")

        try:
            df = pd.read_csv(file_path)
            if 'Email' in df.columns and 'Unsubscribed' in df.columns:
                # Get emails where Unsubscribed == 1.
                # dropna() keeps NaN (empty cells) out of the set; astype(str)
                # keeps the .str accessor usable even if the column was
                # parsed as a non-string dtype.
                unsubscribed = (
                    df.loc[df['Unsubscribed'] == 1, 'Email']
                    .dropna()
                    .astype(str)
                    .str.lower()
                    .str.strip()
                )
                excluded_emails.update(unsubscribed)
                print(f"Found {len(unsubscribed)} unsubscribed emails in {file_path}.")
            else:
                print(f"Warning: Required columns (Email, Unsubscribed) not found in {file_path}")
        except Exception as e:
            # Best effort: report the file and carry on with the next one.
            print(f"Error reading unsubscribe file {file_path}: {e}")

    return excluded_emails

def get_excluded_emails():
    """Get emails to exclude from all sources.

    Reads, from the current working directory:
      1. the two survey exports ('podcast_email' column),
      2. the two distribution exports ('Email Address' rows whose 'Status'
         is listed in EXCLUDE_STATUSES),
      3. the two unsubscribe files (via get_unsubscribed_emails),
      4. the two email-response files (via get_email_response_exclusions).

    Addresses are lower-cased and stripped. Empty cells are dropped in every
    source, so the result holds strings only. Missing files and unreadable
    files are reported with print() and skipped.

    Returns:
        set[str]: The union of all addresses to exclude.
    """
    excluded_emails = set()

    # 1. Process survey responses (both surveys)
    survey_files = ["email_one_survey.csv", "email_two_survey.csv"]
    for survey_file in survey_files:
        if not os.path.exists(survey_file):
            print(f"Warning: Survey file {survey_file} not found.")
            continue
        try:
            survey_df = pd.read_csv(survey_file)
            if 'podcast_email' in survey_df.columns:
                # Add all valid podcast emails from survey responses to the
                # exclusion list. astype(str) keeps .str usable even when
                # the column was parsed as a non-string dtype.
                emails = (
                    survey_df['podcast_email']
                    .dropna()
                    .astype(str)
                    .str.lower()
                    .str.strip()
                )
                excluded_emails.update(emails)
                print(f"Found {len(emails)} emails to exclude from {survey_file}.")
            else:
                print(f"Warning: 'podcast_email' column not found in {survey_file}")
        except Exception as e:
            print(f"Error reading survey file {survey_file}: {e}")

    # 2. Process distribution status (both distributions)
    distribution_files = ["email_one_distribution.csv", "email_two_distribution.csv"]
    for distribution_file in distribution_files:
        if not os.path.exists(distribution_file):
            print(f"Warning: Distribution file {distribution_file} not found.")
            continue
        try:
            distribution_df = pd.read_csv(distribution_file)
            if 'Email Address' in distribution_df.columns and 'Status' in distribution_df.columns:
                # Filter for the statuses we want to exclude
                status_mask = distribution_df['Status'].isin(EXCLUDE_STATUSES)
                # dropna() mirrors the survey branch: an empty address cell
                # must not put NaN into the exclusion set.
                emails_to_exclude = (
                    distribution_df.loc[status_mask, 'Email Address']
                    .dropna()
                    .astype(str)
                    .str.lower()
                    .str.strip()
                )
                excluded_emails.update(emails_to_exclude)
                print(f"Found {len(emails_to_exclude)} emails to exclude based on distribution status in {distribution_file}.")

                # Print count for each excluded status
                status_counts = distribution_df.loc[status_mask, 'Status'].value_counts()
                print(f"Exclusion counts by status in {distribution_file}:")
                for status, count in status_counts.items():
                    print(f"- {status}: {count}")
            else:
                print(f"Warning: Required columns not found in {distribution_file}")
        except Exception as e:
            print(f"Error reading distribution file {distribution_file}: {e}")

    # 3. Process unsubscribe files
    unsub_files = ["email_one_unsub.csv", "email_two_unsub.csv"]
    unsub_exclusions = get_unsubscribed_emails(unsub_files)
    excluded_emails.update(unsub_exclusions)
    print(f"Found {len(unsub_exclusions)} emails to exclude from unsubscribe files.")

    # 4. Process email response files (consolidated across all emails)
    email_response_files = ["email_responses.csv", "email_auto-responses.csv"]
    response_exclusions = get_email_response_exclusions(email_response_files)
    excluded_emails.update(response_exclusions)

    print(f"Total unique emails to exclude: {len(excluded_emails)}")
    return excluded_emails

def process_mail_merge_with_exclusions(main_jsonl, guest_jsonl, output_csv, exclude_emails):
    """Processes mail merge while excluding specified emails.

    One row is written per podcast whose lower-cased, stripped email is not
    in ``exclude_emails``. Columns are podcastID, email, podcastName,
    num_episodes, num_guests, duration, interval and lookback. After the
    podcast rows, the last written row is repeated once per hard-coded test
    address, and a summary of the counts is printed.

    Blank lines in the podcast file are ignored. Lines that fail to decode,
    or that decode to something other than a JSON object, are skipped with a
    warning. A missing or null "email" is treated as an empty address and a
    missing or null "episodes" as no episodes, so a single incomplete record
    no longer aborts the run half-way through the CSV.

    Args:
        main_jsonl: Path of the podcast JSONL file (one record per line).
        guest_jsonl: Path of the guest JSONL file, read by create_guest_lookup.
        output_csv: Path of the CSV file to (over)write.
        exclude_emails: Collection of lower-cased addresses to leave out;
            a set gives constant-time membership tests.

    Returns:
        None. Problems are reported with print() rather than raised.
    """
    # Create guest lookup dictionary
    guest_lookup = create_guest_lookup(guest_jsonl)

    # Track counts for reporting
    total_podcasts = 0
    excluded_podcasts = 0
    included_podcasts = 0

    try:
        with open(main_jsonl, "r", encoding="utf-8") as infile, \
             open(output_csv, "w", newline='', encoding="utf-8") as outfile:

            csvwriter = csv.writer(outfile)
            csvwriter.writerow(["podcastID", "email", "podcastName", "num_episodes",
                                "num_guests", "duration", "interval", "lookback"])

            # Read all podcast records up front so tqdm knows the total.
            podcasts = []
            for line_num, line in enumerate(infile, 1):
                if not line.strip():
                    # Same treatment as create_guest_lookup: blank lines are
                    # not an error.
                    continue
                podcast = safe_json_loads(line)
                # Only JSON objects can be read with .get() below.
                if isinstance(podcast, dict):
                    podcasts.append(podcast)
                else:
                    print(f"Warning: Skipping invalid JSON at line {line_num}")

            total_podcasts = len(podcasts)
            print(f"Processing {total_podcasts} podcasts...")

            last_row = None
            for podcast in tqdm(podcasts, desc="Processing podcasts"):
                podcastID = podcast.get("podcast_id", "")
                # "or ''" covers an explicit null; str() covers non-string
                # values. Normalised the same way as the exclusion list.
                email = str(podcast.get("email") or "").lower().strip()

                # Skip this podcast if email is in exclusion list
                if email in exclude_emails:
                    excluded_podcasts += 1
                    continue

                included_podcasts += 1
                podcastName = podcast.get("title", "")
                num_episodes = podcast.get("num_episodes", 0)
                original_interval = podcast.get("review_label", "")

                # Unknown labels fall through unchanged.
                interval = interval_mapping.get(original_interval, original_interval)
                lookback = lookback_mapping.get(original_interval, original_interval)

                # "or []" also covers an explicit null in the record.
                episodes = podcast.get("episodes") or []

                # Calculate total guests; episodes missing from the guest
                # lookup contribute 0.
                total_guests = sum(
                    guest_lookup.get(episode.get("episode_id"), 0)
                    for episode in episodes
                )

                # Calculate total duration
                total_duration_seconds = sum(
                    parse_duration(episode.get("duration", "0"))
                    for episode in episodes
                )
                total_hours = total_duration_seconds / 3600.0
                duration_text = f"{total_hours:.2f} hours of audio"

                row_data = [
                    podcastID, email, podcastName, num_episodes,
                    total_guests, duration_text, interval, lookback
                ]
                csvwriter.writerow(row_data)
                last_row = row_data

            # Add test rows: copies of the last real row, re-addressed.
            if last_row:
                test_emails = [
                    'josecerv@wharton.upenn.edu',
                    'csclark@princeton.edu',
                    'mohsen.mosleh@gmail.com',
                    'kmilkman@wharton.upenn.edu'
                ]

                for test_email in test_emails:
                    test_row = last_row.copy()
                    test_row[1] = test_email
                    csvwriter.writerow(test_row)
                    print(f"Test row added for {test_email}")

                included_podcasts += len(test_emails)

        print(f"CSV written successfully to {output_csv}")
        print(f"\nSummary:")
        print(f"Total podcasts processed: {total_podcasts}")
        print(f"Excluded podcasts: {excluded_podcasts}")
        print(f"Included podcasts (including test emails): {included_podcasts}")
        print(f"Total emails in exclusion list: {len(exclude_emails)}")

        # Calculate percentage excluded
        if total_podcasts > 0:
            exclude_pct = (excluded_podcasts / total_podcasts) * 100
            print(f"Percentage of podcasts excluded: {exclude_pct:.2f}%")

    except FileNotFoundError:
        print(f"Error: Input file {main_jsonl} not found")
    except Exception as e:
        # Best-effort script: report and return instead of raising.
        print(f"Error processing files: {e}")

def write_exclusion_list(exclude_emails, output_file="email_exclusions.txt"):
    """Writes the exclusion list to a file for reference.

    The addresses are written one per line in sorted order. Entries that are
    not strings (for example NaN picked up from an empty CSV cell) are left
    out and reported, because they cannot be ordered against strings and
    would otherwise leave an empty file behind.

    Args:
        exclude_emails: Iterable of addresses.
        output_file: Path of the text file to (over)write.

    Returns:
        None. Problems are reported with print() rather than raised.
    """
    try:
        entries = list(exclude_emails)
        # Sort before opening so a failure here cannot truncate the file.
        lines = sorted(entry for entry in entries if isinstance(entry, str))
        skipped = len(entries) - len(lines)
        if skipped:
            print(f"Warning: Skipped {skipped} non-string entries in exclusion list")

        with open(output_file, "w", encoding="utf-8") as f:
            f.writelines(f"{email}\n" for email in lines)
        print(f"Exclusion list written to {output_file}")
    except Exception as e:
        print(f"Error writing exclusion list: {e}")

def main():
    """Build the reminder mail-merge CSV, leaving out every excluded address."""
    # Gather the addresses to skip from all sources and keep a copy on disk
    # for reference.
    exclusions = get_excluded_emails()
    write_exclusion_list(exclusions)

    # Run the mail merge over the updated input files.
    process_mail_merge_with_exclusions(
        main_jsonl="prod_db3.jsonl",
        guest_jsonl="guests-extract-combined-v2.jsonl",
        output_csv="final_reminder.csv",
        exclude_emails=exclusions,
    )


if __name__ == "__main__":
    main()


Found 3821 emails to exclude from email_one_survey.csv.
Found 2010 emails to exclude from email_two_survey.csv.
Found 7234 emails to exclude based on distribution status in email_one_distribution.csv.
Exclusion counts by status in email_one_distribution.csv:
- Survey Partially Finished: 2585
- Email Hard Bounce: 1973
- Survey Finished: 1101
- Email Soft Bounce: 903
- Session Expired: 671
- Email Failed: 1
Found 112 emails to exclude based on distribution status in email_two_distribution.csv.
Exclusion counts by status in email_two_distribution.csv:
- Email Soft Bounce: 66
- Email Hard Bounce: 46
Reading unsubscribe data from email_one_unsub.csv...
Found 949 unsubscribed emails in email_one_unsub.csv.
Reading unsubscribe data from email_two_unsub.csv...
Found 1131 unsubscribed emails in email_two_unsub.csv.
Found 1791 emails to exclude from unsubscribe files.
Reading email responses from email_responses.csv...
Reading email responses from email_auto-responses.csv...
Found 1892 emails to

Processing podcasts: 100%|██████████| 65563/65563 [00:00<00:00, 104383.03it/s]


Test row added for josecerv@wharton.upenn.edu
Test row added for csclark@princeton.edu
Test row added for mohsen.mosleh@gmail.com
Test row added for kmilkman@wharton.upenn.edu
CSV written successfully to final_reminder.csv

Summary:
Total podcasts processed: 65563
Excluded podcasts: 11772
Included podcasts (including test emails): 53795
Total emails in exclusion list: 12221
Percentage of podcasts excluded: 17.96%
