In [None]:
import pickle
import pandas as pd
import os

def export_progress_to_csv():
    progress_file = 'progress.pkl'
    output_csv_file = 'guest_recommendations.csv'

    if not os.path.exists(progress_file):
        print(f"Progress file '{progress_file}' not found.")
        return

    # Load progress data
    with open(progress_file, 'rb') as f:
        progress = pickle.load(f)

    # Get the output_data from progress
    output_data = progress.get('output_data', [])

    if not output_data:
        print("No output data found in progress file.")
        return

    # Convert output_data to DataFrame
    output_df = pd.DataFrame(output_data)

    # Save DataFrame to CSV
    output_df.to_csv(output_csv_file, index=False)
    print(f"CSV file '{output_csv_file}' has been created from progress data.")

if __name__ == '__main__':
    export_progress_to_csv()


In [1]:
import jsonlines
import pandas as pd
import os

def populate_ids_from_guest_name():
    # File paths
    guest_csv_file = 'guest_recommendations.csv'
    guests_extract_file = 'guests-extract.jsonl'
    output_csv_file = 'guest_recommendations_with_ids.csv'

    # Check if necessary files exist
    if not os.path.exists(guest_csv_file):
        print(f"Guest recommendations CSV file '{guest_csv_file}' not found.")
        return

    if not os.path.exists(guests_extract_file):
        print(f"Guests extract JSONL file '{guests_extract_file}' not found.")
        return

    # Read guest recommendations CSV
    guest_df = pd.read_csv(guest_csv_file)

    # Ensure 'guest_name' column exists
    if 'guest_name' not in guest_df.columns:
        print("Error: 'guest_name' column is required in 'guest_recommendations.csv'.")
        return

    # Read guests-extract.jsonl and build a lookup dictionary
    print("Loading guests-extract.jsonl...")
    guest_lookup = {}  # Key: guest_name (normalized), Value: list of tuples (podcast_id, episode_id)

    with jsonlines.open(guests_extract_file) as reader:
        for obj in reader:
            podcast_id = obj.get('podcast_id')
            episode_id = obj.get('episode_id')
            guests = obj.get('guests', [])
            for guest in guests:
                name = (guest.get('name') or '').strip().lower()
                if name:
                    # If the guest name is already in the lookup, append the new (podcast_id, episode_id)
                    if name in guest_lookup:
                        guest_lookup[name].append((podcast_id, episode_id))
                    else:
                        guest_lookup[name] = [(podcast_id, episode_id)]

    # Add 'podcast_id' and 'episode_id' columns to guest_df if not already present
    if 'podcast_id' not in guest_df.columns:
        guest_df['podcast_id'] = ''
    if 'episode_id' not in guest_df.columns:
        guest_df['episode_id'] = ''

    # Populate 'podcast_id' and 'episode_id' in guest_df
    print("Populating podcast_id and episode_id...")
    unmatched_guests = []
    for idx, row in guest_df.iterrows():
        guest_name = (row['guest_name'] or '').strip().lower()
        matches = guest_lookup.get(guest_name)
        if matches:
            # Take the first match (Option 1)
            podcast_id, episode_id = matches[0]
            guest_df.at[idx, 'podcast_id'] = podcast_id
            guest_df.at[idx, 'episode_id'] = episode_id
        else:
            unmatched_guests.append(row['guest_name'])

    if unmatched_guests:
        print(f"Warning: Could not find matches for the following guests:")
        for name in unmatched_guests:
            print(f" - {name}")

    # Save the updated DataFrame to a new CSV file
    guest_df.to_csv(output_csv_file, index=False)
    print(f"CSV file '{output_csv_file}' has been created with populated podcast_id and episode_id.")

if __name__ == '__main__':
    populate_ids_from_guest_name()


Loading guests-extract.jsonl...
Populating podcast_id and episode_id...
CSV file 'guest_recommendations_with_ids.csv' has been created with populated podcast_id and episode_id.


  guest_df.at[idx, 'podcast_id'] = podcast_id
  guest_df.at[idx, 'episode_id'] = episode_id
