In [1]:
# Import necessary libraries
import pandas as pd
import json
from IPython.display import display, HTML

# File locations (bare names, so they resolve against the working directory)

# Survey exports that are read for their 'podcastID' column
csv_files = [
    "email_one_survey.csv",
    "email_two_survey.csv",
]

# JSONL sources that are scanned, in this order, for matching records
input_jsonls = [
    "prod_db.jsonl",
    "prod_db2.jsonl",
]

# Destination for the filtered records; it is opened in 'w' mode, so any
# existing file of this name is overwritten
output_jsonl = "email_one_two_prod_db.jsonl"

# Step 1: Gather the podcast IDs referenced by the survey CSVs
#
# Every CSV is loaded in full, previewed, and its non-null 'podcastID'
# values are merged into one set shared by all files.
print(f"Reading CSV files: {', '.join(csv_files)}")
podcast_ids = set()

for csv_file in csv_files:
    df_survey = pd.read_csv(csv_file)

    # Render the head of the frame so the load can be checked by eye
    print(f"\nPreview of {csv_file}:")
    display(df_survey.head())

    # Drop missing IDs first, then de-duplicate within this file
    id_column = df_survey['podcastID'].dropna()
    file_podcast_ids = set(id_column.unique())

    # In-place union: IDs shared between files are only counted once overall
    podcast_ids |= file_podcast_ids
    print(f"Found {len(file_podcast_ids)} unique podcast IDs in {csv_file}")

print(f"\nTotal unique podcast IDs across all CSV files: {len(podcast_ids)}")

# Step 2: Filter the JSONL files based on podcast IDs
#
# Each input file is streamed line by line. A record is copied to
# output_jsonl when its 'podcast_id' is in podcast_ids and that ID has not
# been written before, so the first occurrence across all inputs wins.
# Lines that are not valid JSON, or are valid JSON but not an object, are
# reported and skipped instead of aborting the run.
print("\nFiltering JSONL files to include only matching podcast IDs...")
matched_count = 0
processed_ids = set()  # IDs already written, used to drop later duplicates

with open(output_jsonl, 'w', encoding='utf-8') as output_file:
    # Process each input JSONL file
    for input_jsonl in input_jsonls:
        file_matched_count = 0
        print(f"\nProcessing {input_jsonl}...")

        with open(input_jsonl, 'r', encoding='utf-8') as input_file:
            for line_num, line in enumerate(input_file, 1):
                # Keep the try body down to the one call that can raise
                try:
                    podcast_data = json.loads(line.strip())
                except json.JSONDecodeError:
                    print(f"Warning: Could not parse JSON at line {line_num} in {input_jsonl}")
                    continue

                # A JSON list/number/string has no .get(); skip it rather
                # than crash with AttributeError part-way through the file
                if not isinstance(podcast_data, dict):
                    print(f"Warning: Skipping non-object JSON value at line {line_num} in {input_jsonl}")
                    continue

                podcast_id = podcast_data.get('podcast_id')

                # NOTE(review): this membership test only matches when the
                # CSV 'podcastID' values and the JSON 'podcast_id' values
                # have the same type (e.g. both str) - confirm against data.
                if podcast_id not in podcast_ids or podcast_id in processed_ids:
                    continue

                # Write the record with exactly one trailing newline. The
                # last line of a file may lack one, and writing it verbatim
                # would fuse it with the next record written.
                output_file.write(line.rstrip('\n') + '\n')
                matched_count += 1
                file_matched_count += 1
                processed_ids.add(podcast_id)

                # Print progress every 100 matches
                if file_matched_count % 100 == 0:
                    print(f"  Found {file_matched_count} matching podcasts in this file so far...")

        print(f"Matched {file_matched_count} podcasts from {input_jsonl}")

print(f"\nProcess complete! {matched_count} total podcasts matched and written to {output_jsonl}")

# Step 3: Spot-check the output file
#
# Parses up to the first three records of output_jsonl and prints a short
# summary of each. Any failure here is reported, not raised.
print("\nVerifying output file...")
try:
    output_samples = []
    with open(output_jsonl, 'r', encoding='utf-8') as f:
        # Pairing the file with range(3) ends the iteration after three
        # records without a manual counter and break
        output_samples.extend(json.loads(line) for line, _ in zip(f, range(3)))

    if not output_samples:
        print("No records found in the output file.")
    else:
        print("\nSample podcasts in the output file:")
        for i, sample in enumerate(output_samples, start=1):
            print(f"\nSample {i}:")
            print(f"Podcast ID: {sample.get('podcast_id')}")
            print(f"Title: {sample.get('title', '')}")
            print(f"Publisher: {sample.get('publisher', '')}")
            # The episode count is only printed when the key is present
            if 'episodes' in sample:
                print(f"Number of episodes: {len(sample.get('episodes', []))}")

except Exception as e:
    # Verification is best-effort: report the problem and carry on
    print(f"Error verifying output file: {str(e)}")

print("\nScript execution completed!")


Reading CSV files: email_one_survey.csv, email_two_survey.csv

Preview of email_one_survey.csv:


Unnamed: 0.1,Unnamed: 0,StartDate,EndDate,Status,IPAddress,Progress,Duration (in seconds),Finished,RecordedDate,ResponseId,...,listen_score,episodes_classified,cond,FL_14_DO_FL_15,FL_14_DO_FL_16,FL_35_DO_FL_36,FL_35_DO_FL_37,survey_date,survey_wave,response_time
0,1,2025-02-26 08:03:06,2025-02-26 08:06:21,IP Address,154.176.43.216,100,194,True,2025-02-26 08:06:22,R_8ukUZiiDEFW0pd7,...,-1,12.0,treatment,1.0,,,,2025-02-26,initial,2025-02-26 08:06:22
1,2,2025-02-26 08:04:10,2025-02-26 08:06:57,IP Address,14.99.133.150,100,166,True,2025-02-26 08:06:57,R_9cY87bOWMf2Oz9D,...,-1,8.0,control,,1.0,,,2025-02-26,initial,2025-02-26 08:06:57
2,3,2025-02-26 08:04:41,2025-02-26 08:07:52,IP Address,70.51.226.155,100,191,True,2025-02-26 08:07:53,R_1I6rarvQNBiG2BA,...,36,10.0,control,,,,1.0,2025-02-26,initial,2025-02-26 08:07:53
3,4,2025-02-26 08:05:49,2025-02-26 08:09:36,IP Address,129.224.200.231,100,227,True,2025-02-26 08:09:37,R_3IfVwgqB7e2F40q,...,-1,12.0,treatment,1.0,,,,2025-02-26,initial,2025-02-26 08:09:37
4,5,2025-02-26 08:04:46,2025-02-26 08:11:34,IP Address,205.178.76.237,100,407,True,2025-02-26 08:11:35,R_1QoyHQWhI86YAvS,...,32,9.0,control,,,,1.0,2025-02-26,initial,2025-02-26 08:11:35


Found 3420 unique podcast IDs in email_one_survey.csv

Preview of email_two_survey.csv:


Unnamed: 0.1,Unnamed: 0,StartDate,EndDate,Status,IPAddress,Progress,Duration (in seconds),Finished,RecordedDate,ResponseId,...,listen_score,episodes_classified,cond,FL_14_DO_FL_15,FL_14_DO_FL_16,FL_35_DO_FL_36,FL_35_DO_FL_37,survey_date,survey_wave,response_time
0,1,2025-03-04 08:04:21,2025-03-04 08:08:49,IP Address,71.212.75.189,100,267,True,2025-03-04 08:08:49,R_5lPSQFIDlhc2uN5,...,-1,4.0,control,,1.0,,,2025-03-04,reminder,2025-03-04 08:08:49
1,2,2025-03-04 08:03:54,2025-03-04 08:09:29,IP Address,104.187.201.253,100,334,True,2025-03-04 08:09:29,R_7bXuoIWNNaMNiq5,...,-1,8.0,control,,1.0,,,2025-03-04,reminder,2025-03-04 08:09:29
2,3,2025-03-04 08:12:16,2025-03-04 08:15:33,IP Address,68.184.200.42,100,196,True,2025-03-04 08:15:33,R_6yk0mZhq3mQitu9,...,42,11.0,control,,,,1.0,2025-03-04,reminder,2025-03-04 08:15:33
3,4,2025-03-04 08:12:31,2025-03-04 08:16:14,IP Address,102.88.53.42,100,223,True,2025-03-04 08:16:15,R_8QW4LPLloEQfKKZ,...,-1,8.0,control,,1.0,,,2025-03-04,reminder,2025-03-04 08:16:15
4,5,2025-03-04 08:13:50,2025-03-04 08:16:19,IP Address,72.184.242.227,100,149,True,2025-03-04 08:16:19,R_1dGPImUagL5Ud8G,...,33,4.0,control,,,,1.0,2025-03-04,reminder,2025-03-04 08:16:19


Found 1694 unique podcast IDs in email_two_survey.csv

Total unique podcast IDs across all CSV files: 5105

Filtering JSONL files to include only matching podcast IDs...

Processing prod_db.jsonl...
  Found 100 matching podcasts in this file so far...
  Found 200 matching podcasts in this file so far...
  Found 300 matching podcasts in this file so far...
  Found 400 matching podcasts in this file so far...
  Found 500 matching podcasts in this file so far...
  Found 600 matching podcasts in this file so far...
  Found 700 matching podcasts in this file so far...
  Found 800 matching podcasts in this file so far...
  Found 900 matching podcasts in this file so far...
  Found 1000 matching podcasts in this file so far...
  Found 1100 matching podcasts in this file so far...
  Found 1200 matching podcasts in this file so far...
  Found 1300 matching podcasts in this file so far...
  Found 1400 matching podcasts in this file so far...
  Found 1500 matching podcasts in this file so far...
