In [1]:
from tqdm.auto import tqdm
import json

# Read input data

In [2]:
# Input file, resolved relative to ../data/processed/.
file_name = 'podcasts_25k.json'  # all data, 14700 podcasts after cleaning
#file_name = 'podcasts.json'  # sample data, 120 podcasts

# Decode explicitly as UTF-8 (the standard JSON encoding) instead of relying on
# the platform default, which can reject or garble non-ASCII text.
with open(f'../data/processed/{file_name}', 'rt', encoding='utf-8') as f_in:
    podcasts = json.load(f_in)

# Generate queries with OpenAI 

In [3]:
# Prompt sent to the model for each podcast record. The {name}, {description} and
# {category} placeholders are filled by prompt_template.format(**doc) in
# generate_questions; .strip() removes the newlines next to the triple quotes.
prompt_template = """
You emulate a user who's looking for a podcasts. 
Formulate 5 search queries this user might ask based on a podcast description to find the podcast. 
Use as fewer words as possible from the name. 
Make the query with at least 7 words and be very specific. Do NOT use word "podcast"

The record:

name: {name}
description: {description}
category: {category}

Provide the output in parsable JSON without using code blocks:

["query1", "query2", ..., "query5"]
""".strip()

In [4]:
import os
from openai import OpenAI
# Shared API client used by generate_questions.
# NOTE(review): no credentials are passed here, so the client presumably reads
# them from the environment (e.g. OPENAI_API_KEY) — confirm before running.
client = OpenAI()

In [5]:
def generate_questions(doc):
    """Ask the chat model for search queries describing one podcast record.

    The record's fields are substituted into the module-level
    ``prompt_template`` and sent as a single user message through the shared
    ``client``.

    Args:
        doc: Mapping whose keys fill the template placeholders.

    Returns:
        The message content of the first returned choice, unparsed.
    """
    user_message = {"role": "user", "content": prompt_template.format(**doc)}
    completion = client.chat.completions.create(
        model='gpt-4o',
        messages=[user_message],
    )
    return completion.choices[0].message.content

In [6]:
from tqdm.auto import tqdm

In [7]:
# Collect the raw model reply for every podcast in the selected slice,
# keyed by podcast id.
results = {}
for doc in tqdm(podcasts[14300:14500]):
    doc_id = doc['id']
    # Only call the API for ids that have not been answered yet.
    if doc_id not in results:
        results[doc_id] = generate_questions(doc)

  0%|          | 0/200 [00:00<?, ?it/s]

In [8]:
#results

In [9]:
#!head '../data/processed/podcasts.json'

In [22]:
# Display a single record to see which fields each podcast entry carries.
podcasts[4]

{'id': 'id1350850605',
 'name': 'Dressed: The History of Fashion',
 'url': 'https://podcasts.apple.com/us/podcast/dressed-the-history-of-fashion/id1350850605',
 'studio': 'Dressed Media',
 'category': 'Arts',
 'episode_count': 503,
 'avg_rating': 4.5,
 'total_ratings': 1400,
 'description': 'With over 8 billion people in the world, we all have one thing in common. Every day we all get dressed. Join Dressed as we explore the social and cultural histories behind the who, what, when of why we wear.'}

In [11]:
#results['id1550677342']

In [12]:
# import pickle
# with open('results.bin', 'rb') as f_in:
#     results = pickle.load(f_in)

In [13]:
# Decode each raw model reply into a Python object (expected: a list of queries).
# Replies that are empty or not valid JSON are collected in `parse_failures`
# instead of aborting the loop, so one bad reply does not discard the rest.
parsed_results = {}
parse_failures = {}

for doc_id, json_questions in results.items():
    # A reply may be None/empty; normalise to a string so json.loads reports it
    # as a decode error rather than raising TypeError.
    raw = (json_questions or '').strip()

    # The prompt asks for bare JSON, but tolerate a Markdown code fence
    # (``` or ```json) wrapped around the payload.
    if raw.startswith('```'):
        raw = raw.strip('`').strip()
        if raw.startswith('json'):
            raw = raw[len('json'):].strip()

    try:
        parsed_results[doc_id] = json.loads(raw)
    except json.JSONDecodeError:
        # Keep the original reply so it can be inspected or regenerated later.
        parse_failures[doc_id] = json_questions

if parse_failures:
    print(f'Could not parse {len(parse_failures)} of {len(results)} responses; see parse_failures')

In [14]:
# Look-up table from podcast id to its full record.
doc_index = dict((podcast['id'], podcast) for podcast in podcasts)

In [15]:
# Flatten the per-podcast query lists into (query, category, podcast id) rows.
final_results = []

for doc_id, queries in parsed_results.items():
    # The category is looked up once per podcast and shared by all its queries.
    category = doc_index[doc_id]['category']
    final_results.extend((query, category, doc_id) for query in queries)

In [16]:
import pandas as pd
# Tabulate the (query, category, podcast id) rows and write them out as the
# ground-truth evaluation set; the row index is not written to the file.
column_names = ['query', 'category', 'podcast_id']
df = pd.DataFrame(data=final_results, columns=column_names)
df.to_csv('../data/processed/eval_ground-truth-data_1000q_c.csv', index=False)

In [17]:
# Spot-check ten randomly chosen rows of the generated ground truth.
df.sample(10)

Unnamed: 0,query,category,podcast_id
326,favorite reality television coverage by Michae...,TV & Film,id1684625662
31,chronological exploration of Bluey by two dads,TV & Film,id1586223060
299,Post-show recap and analysis of Seinfeld series,TV & Film,id887608146
891,Exploring AI advancements with Lyft and others,Technology,id1504567418
410,imaginary discussions advancing moral and civi...,Technology,id1614038329
318,analysis of new and old SNL episodes,TV & Film,id1663042304
652,Carole Theriault talks about cybersecurity wit...,Technology,id1195001633
582,beginner-friendly guides to cryptocurrencies a...,Technology,id1515133543
764,encouragement towards privacy and data soverei...,Technology,id1572450110
97,Below Deck Sailing Yacht episode analysis,TV & Film,id1583787093


In [18]:
# List the distinct categories present in this batch of queries.
df['category'].unique()

array(['TV & Film', 'Technology'], dtype=object)

Generating 5 queries for each of 200 podcasts (1,000 queries in total) took about 5 minutes and cost $0.32, i.e. roughly 3 cents per 100 ground-truth queries.