In [1]:
from tqdm.auto import tqdm
import json

# Read input data

In [2]:
file_name = 'podcasts_25k.json'  # all data, 14700 podcasts after cleaning
#file_name = 'podcasts.json' #sample data, 120 podcasts
# JSON text is UTF-8; pass the encoding explicitly so non-ASCII characters are
# decoded the same way regardless of the platform's default locale encoding.
with open(f'../data/processed/{file_name}', 'rt', encoding='utf-8') as f_in:
    podcasts = json.load(f_in)

# Generate queries with OpenAI 

In [3]:
# Prompt sent to the chat model for each podcast record. The {name},
# {description} and {category} placeholders are filled in with str.format from
# the record's fields; the model is asked to answer with a JSON list of five
# search queries. Leading/trailing whitespace of the template is stripped.
prompt_template = """
You emulate a user who's looking at the podcasts description. 
Formulate 5 search queries this user might ask based on a podcast description to find the podcast. 
If possible, use as fewer words as possible from the record. 

The record:

name: {name}
description: {description}
category: {category}

Provide the output in parsable JSON without using code blocks:

["query1", "query2", ..., "query5"]
""".strip()

In [4]:
import os
from openai import OpenAI
# Shared API client used by generate_questions. No credentials are passed here;
# presumably the SDK reads the API key from the environment
# (e.g. OPENAI_API_KEY) — confirm before running.
client = OpenAI()

In [5]:
def generate_questions(doc, model='gpt-4o'):
    """Ask the chat model for search queries that should lead to this podcast.

    Args:
        doc: Podcast record (mapping). It must provide every placeholder used
            by ``prompt_template`` (``name``, ``description``, ``category``);
            additional keys are ignored by ``str.format``.
        model: Name of the chat-completion model to query. Defaults to
            ``'gpt-4o'``, the value that used to be hard-coded.

    Returns:
        The text content of the first completion choice. The prompt asks for a
        JSON list of queries, but the text is returned unparsed and
        unvalidated.

    Raises:
        KeyError: If ``doc`` lacks a field referenced by the prompt template.
    """
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content

In [6]:
from tqdm.auto import tqdm

In [7]:
# Keep answers collected by an earlier (e.g. interrupted) run of this cell so
# that API calls which were already paid for are not repeated. On a fresh
# kernel this starts from an empty dict; run `del results` first to force a
# full regeneration.
try:
    results
except NameError:
    results = {}

for doc in tqdm(podcasts):
    doc_id = doc['id']
    # Already answered: earlier run of this cell, or a duplicate id in the input.
    if doc_id in results:
        continue

    results[doc_id] = generate_questions(doc)

  0%|          | 0/14700 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
#!head '../data/processed/podcasts.json'

In [8]:
# Inspect a single podcast record to see which fields it carries.
podcasts[13]

{'id': 'id1550677342',
 'name': 'Create Your Own Light',
 'url': 'https://podcasts.apple.com/us/podcast/create-your-own-light/id1550677342',
 'studio': 'Podcast with Travis Howze',
 'category': 'Arts',
 'episode_count': 129,
 'avg_rating': 4.8,
 'total_ratings': 161,
 'description': 'We all want to live a life we desire and deserve versus merely existing. Each week, join Travis Howze, U.S. Marine, former Police Officer and Firefighter, World Touring Stand Up Comedian, Motivational Speaker and Best Selling Author as he draws from a lifetime of experiences from unbelievably hysterical stories to unspeakable traumatic events and engages directly with his supporters and listeners, taking off-the-cuff questions and topic suggestions to produce a unique broadcast atmosphere, where you, the listener, has a say in the show. If you struggle with purpose, looking for inspiration, have a friend or loved one who could use support, or simply want to laugh and cry in your vehicle, couch, or go-to pl

In [9]:
# Raw (still unparsed) model answer stored for the podcast shown above.
results['id1550677342']

'["Create Your Own Light podcast", "Travis Howze Marine podcast", "motivational speaker firefigher podcast", "stand up comedian podcast inspiration", "podcast for laughing and crying"]'

In [None]:
# import pickle
# with open('results.bin', 'rb') as f_in:
#     results = pickle.load(f_in)

In [10]:
# Decode every raw model answer (expected to be a JSON list of queries) into a
# Python object, keyed by podcast id.
parsed_results = {}

for doc_id, json_questions in results.items():
    try:
        parsed_results[doc_id] = json.loads(json_questions)
    except json.JSONDecodeError as exc:
        # Re-raise the same exception type, but say which podcast's answer is
        # malformed so it can be fixed or regenerated.
        raise json.JSONDecodeError(
            f"invalid model output for podcast {doc_id!r}: {exc.msg}",
            exc.doc,
            exc.pos,
        ) from exc

In [11]:
# Index the podcast records by their 'id' field for direct lookup.
doc_index = {
    podcast['id']: podcast
    for podcast in podcasts
}

In [12]:
# Flatten to one (query, category, podcast_id) tuple per generated query; the
# category comes from the podcast record with the same id.
final_results = []

for podcast_id, queries in parsed_results.items():
    podcast_category = doc_index[podcast_id]['category']
    final_results.extend(
        (query, podcast_category, podcast_id) for query in queries
    )

In [13]:
import pandas as pd
# Build the ground-truth table (one row per generated query) and write it to
# CSV without the index column.
ground_truth_columns = ['query', 'category', 'podcast_id']
df = pd.DataFrame(final_results, columns=ground_truth_columns)
df.to_csv(
    '../data/processed/eval_ground-truth-data_1500.csv',
    index=False,
)

In [14]:
# Fixed seed so the preview shows the same rows on every run.
df.sample(10, random_state=42)

Unnamed: 0,query,category,podcast_id
4905,tools tips tricks marketers podcast,Business,id1360406217
6660,team ripped vlog audio,Business,id1504787005
4926,Animal Spirits markets life investing,Business,id1310192007
6951,extra income podcast,Business,id1686576891
4202,Art What If podcast Allan Schwartzman,Arts,id1203133627
828,苹果播客年度推荐 2020,Arts,id1504037049
5795,business tips hiring firing marketing,Business,id1460051721
1773,sustainable packaging podcast,Arts,id1511993897
6700,mentors for women in business,Business,id1525261018
3586,54 Below Podcast behind the scenes,Arts,id1491867157


budget before: $4.65
budget after: $2.36

Generating 5 queries for each of the 14.7k podcasts took 22 min and cost $2.29 -> roughly 0.003 cents per ground-truth query ($2.29 / 73,500 queries)