# Ad Detection

In [1]:
import asyncio
import json
from time import sleep

from openai import AsyncOpenAI

from prompts import find_ad
from keys import openai_key

# Module-level async OpenAI client, authenticated with the key imported from
# the local `keys` module; `find_ads` below sends its requests through it.
client = AsyncOpenAI(api_key=openai_key)

In [2]:
# Load the transcripts JSON from the data folder.
# The encoding is given explicitly because open() otherwise falls back to the
# platform's locale encoding, which can mis-decode non-ASCII transcript text.
with open('/workspaces/youtube-ad-detection/data/transcripts.json', encoding='utf-8') as f:
    transcripts = json.load(f)

In [3]:
async def find_ads(model, system_prompt, content, type):
    """Ask a chat model to identify ads in one video's transcript.

    Args:
        model: Name of the chat-completion model to query.
        system_prompt: Text sent as the system message.
        content: Mapping describing one video. It must contain 'videoId' and
            the key named by ``type``.
        type: Key of ``content`` whose value is JSON-serialised and sent as
            the user message. (The name shadows the builtin but is kept so
            existing callers keep working.)

    Returns:
        A dict with the video's 'videoId' and an 'ad' entry. 'ad' holds the
        decoded JSON reply, or the reply converted with ``str()`` when it is
        not valid JSON.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": json.dumps(content[type], indent=2)},
    ]

    response = await client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0,
    )
    reply = response.choices[0].message.content

    # Only the decode step is guarded: a reply that is not JSON (ValueError,
    # which includes json.JSONDecodeError) or not a string at all (TypeError,
    # e.g. None) falls back to the raw text. Anything else propagates instead
    # of being silently swallowed by a bare ``except``.
    try:
        ad = json.loads(reply)
    except (TypeError, ValueError):
        ad = str(reply)

    return {
        'videoId': content['videoId'],
        'ad': ad,
    }

In [4]:
async def process_all_contents(model, system_prompt, all_content, type, max_concurrent_tasks=10, batch_delay=5):
    """Run ``find_ads`` over every item, one batch at a time.

    Items are processed in consecutive batches of ``max_concurrent_tasks``.
    All requests of a batch run concurrently, and the next batch starts only
    after the whole batch has finished and ``batch_delay`` seconds have passed.

    Args:
        model: Forwarded to ``find_ads``.
        system_prompt: Forwarded to ``find_ads``.
        all_content: Iterable of per-video mappings, each forwarded to
            ``find_ads``.
        type: Forwarded to ``find_ads``. (The name shadows the builtin but is
            kept so existing callers keep working.)
        max_concurrent_tasks: Batch size, i.e. the maximum number of
            ``find_ads`` calls in flight at once. Must be at least 1.
        batch_delay: Seconds to wait between two batches. No wait happens
            before the first batch or after the last one.

    Returns:
        A list of ``find_ads`` results in the same order as ``all_content``.

    Raises:
        ValueError: If ``max_concurrent_tasks`` is smaller than 1.
    """
    if max_concurrent_tasks < 1:
        # Previously a value of 0 surfaced as a ZeroDivisionError after a task
        # had already been scheduled; fail early with a clear message instead.
        raise ValueError("max_concurrent_tasks must be at least 1")

    # Materialise once so any iterable (not only sized sequences) is accepted.
    items = list(all_content)
    results = []

    for start in range(0, len(items), max_concurrent_tasks):
        if start:
            # Pause between batches only, never before the first one.
            await asyncio.sleep(batch_delay)

        batch = items[start:start + max_concurrent_tasks]
        # The batch size already caps concurrency, so no semaphore is needed.
        # gather() returns results in the order the awaitables were passed.
        batch_results = await asyncio.gather(
            *(find_ads(model, system_prompt, item, type) for item in batch)
        )
        results.extend(batch_results)

    return results

## gpt-4o-2024-08-06

In [10]:
# Run ad detection with gpt-4o over both transcript variants of every channel.
# Results are collected as ads[channel][transcript_type] -> list of results.
model = 'gpt-4o-2024-08-06'
ads = {}
for channel, videos in transcripts.items():
    ads[channel] = {}
    print(f"Processing {channel}")
    # Drop falsy entries once instead of rebuilding the list per transcript type.
    usable_videos = [video for video in videos if video]
    for transcript_type in ['manual', 'generated']:
        ads[channel][transcript_type] = await process_all_contents(
            model, find_ad, usable_videos, transcript_type
        )
        # Pause between runs without blocking: time.sleep() would stall the
        # event loop this cell's top-level ``await`` is running on.
        await asyncio.sleep(5)

Processing SciShow
Processing Johnny Harris
Processing PBS Space Time
Processing 3Blue1Brown
Processing DamiLee
Processing Fireship


In [12]:
# Persist the gpt-4o detection results as indented JSON in the data folder.
with open('/workspaces/youtube-ad-detection/data/ads_gpt4o.json', mode='w') as out_file:
    json.dump(obj=ads, fp=out_file, indent=4)

In [None]:
# Save ads to json file
# NOTE(review): `ads_4o_mini` is not assigned in any cell visible in this
# notebook and this cell has no execution count; running it as-is would
# presumably raise NameError. Confirm whether the gpt-4o-mini run was removed.
with open('/workspaces/youtube-ad-detection/data/ads_gpt4o_mini.json', 'w') as f:
    json.dump(ads_4o_mini, f, indent=4)

## gpt-4-turbo-2024-04-09

In [6]:
# Run ad detection with gpt-4-turbo over both transcript variants of every
# channel. Results are collected as ads_4[channel][transcript_type].
model = 'gpt-4-turbo-2024-04-09'
ads_4 = {}
for channel, videos in transcripts.items():
    ads_4[channel] = {}
    print(f"Processing {channel}")
    # Drop falsy entries once instead of rebuilding the list per transcript type.
    usable_videos = [video for video in videos if video]
    for transcript_type in ['manual', 'generated']:
        ads_4[channel][transcript_type] = await process_all_contents(
            model, find_ad, usable_videos, transcript_type
        )
        # Pause between runs without blocking: time.sleep() would stall the
        # event loop this cell's top-level ``await`` is running on.
        await asyncio.sleep(5)

Processing SciShow
Processing Johnny Harris
Processing PBS Space Time
Processing 3Blue1Brown
Processing DamiLee
Processing Fireship


In [7]:
# Persist the gpt-4-turbo detection results as indented JSON in the data folder.
with open('/workspaces/youtube-ad-detection/data/ads_gpt4.json', mode='w') as out_file:
    json.dump(obj=ads_4, fp=out_file, indent=4)