# Keyword Grouping

In [20]:
import ast
import asyncio
import copy
import json
from time import sleep

from openai import AsyncOpenAI

from functions import load_json
from prompts import group_kw
from keys import openai_key

# Module-level async OpenAI client; group_keywords() sends its chat-completion
# requests through this object. The API key comes from the local `keys` module.
client = AsyncOpenAI(api_key=openai_key)

In [2]:
# Load the keyword JSON files from the data folder with the project helper
# `load_json`. The `_4o` / `_4` suffixes mirror the file names
# (kw_*_gpt4o.json / kw_*_gpt4.json).
# NOTE(review): each file is presumably a {channel: [video, ...]} mapping,
# because later cells iterate it with .items() and read video['metadata'] --
# confirm against load_json and the data files.
transcripts_4o = load_json('/workspaces/youtube-ad-detection/data/kw_transcripts_gpt4o.json')
transcripts_4 = load_json('/workspaces/youtube-ad-detection/data/kw_transcripts_gpt4.json')

ads_4o = load_json('/workspaces/youtube-ad-detection/data/kw_ads_gpt4o.json')
ads_4 = load_json('/workspaces/youtube-ad-detection/data/kw_ads_gpt4.json')

In [4]:
async def group_keywords(model, system_prompt, content, type):
    """Ask the chat model to group one video's keywords.

    Args:
        model: Model name passed straight to the chat-completions endpoint.
        system_prompt: Text sent as the system message.
        content: Video record; ``content['metadata']['kw'][type]`` supplies
            the keywords and ``content['metadata']['videoId']`` the id.
        type: Key selecting which keyword set to send (the callers in this
            file pass ``'manual'`` or ``'generated'``). The name shadows the
            builtin but is kept because it is part of the signature.

    Returns:
        A dict with ``'videoId'`` and ``'kw'``, where ``'kw'`` is the model's
        reply passed through ``str()``.
    """
    metadata = content['metadata']
    # The keywords are sent as their Python repr, exactly as stored.
    keywords_repr = repr(metadata['kw'][type])

    completion = await client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": keywords_repr},
        ],
        temperature=0,
    )

    reply = completion.choices[0].message.content
    return {'videoId': metadata['videoId'], 'kw': str(reply)}

In [5]:
async def process_all_contents(model, system_prompt, all_content, type, max_concurrent_tasks=10):
    """Run group_keywords over every item of ``all_content`` in batches.

    Items are scheduled in batches of ``max_concurrent_tasks``. Each batch is
    awaited as a whole, and a 5 second pause is inserted between batches
    (not after the last one).

    Args:
        model: Forwarded to group_keywords.
        system_prompt: Forwarded to group_keywords.
        all_content: Sized iterable of video records.
        type: Forwarded to group_keywords (keyword-set selector).
        max_concurrent_tasks: Batch size and semaphore limit.

    Returns:
        List of group_keywords results, in the same order as ``all_content``.
    """
    # The semaphore caps in-flight requests; with batch-wise awaiting it never
    # blocks in practice, but it is kept as in the original design.
    limiter = asyncio.Semaphore(max_concurrent_tasks)

    async def process_single_content(content):
        # Wrap group_keywords so each call runs under the semaphore.
        async with limiter:
            return await group_keywords(model, system_prompt, content, type)

    collected = []
    pending = []

    for position, item in enumerate(all_content, start=1):
        pending.append(asyncio.create_task(process_single_content(item)))

        batch_full = position % max_concurrent_tasks == 0
        is_last = position == len(all_content)
        if not (batch_full or is_last):
            continue

        # Batch boundary: wait for everything scheduled so far.
        collected += await asyncio.gather(*pending)
        pending = []

        # Pause between batches, but not after the final one.
        if not is_last:
            await asyncio.sleep(5)

    return collected

In [12]:
async def process_all_contents_loop(data, model, system_prompt):
    """Group keywords for every channel and both transcript types.

    Works on a deep copy of ``data`` and, for every video, stores the model
    replies under ``video['metadata']['gpt']`` as
    ``{'manual': <reply>, 'generated': <reply>}``.

    Args:
        data: Mapping of channel name -> list of video records. Each record
            must provide ``metadata['videoId']``.
        model: Model name forwarded to process_all_contents.
        system_prompt: System prompt forwarded to process_all_contents.

    Returns:
        The annotated deep copy; ``data`` itself is not modified.
    """
    output = copy.deepcopy(data)
    for channel, videos in output.items():
        print(f"Processing {channel}")

        # Create the result slots once per video, BEFORE looping over the
        # transcript types. Doing this inside the matching loop (as before)
        # reset the dict on the second pass and wiped the 'manual' reply.
        for video in videos:
            video['metadata']['gpt'] = {
                'generated': '',
                'manual': ''
            }

        for transcript_type in ['manual', 'generated']:
            results = await process_all_contents(model, system_prompt, videos, transcript_type)

            # Index replies by video id for O(1) matching; if an id occurs
            # more than once the last reply wins, as with the nested loops.
            kw_by_id = {result['videoId']: result['kw'] for result in results}

            for video in videos:
                video_id = video['metadata']['videoId']
                if video_id in kw_by_id:
                    video['metadata']['gpt'][transcript_type] = kw_by_id[video_id]
    return output

## gpt-4o-2024-08-06

In [13]:
# Group the keywords of both gpt-4o keyword files (transcripts and ads) with
# gpt-4o-2024-08-06. The two runs are awaited one after the other.
# The bare top-level `await` relies on the notebook's async cell support; it is
# a syntax error in a plain Python script.
model = 'gpt-4o-2024-08-06'
kw_gpt_transcripts_gpt4o = await process_all_contents_loop(transcripts_4o, model, group_kw)
kw_gpt_ads_gpt4o = await process_all_contents_loop(ads_4o, model, group_kw)

Processing SciShow
Processing Johnny Harris
Processing PBS Space Time
Processing 3Blue1Brown
Processing DamiLee
Processing Fireship
Processing SciShow
Processing Johnny Harris
Processing PBS Space Time
Processing 3Blue1Brown
Processing DamiLee
Processing Fireship


  kw_gpt_transcripts_gpt4 = await process_all_contents_loop(transcripts_4, model, group_kw)


In [21]:
# Keep deep copies of the raw model replies. The conversion cell below mutates
# the kw_gpt_* structures in place, so these snapshots preserve the originals.
ori_transcripts_4o = copy.deepcopy(kw_gpt_transcripts_gpt4o)
ori_ads_4o = copy.deepcopy(kw_gpt_ads_gpt4o)

In [43]:
# Reset the working copies from the snapshots (deep-copied again so the
# snapshots themselves stay untouched by the in-place conversion).
kw_gpt_transcripts_gpt4o = copy.deepcopy(ori_transcripts_4o)
kw_gpt_ads_gpt4o = copy.deepcopy(ori_ads_4o)

In [44]:
# Convert list representation
def _parse_keyword_list(raw):
    """Turn the model's textual list into a de-duplicated list of keywords.

    Underscores are replaced by spaces first. The text is then parsed as a
    Python literal. If that fails, the text is split on ", " after dropping
    apostrophes and the surrounding brackets. Duplicates are removed while
    keeping first-seen order, so the saved JSON is the same on every run
    (``list(set(...))`` gave a different order each time).
    """
    text = raw.replace('_', ' ')
    try:
        return list(dict.fromkeys(ast.literal_eval(text)))
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Not a valid literal (e.g. unquoted items or stray apostrophes), or
        # the parsed value is not an iterable of hashable items. Use the split
        # pieces directly: re-quoting and re-parsing them could raise again.
        pieces = text.replace("'", '').strip("[]").split(", ")
        return list(dict.fromkeys(pieces))


for data in [kw_gpt_transcripts_gpt4o, kw_gpt_ads_gpt4o]:
    for videos in data.values():
        for video in videos:
            gpt = video['metadata']['gpt']
            for transcript_type in ['manual', 'generated']:
                raw = gpt[transcript_type]
                if isinstance(raw, str) and raw:
                    gpt[transcript_type] = _parse_keyword_list(raw)
                elif not raw:
                    # Empty reply ('' / None / []) is stored as None.
                    gpt[transcript_type] = None
                # Otherwise the value was already converted by an earlier run
                # of this cell; leave it alone so re-running does not erase it.

In [41]:
# Write both processed gpt-4o datasets to the data folder as indented JSON
for out_path, payload in [
    ('/workspaces/youtube-ad-detection/data/kw_gpt_transcripts_gpt4o.json', kw_gpt_transcripts_gpt4o),
    ('/workspaces/youtube-ad-detection/data/kw_gpt_ads_gpt4o.json', kw_gpt_ads_gpt4o),
]:
    with open(out_path, 'w') as f:
        json.dump(payload, f, indent=4)

## gpt-4-turbo-2024-04-09

In [45]:
# Group the keywords of both GPT-4 keyword files (transcripts_4 / ads_4).
# NOTE(review): the section heading names gpt-4-turbo-2024-04-09, but the model
# requested here is gpt-4o-2024-08-06, the same as in the previous section.
# Confirm whether grouping is intentionally done with gpt-4o on the GPT-4
# keyword files or whether this is a copy-paste slip.
# The bare top-level `await` relies on the notebook's async cell support.
model = 'gpt-4o-2024-08-06'
kw_gpt_transcripts_gpt4 = await process_all_contents_loop(transcripts_4, model, group_kw)
kw_gpt_ads_gpt4 = await process_all_contents_loop(ads_4, model, group_kw)

Processing SciShow
Processing Johnny Harris
Processing PBS Space Time
Processing 3Blue1Brown
Processing DamiLee
Processing Fireship
Processing SciShow
Processing Johnny Harris
Processing PBS Space Time
Processing 3Blue1Brown
Processing DamiLee
Processing Fireship


In [46]:
# Convert list representation
def _parse_keyword_list(raw):
    """Turn the model's textual list into a de-duplicated list of keywords.

    Underscores are replaced by spaces first. The text is then parsed as a
    Python literal. If that fails, the text is split on ", " after dropping
    apostrophes and the surrounding brackets. Duplicates are removed while
    keeping first-seen order, which matches the de-duplication done for the
    gpt-4o results and keeps the saved JSON the same on every run.
    """
    text = raw.replace('_', ' ')
    try:
        return list(dict.fromkeys(ast.literal_eval(text)))
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Not a valid literal (e.g. unquoted items or stray apostrophes), or
        # the parsed value is not an iterable of hashable items. Use the split
        # pieces directly: re-quoting and re-parsing them could raise again.
        pieces = text.replace("'", '').strip("[]").split(", ")
        return list(dict.fromkeys(pieces))


for data in [kw_gpt_transcripts_gpt4, kw_gpt_ads_gpt4]:
    for videos in data.values():
        for video in videos:
            gpt = video['metadata']['gpt']
            for transcript_type in ['manual', 'generated']:
                raw = gpt[transcript_type]
                if isinstance(raw, str) and raw:
                    gpt[transcript_type] = _parse_keyword_list(raw)
                elif not raw:
                    # Empty reply ('' / None / []) is stored as None.
                    gpt[transcript_type] = None
                # Otherwise the value was already converted by an earlier run
                # of this cell; leave it alone so re-running does not erase it.

In [47]:
# Write both processed GPT-4 datasets to the data folder as indented JSON
for out_path, payload in [
    ('/workspaces/youtube-ad-detection/data/kw_gpt_transcripts_gpt4.json', kw_gpt_transcripts_gpt4),
    ('/workspaces/youtube-ad-detection/data/kw_gpt_ads_gpt4.json', kw_gpt_ads_gpt4),
]:
    with open(out_path, 'w') as f:
        json.dump(payload, f, indent=4)