# Ad Processing

In [67]:
import copy
import json

## Clean Transcripts

In [76]:
# Load the raw transcripts.
# The encoding is passed explicitly: JSON text is UTF-8, whereas open() would
# otherwise decode with the platform's locale encoding.
with open('/workspaces/youtube-ad-detection/data/transcripts.json', encoding='utf-8') as f:
    transcripts = json.load(f)

In [77]:
# Clean transcripts: replace the '\xa0\n' sequence with a space and the
# typographic apostrophe (U+2019) with a plain "'".
# The replacements are applied to the deep copy, so `transcripts` keeps the raw
# text and `cleaned_transcripts` (the name the later cells read) holds the
# cleaned text. Re-running this cell gives the same result.
# NOTE(review): later cells match ad text against transcript text, so the ad
# text has to be normalised the same way for that comparison to succeed.
cleaned_transcripts = copy.deepcopy(transcripts)
for name, value in cleaned_transcripts.items():
    for video in value:
        # Falsy entries (e.g. None or an empty dict) carry no transcript.
        if not video:
            continue
        for transcript_type in ['generated', 'manual']:
            segments = video.get(transcript_type)
            # Only list-valued transcripts are cleaned; anything else is left as-is.
            if isinstance(segments, list):
                for item in segments:
                    item['text'] = item['text'].replace('\xa0\n', ' ').replace('\u2019', "'")

## Process Ads
### GPT-4o

In [70]:
# Load the GPT-4o ad data.
# The encoding is passed explicitly: JSON text is UTF-8, whereas open() would
# otherwise decode with the platform's locale encoding.
with open('/workspaces/youtube-ad-detection/data/ads_gpt4o.json', encoding='utf-8') as f:
    ads = json.load(f)

In [71]:
# Collect every ad value that is a string other than the literal 'None',
# walking channel -> transcript type -> video.
string_errors = [
    entry['ad']
    for channel_ads in ads.values()
    for entries in channel_ads.values()
    for entry in entries
    if type(entry['ad']) is str and entry['ad'] != 'None'
]

In [72]:
# Fix errors: build `cleaned_ads` with the same channel -> transcript type
# layout as `ads`, keeping only the videos that have a usable ad value.
#   * the string 'None' is dropped;
#   * any other string is repaired into JSON and parsed; if parsing still
#     fails, the video is reported and left out;
#   * non-string values are kept unchanged.
cleaned_ads = {}
for name, value in ads.items():
    cleaned_ads[name] = {}
    for transcript_type, videos in value.items():
        cleaned_ads[name][transcript_type] = []
        for video in videos:
            ad = video['ad']
            if isinstance(ad, str):
                if ad == 'None':
                    continue
                reformatted = (
                    ad
                    # Has to run before the bare '\n' removal further down,
                    # otherwise this sequence can never match.
                    .replace('\xa0\n', ' ')
                    # Remove Markdown code fences. Only the fence's language
                    # tag is dropped, so "json" inside the ad text survives.
                    .replace('```json', '')
                    .replace('```', '')
                    .strip()
                )
                # A bare leading "json" tag (fence tag separated from the backticks).
                if reformatted.startswith('json'):
                    reformatted = reformatted[len('json'):]
                reformatted = (
                    reformatted
                    .replace('\n', '')
                    # Turn single-quoted keys / values into JSON double quotes.
                    .replace("'text'", '"text"')
                    .replace("'start'", '"start"')
                    .replace("'duration'", '"duration"')
                    .replace(": '", ': "')
                    .replace("', ", '", ')
                    # Merge back-to-back lists into a single list.
                    .replace('],[', ',')
                    .replace('][', ',')
                    # Kept last so the apostrophes it introduces are not
                    # picked up by the quote rewrites above.
                    .replace('\u2019', "'")
                )
                try:
                    ad = json.loads(reformatted)
                except json.JSONDecodeError as e:
                    # Report the unparseable value and skip this video.
                    print(e)
                    print(f"{name} - {transcript_type} - {video['videoId']}")
                    print(reformatted)
                    continue
            cleaned_ads[name][transcript_type].append(
                {
                    'videoId': video['videoId'],
                    'ad': ad
                }
            )

In [73]:
# Reshape: turn channel -> transcript type -> [{'videoId', 'ad'}] into
# channel -> [{'videoId', <transcript type>: ad, ...}], one dict per video.
# Videos are indexed by id in a dict (insertion-ordered) instead of re-scanning
# the output list for every video.
reshaped_ads = {}
for name, value in cleaned_ads.items():
    by_video_id = {}
    for transcript_type, videos in value.items():
        for video in videos:
            # Create the per-video dict on first sight, then attach this
            # transcript type's ad (a later duplicate overwrites an earlier one).
            merged = by_video_id.setdefault(video['videoId'], {'videoId': video['videoId']})
            merged[transcript_type] = video['ad']
    # Ensure manual and generated keys are in each dictionary
    for merged in by_video_id.values():
        merged.setdefault('manual', None)
        merged.setdefault('generated', None)
    reshaped_ads[name] = list(by_video_id.values())

In [74]:
# Label every transcript segment with whether its text appears among the ad
# segments recorded for the same video and transcript type.
#
# Result: channel -> list of
#   {'metadata': {'videoId', 'contains_ad'}, 'generated': ..., 'manual': ...}
# Every segment of a list-valued transcript gets a boolean 'ad' key (False when
# the video has no ad data), and 'contains_ad' is True if any segment matched.
def _normalise_text(text):
    """Apply the transcript clean-up replacements to `text`.

    Used only for comparison, so matching does not depend on whether either
    side has already been cleaned. Non-string values are returned unchanged.
    """
    if isinstance(text, str):
        return text.replace('\xa0\n', ' ').replace('\u2019', "'")
    return text


processed_transcripts = {}
for name, value in cleaned_transcripts.items():
    processed_transcripts[name] = []
    # Index this channel's ads by video id; a channel without ad data has none.
    ads_by_video = {ad_video['videoId']: ad_video for ad_video in reshaped_ads.get(name, [])}
    for video in value:
        if not video:
            continue
        data = {
            'metadata': {
                'videoId': video['videoId'],
                'contains_ad': False
            },
            # Deep copies: the labels written below must not leak back into
            # `cleaned_transcripts`, which is read again for other ad data.
            'generated': copy.deepcopy(video['generated']),
            'manual': copy.deepcopy(video['manual']),
        }
        ad_video = ads_by_video.get(video['videoId'], {})
        for transcript_type in ['generated', 'manual']:
            segments = data[transcript_type]
            if not isinstance(segments, list):
                continue
            ad_entries = ad_video.get(transcript_type)
            if ad_entries is None:
                ad_texts = set()
            else:
                ad_texts = {_normalise_text(entry['text']) for entry in ad_entries}
            for item in segments:
                # Always assign, so a label left over from an earlier run
                # cannot survive.
                item['ad'] = _normalise_text(item['text']) in ad_texts
                if item['ad']:
                    data['metadata']['contains_ad'] = True
        processed_transcripts[name].append(data)

In [75]:
# Per-channel summary: number of cleaned ad entries versus number of videos
# that have a (non-empty) transcript, for generated and manual transcripts.
for name in ['SciShow', 'Johnny Harris', 'PBS Space Time', '3Blue1Brown', 'DamiLee', 'Fireship']:
    channel_videos = processed_transcripts[name]
    n_generated = sum(1 for entry in channel_videos if entry and entry.get('generated'))
    n_manual = sum(1 for entry in channel_videos if entry and entry.get('manual'))
    n_generated_ads = len(cleaned_ads[name]['generated'])
    n_manual_ads = len(cleaned_ads[name]['manual'])
    # Report 0 rather than dividing by zero when there are no transcripts.
    p_generated = n_generated_ads/n_generated if n_generated != 0 else 0
    p_manual = n_manual_ads/n_manual if n_manual != 0 else 0
    diff = abs(n_generated_ads - n_manual_ads)
    print(f"{name}: Generated = {n_generated_ads}/{n_generated} ({p_generated:.2%}) Manual = {n_manual_ads}/{n_manual} ({p_manual:.2%}) Difference = {diff}")

SciShow: Generated = 23/47 (48.94%) Manual = 23/28 (82.14%) Difference = 0
Johnny Harris: Generated = 42/48 (87.50%) Manual = 41/44 (93.18%) Difference = 1
PBS Space Time: Generated = 20/44 (45.45%) Manual = 27/48 (56.25%) Difference = 7
3Blue1Brown: Generated = 0/9 (0.00%) Manual = 3/49 (6.12%) Difference = 3
DamiLee: Generated = 14/48 (29.17%) Manual = 7/9 (77.78%) Difference = 7
Fireship: Generated = 10/47 (21.28%) Manual = 0/0 (0.00%) Difference = 10


In [35]:
# Write the labelled transcripts and the reshaped ads to disk as indented JSON.
gpt4o_outputs = (
    ('/workspaces/youtube-ad-detection/data/processed_transcripts_gpt4o.json', processed_transcripts),
    ('/workspaces/youtube-ad-detection/data/processed_ads_gpt4o.json', reshaped_ads),
)
for out_path, payload in gpt4o_outputs:
    with open(out_path, 'w') as f:
        json.dump(payload, f, indent=4)

### GPT-4

In [78]:
# Load the GPT-4 ad data.
# The encoding is passed explicitly: JSON text is UTF-8, whereas open() would
# otherwise decode with the platform's locale encoding.
with open('/workspaces/youtube-ad-detection/data/ads_gpt4.json', encoding='utf-8') as f:
    ads = json.load(f)

In [79]:
# Collect every ad value that is a string other than the literal 'None',
# walking channel -> transcript type -> video.
string_errors = [
    entry['ad']
    for channel_ads in ads.values()
    for entries in channel_ads.values()
    for entry in entries
    if type(entry['ad']) is str and entry['ad'] != 'None'
]

In [80]:
# Fix errors: build `cleaned_ads` with the same channel -> transcript type
# layout as `ads`, keeping only the videos that have a usable ad value.
#   * the string 'None' is dropped;
#   * string values for the video ids listed below are dropped;
#   * any other string is repaired into JSON and parsed; if parsing still
#     fails, the video is reported and left out;
#   * non-string values are kept unchanged.

# String ad values for these videos are ignored without being parsed.
# NOTE(review): the reason is not recorded in this notebook - presumably the
# responses cannot be repaired; confirm before changing the list.
skipped_video_ids = {'15MaSayc28c', 'lkIFF4maKMU'}

cleaned_ads = {}
for name, value in ads.items():
    cleaned_ads[name] = {}
    for transcript_type, videos in value.items():
        cleaned_ads[name][transcript_type] = []
        for video in videos:
            ad = video['ad']
            if isinstance(ad, str):
                if ad == 'None' or video['videoId'] in skipped_video_ids:
                    continue
                reformatted = (
                    ad
                    # Has to run before the bare '\n' removal further down,
                    # otherwise this sequence can never match.
                    .replace('\xa0\n', ' ')
                    # Remove Markdown code fences. Only the fence's language
                    # tag is dropped, so "json" inside the ad text survives.
                    .replace('```json', '')
                    .replace('```', '')
                    .strip()
                )
                # A bare leading "json" tag (fence tag separated from the backticks).
                if reformatted.startswith('json'):
                    reformatted = reformatted[len('json'):]
                reformatted = (
                    reformatted
                    .replace('\n', '')
                    # Turn single-quoted keys / values into JSON double quotes.
                    .replace("'text'", '"text"')
                    .replace("'start'", '"start"')
                    .replace("'duration'", '"duration"')
                    .replace(": '", ': "')
                    .replace("', ", '", ')
                    # Merge back-to-back lists into a single list.
                    .replace('],[', ',')
                    .replace('][', ',')
                    # Kept last so the apostrophes it introduces are not
                    # picked up by the quote rewrites above.
                    .replace('\u2019', "'")
                )
                try:
                    ad = json.loads(reformatted)
                except json.JSONDecodeError as e:
                    # Report the unparseable value and skip this video.
                    print(e)
                    print(f"{name} - {transcript_type} - {video['videoId']}")
                    print(reformatted)
                    continue
            cleaned_ads[name][transcript_type].append(
                {
                    'videoId': video['videoId'],
                    'ad': ad
                }
            )

In [81]:
# Reshape: turn channel -> transcript type -> [{'videoId', 'ad'}] into
# channel -> [{'videoId', <transcript type>: ad, ...}], one dict per video.
# Videos are indexed by id in a dict (insertion-ordered) instead of re-scanning
# the output list for every video.
reshaped_ads = {}
for name, value in cleaned_ads.items():
    by_video_id = {}
    for transcript_type, videos in value.items():
        for video in videos:
            # Create the per-video dict on first sight, then attach this
            # transcript type's ad (a later duplicate overwrites an earlier one).
            merged = by_video_id.setdefault(video['videoId'], {'videoId': video['videoId']})
            merged[transcript_type] = video['ad']
    # Ensure manual and generated keys are in each dictionary
    for merged in by_video_id.values():
        merged.setdefault('manual', None)
        merged.setdefault('generated', None)
    reshaped_ads[name] = list(by_video_id.values())

In [82]:
# Label every transcript segment with whether its text appears among the ad
# segments recorded for the same video and transcript type.
#
# Result: channel -> list of
#   {'metadata': {'videoId', 'contains_ad'}, 'generated': ..., 'manual': ...}
# Every segment of a list-valued transcript gets a boolean 'ad' key (False when
# the video has no ad data), and 'contains_ad' is True if any segment matched.
def _normalise_text(text):
    """Apply the transcript clean-up replacements to `text`.

    Used only for comparison, so matching does not depend on whether either
    side has already been cleaned. Non-string values are returned unchanged.
    """
    if isinstance(text, str):
        return text.replace('\xa0\n', ' ').replace('\u2019', "'")
    return text


processed_transcripts = {}
for name, value in cleaned_transcripts.items():
    processed_transcripts[name] = []
    # Index this channel's ads by video id; a channel without ad data has none.
    ads_by_video = {ad_video['videoId']: ad_video for ad_video in reshaped_ads.get(name, [])}
    for video in value:
        if not video:
            continue
        data = {
            'metadata': {
                'videoId': video['videoId'],
                'contains_ad': False
            },
            # Deep copies: the labels written below must not leak back into
            # `cleaned_transcripts`, which is read again for other ad data.
            'generated': copy.deepcopy(video['generated']),
            'manual': copy.deepcopy(video['manual']),
        }
        ad_video = ads_by_video.get(video['videoId'], {})
        for transcript_type in ['generated', 'manual']:
            segments = data[transcript_type]
            if not isinstance(segments, list):
                continue
            ad_entries = ad_video.get(transcript_type)
            if ad_entries is None:
                ad_texts = set()
            else:
                ad_texts = {_normalise_text(entry['text']) for entry in ad_entries}
            for item in segments:
                # Always assign, so a label left over from an earlier run
                # cannot survive.
                item['ad'] = _normalise_text(item['text']) in ad_texts
                if item['ad']:
                    data['metadata']['contains_ad'] = True
        processed_transcripts[name].append(data)

In [83]:
# Per-channel summary: number of cleaned ad entries versus number of videos
# that have a (non-empty) transcript, for generated and manual transcripts.
for name in ['SciShow', 'Johnny Harris', 'PBS Space Time', '3Blue1Brown', 'DamiLee', 'Fireship']:
    channel_videos = processed_transcripts[name]
    n_generated = sum(1 for entry in channel_videos if entry and entry.get('generated'))
    n_manual = sum(1 for entry in channel_videos if entry and entry.get('manual'))
    n_generated_ads = len(cleaned_ads[name]['generated'])
    n_manual_ads = len(cleaned_ads[name]['manual'])
    # Report 0 rather than dividing by zero when there are no transcripts.
    p_generated = n_generated_ads/n_generated if n_generated != 0 else 0
    p_manual = n_manual_ads/n_manual if n_manual != 0 else 0
    diff = abs(n_generated_ads - n_manual_ads)
    print(f"{name}: Generated = {n_generated_ads}/{n_generated} ({p_generated:.2%}) Manual = {n_manual_ads}/{n_manual} ({p_manual:.2%}) Difference = {diff}")

SciShow: Generated = 24/47 (51.06%) Manual = 25/28 (89.29%) Difference = 1
Johnny Harris: Generated = 42/48 (87.50%) Manual = 42/44 (95.45%) Difference = 0
PBS Space Time: Generated = 26/44 (59.09%) Manual = 28/48 (58.33%) Difference = 2
3Blue1Brown: Generated = 1/9 (11.11%) Manual = 7/49 (14.29%) Difference = 6
DamiLee: Generated = 16/48 (33.33%) Manual = 7/9 (77.78%) Difference = 9
Fireship: Generated = 18/47 (38.30%) Manual = 0/0 (0.00%) Difference = 18


In [45]:
# Write the labelled transcripts and the reshaped ads to disk as indented JSON.
gpt4_outputs = (
    ('/workspaces/youtube-ad-detection/data/processed_transcripts_gpt4.json', processed_transcripts),
    ('/workspaces/youtube-ad-detection/data/processed_ads_gpt4.json', reshaped_ads),
)
for out_path, payload in gpt4_outputs:
    with open(out_path, 'w') as f:
        json.dump(payload, f, indent=4)