# Keyword Identification

In [1]:
import json
from keybert import KeyBERT

from functions import load_json, combine_transcripts

  from tqdm.autonotebook import tqdm, trange


## Load Processed Data

In [2]:
# Read the processed transcript and ad JSON files (gpt4o and gpt4 variants) from the data folder
transcripts_4o, transcripts_4 = map(load_json, (
    '/workspaces/youtube-ad-detection/data/processed_transcripts_gpt4o.json',
    '/workspaces/youtube-ad-detection/data/processed_transcripts_gpt4.json',
))

ads_4o, ads_4 = map(load_json, (
    '/workspaces/youtube-ad-detection/data/processed_ads_gpt4o.json',
    '/workspaces/youtube-ad-detection/data/processed_ads_gpt4.json',
))

## Combine Transcripts

In [19]:
# Transcripts: combine both datasets; print_stats=True is passed for the gpt4o set only
combined_transcripts_4o = combine_transcripts(
    transcripts_4o,
    print_stats=True,
)
combined_transcripts_4 = combine_transcripts(transcripts_4)


SciShow: Generated = 47 Manual = 28
Johnny Harris: Generated = 48 Manual = 44
PBS Space Time: Generated = 44 Manual = 48
3Blue1Brown: Generated = 9 Manual = 49
DamiLee: Generated = 48 Manual = 9
Fireship: Generated = 47 Manual = 0


In [20]:
# Ads: combine both datasets; print_stats=True is passed for the gpt4o set only
combined_ads_4o = combine_transcripts(
    ads_4o,
    print_stats=True,
)
combined_ads_4 = combine_transcripts(ads_4)

SciShow: Generated = 23 Manual = 23
Johnny Harris: Generated = 42 Manual = 41
PBS Space Time: Generated = 20 Manual = 27
3Blue1Brown: Generated = 0 Manual = 3
DamiLee: Generated = 14 Manual = 7
Fireship: Generated = 10 Manual = 0


In [21]:
# Spot-check a single entry: the 'generated' text of the Fireship ad at index 9
fireship_ads = combined_ads_4o['Fireship']
fireship_ads[9]['generated']

"Jeet brain idees sponsor today 's video legendary family integrated development environment used million developer every day famous sophisticated code completion integrated tooling web developer 'm big fan webstorm contains everything 'll need craziness JavaScript development understand write code even faster thanks AI assistant 's programming optimized model offer suggestion describe code write code understanding context large project importantly IDE reliable fun use 's got intuitive minimal interface ton power hood n't need install bunch plugins start productive best try Jeet brain 's idees favorite programming language 30-day free trial use link description deson let 's talk"

## Find Keywords

In [22]:
# Create a KeyBERT model using the constructor's default arguments.
# find_kw reads this module-level `kw_model`, so this cell must run before find_kw is called.
kw_model = KeyBERT()

In [34]:
def find_kw(data, ngram_range=(1, 2), topn=5, model=None):
    """Extract the top keyphrases of every video's manual and generated text.

    Parameters
    ----------
    data : dict
        Maps a channel name to a list of video dicts. Each video dict is read
        for its 'manual' and 'generated' values and for either a 'metadata'
        dict or a 'videoId'.
    ngram_range : tuple
        Forwarded to ``extract_keywords`` as ``keyphrase_ngram_range``.
    topn : int
        Forwarded to ``extract_keywords`` as ``top_n``.
    model : object, optional
        Any object exposing a KeyBERT-style ``extract_keywords`` method.
        Defaults to the notebook-level ``kw_model``.

    Returns
    -------
    dict
        Maps each channel to a list of ``{'metadata', 'manual', 'generated'}``
        dicts, where ``metadata['kw']`` holds the keyphrase lists under
        'generated' and 'manual'. A keyphrase list is None when the
        corresponding text is missing or empty.

    Notes
    -----
    * Empty / None video entries are skipped.
    * When a video already carries a 'metadata' dict, that dict is updated in
      place with the new 'kw' entry (so the input data is modified).
    """
    if model is None:
        # Fall back to the model created at notebook level.
        model = kw_model

    def _keyphrases(text):
        """Return the keyphrases found in `text`, or None when there is no text."""
        if not text:
            return None
        scored = model.extract_keywords(
            text,
            keyphrase_ngram_range=ngram_range,
            top_n=topn,
            stop_words='english',
        )
        # Each result item starts with the keyphrase; keep only that part.
        return [item[0] for item in scored]

    kw_data = {}
    for channel, videos in data.items():
        kw_data[channel] = []
        for video in videos:
            if not video:
                # Nothing to extract from an empty entry.
                continue

            # Both values are computed fresh for every video so that a video
            # without a transcript never inherits the previous video's keywords.
            l_manual = _keyphrases(video['manual'])
            l_generated = _keyphrases(video['generated'])
            kw = {
                'generated': l_generated,
                'manual': l_manual
            }

            if 'metadata' in video:
                metadata = video['metadata']
                metadata['kw'] = kw
            else:
                metadata = {
                    'videoId': video['videoId'],
                    'kw': kw
                }

            kw_data[channel].append(
                {
                    'metadata': metadata,
                    'manual': video['manual'],
                    'generated': video['generated']
                }
            )
    return kw_data

In [35]:
# Transcripts: run keyword extraction over the gpt4o set, then the gpt4 set
kw_transcripts_4o, kw_transcripts_4 = map(
    find_kw,
    (combined_transcripts_4o, combined_transcripts_4),
)

In [37]:
# Ads: run keyword extraction over the gpt4o set, then the gpt4 set
kw_ads_4o, kw_ads_4 = map(
    find_kw,
    (combined_ads_4o, combined_ads_4),
)

In [38]:
# Write the gpt4o keyword results (transcripts first, then ads) as indented JSON
for out_path, payload in (
    ('/workspaces/youtube-ad-detection/data/kw_transcripts_gpt4o.json', kw_transcripts_4o),
    ('/workspaces/youtube-ad-detection/data/kw_ads_gpt4o.json', kw_ads_4o),
):
    with open(out_path, 'w') as out_file:
        json.dump(payload, out_file, indent=4)

In [39]:
# Write the gpt4 keyword results (transcripts first, then ads) as indented JSON
for out_path, payload in (
    ('/workspaces/youtube-ad-detection/data/kw_transcripts_gpt4.json', kw_transcripts_4),
    ('/workspaces/youtube-ad-detection/data/kw_ads_gpt4.json', kw_ads_4),
):
    with open(out_path, 'w') as out_file:
        json.dump(payload, out_file, indent=4)