# Thematic Analysis

In [1]:
import json
import pandas as pd
import numpy as np
from bertopic import BERTopic

from functions import load_json, convert_to_df

In [2]:
# Load json file from data folder
transcripts_4o = load_json('/workspaces/youtube-ad-detection/data/kw_gpt_transcripts_gpt4o.json')
transcripts_4 = load_json('/workspaces/youtube-ad-detection/data/kw_gpt_transcripts_gpt4.json')

ads_4o = load_json('/workspaces/youtube-ad-detection/data/kw_gpt_ads_gpt4o.json')
ads_4 = load_json('/workspaces/youtube-ad-detection/data/kw_gpt_ads_gpt4.json')

## GPT-4o

In [109]:
transcripts_4o_kw_generated = convert_to_df(transcripts_4o)
ads_4o_kw_generated = convert_to_df(ads_4o)

df_4o = pd.merge(transcripts_4o_kw_generated, ads_4o_kw_generated, on=['video_id'], how='left').replace({None: np.nan})

In [83]:
topic_model4o = BERTopic()
topics4o, probs4o = topic_model4o.fit_transform(df_4o['gpt_generated_x'].dropna().str.join(' '))
topic_model_output4o = topic_model4o.get_topic_info()
topic_model_output4o

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,63,-1_quantum_cancer_probability_pregnancy,"[quantum, cancer, probability, pregnancy, japa...","[probability cancer, probability cancer, proba..."
1,0,69,0_architecture_design_development_career,"[architecture, design, development, career, pr...","[skill development architecture career, archit..."
2,1,49,1_universe_space_cosmic_hawking,"[universe, space, cosmic, hawking, time, matte...","[time space reality experience universe, expan..."
3,2,32,2_land_conflict_war_banana,"[land, conflict, war, banana, iceland, canada,...","[georgia russia relations georgia land path, i..."
4,3,18,3_fish_characteristics_food_jellyfish,"[fish, characteristics, food, jellyfish, prefe...","[turtle food preference, turtle food preferenc..."
5,4,17,4_javascript_typescript_rocket_react,"[javascript, typescript, rocket, react, propul...","[rocket types rocket propulsion, rocket types ..."
6,5,12,5_hash_billion_cryptocurrency_magnet,"[hash, billion, cryptocurrency, magnet, vitali...","[billion hash, billion hash, billion hash]"
7,6,12,6_operations_circle_calculation_average,"[operations, circle, calculation, average, orb...","[circle operations, circle operations, circle ..."
8,7,12,7_music_surreal_math_joma,"[music, surreal, math, joma, magnetism, magnet...","[music surreal math, music surreal math, music..."
9,8,11,8_transformations_linear_conceptualize_vector,"[transformations, linear, conceptualize, vecto...",[linear transformations conceptualize transfor...


In [84]:
ads_topic_model4o = BERTopic()
ads_topics4o, ads_probs4o = ads_topic_model4o.fit_transform(df_4o['gpt_generated_y'].dropna().str.join(' '))
ads_topic_model_output4o = ads_topic_model4o.get_topic_info()
ads_topic_model_output4o

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,43,-1_therapy_services_products_shaving,"[therapy, services, products, shaving, therapi...","[therapy, therapy services therapist selection..."
1,0,42,0_learning_education_course_skillshare,"[learning, education, course, skillshare, scie...","[sketching education online learning, data cou..."
2,1,27,1_video_pbs_content_nebula,"[video, pbs, content, nebula, platform, spacet...","[nebula content nebula platform, video content..."
3,2,11,2_rock_subscription_mineral_sign,"[rock, subscription, mineral, sign, membership...","[rock subscription rock sign mineral, rock sub..."


In [86]:
topic_model_output4o.to_excel('data/TopicModel4o.xlsx', index=False)
ads_topic_model_output4o.to_excel('data/TopicModelAds4o.xlsx', index=False)

In [90]:
df_topics = topic_model4o.get_document_info(df_4o['gpt_generated_x'].dropna().str.join(' '))

In [91]:
df_ads = ads_topic_model4o.get_document_info(df_4o['gpt_generated_y'].dropna().str.join(' '))

In [101]:
transcript_mapping = pd.read_excel('data/TopicModel4o consolidated.xlsx', sheet_name='transcript')
ad_mapping = pd.read_excel('data/TopicModel4o consolidated.xlsx', sheet_name='ads')

In [110]:
df_4o['transcript_topics'] = df_4o['gpt_generated_x'].str.join(' ')
df_4o['ad_topics'] = df_4o['gpt_generated_y'].str.join(' ')
df_4o = pd.merge(df_4o, df_topics[['Document', 'Topic']], left_on ='transcript_topics', right_on='Document', how='left')
df_4o = pd.merge(df_4o, df_ads[['Document', 'Topic']], left_on ='ad_topics', right_on='Document', how='left')
df_4o = pd.merge(df_4o, transcript_mapping[['Label', 'Topic']], left_on ='Topic_x', right_on='Topic', how='left', suffixes=('_x1', '_y1'))
df_4o = pd.merge(df_4o, ad_mapping[['Label', 'Topic']], left_on ='Topic_y', right_on='Topic', how='left', suffixes=('_x2', '_y2'))

In [112]:
df_4o.to_excel('data/TopicModel4oLabelled.xlsx', index=False)

In [117]:
print(f"KeyBERT: {len(sum(df_4o['kw_generated_x'].dropna(), []))}")
print(f"ChatGPT: {len(sum(df_4o['gpt_generated_x'].dropna(), []))}")

KeyBERT: 3103
ChatGPT: 1241


In [119]:
print(f"KeyBERT: {len(sum(df_4o['kw_generated_y'].dropna(), []))}")
print(f"ChatGPT: {len(sum(df_4o['gpt_generated_y'].dropna(), []))}")

KeyBERT: 1020
ChatGPT: 377


## GPT-4

In [54]:
transcripts_4_kw_generated = convert_to_df(transcripts_4)
ads_4_kw_generated = convert_to_df(ads_4)

df_4 = pd.merge(transcripts_4_kw_generated, ads_4_kw_generated, on=['video_id'], how='left').replace({None: np.nan})