In [1]:
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from oauth2client.tools import argparser
from youtube_transcript_api import YouTubeTranscriptApi
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

In [2]:
youtube_api_key = 'AIzaSyDeBgEti_EKwuO26al_wLBMRv0miOrt54c'

In [78]:
def build_youtube_search(developer_key):
    DEVELOPER_KEY = developer_key
    YOUTUBE_API_SERVICE_NAME="youtube"
    YOUTUBE_API_VERSION="v3"
    return build(YOUTUBE_API_SERVICE_NAME,YOUTUBE_API_VERSION,developerKey=DEVELOPER_KEY)

def get_search_response(youtube, query):
    search_items = []
    video_items = []
    INIT_REQ = True
    while len(search_items) < 1000:
        if INIT_REQ:
            search_response = youtube.search().list(
                q = query,
                order = "relevance",
                part = "snippet",
                maxResults = 50
                ).execute()
            INIT_REQ = False
        else:
            search_response = youtube.search().list(
                q = query,
                order = "relevance",
                part = "snippet",
                maxResults = 50,
                pageToken = next_page_token
                ).execute()
            
        search_items.extend(search_response['items'])
        
        video_ids = [i.get('id').get('videoId') for i in search_response['items'] if i['id']['kind'] == 'youtube#video']
        video_items.extend(youtube.videos().list(
            part = "id,contentDetails,statistics",
            id = ','.join(video_ids)).execute()['items'])
        
        try:
            next_page_token = search_response['nextPageToken']
        except KeyError:
            print('Last page. {} items collected.'.format(len(search_items)))
            break
                    
    return search_items, video_items

def get_video_info(search_items, video_items):
    result_json = {}
    idx = 0
    for item in search_items:
        if item['id']['kind'] == 'youtube#video':
            vid = item['id']['videoId']
            vitem = [item for item in video_items if item['id'] == vid][0]
            if vitem['contentDetails']['caption']:
                captions = get_captions(item['id']['videoId'])
            if captions:
                result_json[idx] = info_to_dict(item['id']['videoId'], item['snippet']['title'], item['snippet']['description'], vitem['statistics']['viewCount'], vitem['statistics']['likeCount'], captions)
                idx += 1
    return result_json

def info_to_dict(videoId, title, description, viewCount, likeCount, captions):
    result = {
        "videoId": videoId,
        "title": title,
        "description": description,
        "viewCount": viewCount,
        "likeCount": likeCount,
        "captions": captions
    }
    return result

def get_captions(vid):
    try:
        transcript_list = YouTubeTranscriptApi.list_transcripts(vid)
    except:
        return ''

    for transcript in transcript_list:
        if transcript.language_code == 'en':
            caption_script = ' '.join([t.get('text') for t in transcript.fetch()])
            continue
        if transcript.is_translatable:
            caption_script = ' '.join([t.get('text') for t in transcript.translate('en').fetch()])
        
    return caption_script

In [None]:
youtube_api = build_youtube_search(youtube_api_key)
searches, details = get_search_response(youtube_api, 'iphone 15')

videos = get_video_info(searches, details)

Last page. 656 items collected.


In [None]:
import json
with open('videos.json', 'w', encoding='utf-8') as file:
    json.dump(videos, file, indent="\t")
    
with open('videos.json', 'r') as file:
    data = json.load(file)

In [71]:
len(data)

45

In [73]:
docs = [v.get('captions') for v in videos.values()]

In [74]:
len(docs)

50

In [76]:
docs = [d for d in docs if d]
len(docs)

43

In [80]:
vectorizer_model = CountVectorizer(stop_words=stopwords.words('english') + ['iphone', '15', 'pro', 'apple', 'phone'])
topic_model = BERTopic(vectorizer_model=vectorizer_model, nr_topics=5)
topics, probs = topic_model.fit_transform(docs)

2023-09-28 22:38:18,894 - BERTopic - Transformed documents to Embeddings
2023-09-28 22:38:21,435 - BERTopic - Reduced dimensionality
2023-09-28 22:38:21,441 - BERTopic - Clustered reduced embeddings
2023-09-28 22:38:21,608 - BERTopic - Reduced number of topics from 1 to 1


In [85]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,43,-1_like_new_also_one,"[like, new, also, one, camera, max, even, righ...",[hey guys Anton Tech Chap and I'm filming this...


In [79]:
docs

["today we're going to be durability testing the new aluminum iPhone 15 and the iPhone 15 plus we already know the iPhone 15 Pro Max didn't survive may it Rest In Pieces I get a lot of questions of how I prepare for events like this and how I get enough nourishment for snapping phones in half well huge thanks to huel for sponsoring this video I lost about 15 pounds a year ago and I've kept it off ever since thanks to counting calories in just a few seconds he will black makes eating healthy easy by being exactly 400 calories of nutritionally complete plant-based proteins carbs and 27 essential vitamins and minerals the stuff humans need to survive whether I'm bending aluminum or titanium I can't do it without my vitamins and take a look at all of these good things personally I like the cookies and cream it's my absolute favorite I haven't come across one I haven't liked yet I take that back salted caramel is not on my favorites list but luckily there's eight other flavors to from and t