In [None]:
import numpy as np
import pandas as pd
import json
import os
import json
import datetime
import time
from pprint import pprint
import google_auth_oauthlib.flow
from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from youtube_transcript_api import YouTubeTranscriptApi,TranscriptsDisabled,NoTranscriptFound

import re
import concurrent.futures

In [None]:
API_SERVICE_NAME = 'youtube'
API_VERSION = 'v3'
CLIENT_SECRETS_FILE = 'client_secret.json' 

YOUTUBE_CHANNEL_ID = "UCBJycsmduvYEL83R_U4JriQ"
CSV_NAME = 'mkbhd.csv'

SCOPES = [
    'https://www.googleapis.com/auth/youtube.readonly',
    'https://www.googleapis.com/auth/youtube.force-ssl',
    'https://www.googleapis.com/auth/youtubepartner-channel-audit',
    ]

In [None]:
df = pd.DataFrame(columns=['Id', 'Title', 'Published At', 'Captions', 'View Count', 'Like Count'])

In [None]:
def get_service():
        creds = None
        # The file token.json stores the user's access and refresh tokens, and is
        # created automatically when the authorization flow completes for the first time.
        if os.path.exists('token.json'):
            creds = Credentials.from_authorized_user_file('token.json', SCOPES)
            
        # If there are no (valid) credentials available, let the user log in.
        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            else:
                flow = InstalledAppFlow.from_client_secrets_file(
                    CLIENT_SECRETS_FILE, SCOPES)
                creds = flow.run_local_server(port=0)
            # Save the credentials for the next run
            with open('token.json', 'w') as token:
                token.write(creds.to_json())

        try:
            return build(API_SERVICE_NAME, API_VERSION, credentials = creds)

        except HttpError as err:
            print(err)

In [None]:

def download_captions(item, stats):
    videoId = item['id']['videoId']
    title =  item['snippet']['title']
    publishedAt = item['snippet']['publishedAt']
    
    try:
        # Fetch the transcript
        srt = YouTubeTranscriptApi.get_transcript(videoId,languages=['en', 'en-US'])
        
        obj = {
            'id': videoId,
            'title': title,
            'publishedAt': publishedAt,
            'captions': srt
        }
        return obj, stats
        
    except TranscriptsDisabled:
        print(videoId, publishedAt, title.encode('utf-8'), '-- NO CAPTIONS')
        pass
    except NoTranscriptFound:
        print(videoId, publishedAt, title.encode('utf-8'), '-- NO CAPTIONS')
        pass
    
    return None, None


In [None]:
def saveDataToDF(item, stats):
    if(item is not None):
        df.loc[len(df.index)] = [item['id'], item['title'], item['publishedAt'], item['captions'], stats['viewCount'], stats['likeCount']]

In [None]:
def download_channel_captions(api, df, channelId, cutoff):
        publishedBefore = datetime.datetime.now() + datetime.timedelta(days=1)
        publishedBefore = publishedBefore.isoformat()+'Z'

        while True:
            futures = []
            # Get video results of the channel in batches of 50
            search_results = api.search().list(
                part="snippet",
                type="video",
                channelId=channelId,
                order='date',
                maxResults=50,
                publishedBefore=publishedBefore).execute()
            
            # Iterate through each video to fetch transcript and stats, then add to dataframe
            with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
                for search_result in search_results['items']:

                    # Note the date of publishing of earliest video in this batch, since we will fetch the next batch of videos from before it
                    currentDate=search_result['snippet']['publishedAt']
                    currentDate = datetime.datetime.strptime(currentDate,'%Y-%m-%dT%H:%M:%SZ') - datetime.timedelta(seconds=1)
                    currentDate = currentDate.isoformat()+'Z'
                    dt1 = datetime.datetime.fromisoformat(publishedBefore)
                    dt2 = datetime.datetime.fromisoformat(currentDate)

                    if(dt1 > dt2):
                        publishedBefore=currentDate

                    stats = api.videos().list(
                        id=search_result['id']['videoId'],
                        part="snippet,statistics",
                    ).execute()['items'][0]['statistics']

                    futures.append(executor.submit(download_captions, search_result, stats))

                for future in concurrent.futures.as_completed(futures):
                    try:
                        obj, stats = future.result()

                        # Save the data to the dataframe
                        saveDataToDF(obj, stats)
                    except Exception as exc:
                        print(f"Error fetching data: {exc}")
                        pass
                print(df.shape)

            # If we have reached the cutoff video number, break
            if df.shape[0] > cutoff:
                break
            if not search_results['items']:
                break


In [None]:
def summation(x):
    sum = 0
    for item in x:
        sum += item['duration']
    return sum

In [None]:
def count_words(x):
    cnt = 0
    for item in x:
        cnt += len(item['text'].split())
    return cnt

In [None]:
def get_sentences(x):
    sentences = []
    for item in x:
        sentences.append(item['text'])
    sentences = ' '.join(sentences)
    return sentences

In [None]:
def clean_data():
    df['Video Duration'] = 0
    df['CaptionsDuration'] = 0
    df['WordCount'] = 0
    
    df['Video Duration'] = df['Captions'].apply(lambda x: x[-1]['start'] + x[-1]['duration'] if x else None)
    df['CaptionsDuration'] = df['Captions'].apply(lambda x: summation(x) if x else None)
    df['WordCount'] = df['Captions'].apply(lambda x: count_words(x) if x else None)

    df['Captions'] = df['Captions'].apply(lambda x: get_sentences(x) if x else None)
    df['Captions'] = df['Captions'].apply(lambda x: x.replace('\n',' ') if x else None)
    df['Captions'] = df['Captions'].apply(lambda x: x.replace('-',' ') if x else None)
    df['Captions'] = df['Captions'].apply(lambda x: re.sub(r"\[.*?\]", "", x) if x else None)

    df['Words Per Minute'] = df['WordCount']*60/(df['CaptionsDuration'])
    df['Year'] = df['Published At'].apply(lambda x: int(datetime.datetime.strptime(x, '%Y-%m-%dT%H:%M:%SZ').year))

In [60]:
if not os.path.exists(CSV_NAME):
    api = get_service()
    download_channel_captions(api, df, YOUTUBE_CHANNEL_ID, 1100)
    clean_data()
    df.to_csv(CSV_NAME)
else:
    df = pd.read_csv(CSV_NAME)

(50, 6)
(100, 6)
(150, 6)
U3JprndeVYU 2021-06-30T17:01:49Z b'Why Everything is an Ultrabook!' -- NO CAPTIONS
ehv3zQAa9zM 2021-04-22T13:00:44Z b'Apple AirTags Unboxing &amp; Demo!' -- NO CAPTIONS
07mIwEa3xbQ 2021-04-26T23:20:48Z b'OnePlus Watch Review: They Settled!' -- NO CAPTIONS
Error fetching data: 'viewCount'
wUqZQTp_gpI 2021-04-11T22:01:55Z b'Top 5 Mercedes EQS Features: Electric Luxury!' -- NO CAPTIONS
(195, 6)
8eNNJESKjrE 2021-03-23T15:01:27Z b'OnePlus 9 Pro Review: A Huge Hasselblad Promise!' -- NO CAPTIONS
_fqxMZi7P7U 2021-01-20T00:01:24Z b'Xiaomi Mi 11: The New Normal!' -- NO CAPTIONS
l4bNwGCx1FA 2020-11-27T14:00:19Z b'The FIRST MKBHD Product: ICONS!' -- NO CAPTIONS
f4g2nPY-VZc 2020-11-17T14:01:00Z b'Apple M1 Mac Review: Time to Recalibrate!' -- NO CAPTIONS
eWI_BtcDJu0 2020-10-30T03:44:44Z b'iPhone 12 Pro Review: You Sure About That?' -- NO CAPTIONS
t0XN-dJftSU 2020-10-16T22:01:26Z b'LG Wing: The Swiveling Smartphone!' -- NO CAPTIONS
(239, 6)
VHt0LqGZVSY 2020-10-14T14:43:50Z 

In [61]:
print(df.head(5))

            Id                                              Title  \
0  GQkkHtBD1BM            Taking 1000 Steps  In Every Smartwatch!   
1  B3szaVzQx0o       Google Pixel 8A Impressions: Just Get The 8!   
2  8reaJG7z-is  I Visited Apple&#39;s Secret iPhone Testing Labs!   
3  B-d97ZrAJZ0          OnePlus 12 Review: Better Than You Think!   
4  CjWfgSrwrz0         Apple Vision Pro Hidden Feature ✈️ #protip   

           Published At                                           Captions  \
0  2024-01-19T23:44:36Z  I'm curious how accurate the step counters are...   
1  2024-05-07T16:18:33Z  (bright music)   All right, so there is just o...   
2  2024-06-07T21:31:09Z  ♪ Ah, ah, ah, yeah, yeah ♪ ♪ Ah, ah, ah, yeah,...   
3  2024-02-09T03:45:25Z  (smooth music)   All right, now that we're bac...   
4  2024-02-09T22:20:23Z  okay so one of the best features of the Vision...   

  View Count Like Count  Video Duration  CaptionsDuration  WordCount  \
0   13451274     515806          61.640     