In [1]:
# importing the necessary libraries
import pandas as pd
import numpy as np
from dateutil import parser
import isodate

# Google API
from googleapiclient.discovery import build

In [2]:
api_key = 'AIzaSyBhoEbvc5gPclcRqj4_k7DyW5xR3_nsGiQ'

channel_ids = ['UCLXo7UDZvByw2ixzpQCufnA', #VOX
              'UCCu5wtZ5uOWZp_roz7wHPfg', #Climate Adam
               'UCFH5dQAkGIqzcFYmM4tNtXw' #Now This Earth
              ]

youtube = build('youtube', 'v3', developerKey=api_key)

In [3]:
def get_channel_stats(youtube, channel_ids):
    """
    Get channel statistics: title, subscriber count, view count, video count, upload playlist
    Params:
    
    youtube: the build object from googleapiclient.discovery
    channels_ids: list of channel IDs
    
    Returns:
    Dataframe containing the channel statistics for all channels in the provided list: title, subscriber count, view count, video count, upload playlist
    
    """
    all_data = []
    request = youtube.channels().list(
                part='snippet,contentDetails,statistics',
                id=','.join(channel_ids))
    response = request.execute() 
    
    for i in range(len(response['items'])):
        data = dict(channelName = response['items'][i]['snippet']['title'],
                    subscribers = response['items'][i]['statistics']['subscriberCount'],
                    views = response['items'][i]['statistics']['viewCount'],
                    totalVideos = response['items'][i]['statistics']['videoCount'],
                    playlistId = response['items'][i]['contentDetails']['relatedPlaylists']['uploads'])
        all_data.append(data)
    
    return pd.DataFrame(all_data)

def get_video_ids(youtube, playlist_id):
    """
    Get list of video IDs of all videos in the given playlist
    Params:
    
    youtube: the build object from googleapiclient.discovery
    playlist_id: playlist ID of the channel
    
    Returns:
    List of video IDs of all videos in the playlist
    
    """
    
    request = youtube.playlistItems().list(
                part='contentDetails',
                playlistId = playlist_id,
                maxResults = 50)
    response = request.execute()
    
    video_ids = []
    
    for i in range(len(response['items'])):
        video_ids.append(response['items'][i]['contentDetails']['videoId'])
        
    next_page_token = response.get('nextPageToken')
    more_pages = True
    
    while more_pages:
        if next_page_token is None:
            more_pages = False
        else:
            request = youtube.playlistItems().list(
                        part='contentDetails',
                        playlistId = playlist_id,
                        maxResults = 50,
                        pageToken = next_page_token)
            response = request.execute()
    
            for i in range(len(response['items'])):
                video_ids.append(response['items'][i]['contentDetails']['videoId'])
            
            next_page_token = response.get('nextPageToken')
        
    return video_ids

def get_video_details(youtube, video_ids):
    """
    Get video statistics of all videos with given IDs
    Params:
    
    youtube: the build object from googleapiclient.discovery
    video_ids: list of video IDs
    
    Returns:
    Dataframe with statistics of videos, i.e.:
        'channelTitle', 'title', 'description', 'tags', 'publishedAt'
        'viewCount', 'likeCount', 'favoriteCount', 'commentCount'
        'duration', 'definition', 'caption'
    """
        
    all_video_info = []
    
    for i in range(0, len(video_ids), 50):
        request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=','.join(video_ids[i:i+50])
        )
        response = request.execute() 

        for video in response['items']:
            stats_to_keep = {'snippet': ['channelTitle', 'title', 'description', 'tags', 'publishedAt'],
                             'statistics': ['viewCount', 'likeCount', 'favouriteCount', 'commentCount'],
                             'contentDetails': ['duration', 'definition', 'caption']
                            }
            video_info = {}
            video_info['video_id'] = video['id']

            for k in stats_to_keep.keys():
                for v in stats_to_keep[k]:
                    try:
                        video_info[v] = video[k][v]
                    except:
                        video_info[v] = None

            all_video_info.append(video_info)
            
    return pd.DataFrame(all_video_info)

def get_comments_in_videos(youtube, video_ids):
    """
    Get top level comments as text from all videos with given IDs (only the first 10 comments due to quote limit of Youtube API)
    Params:
    
    youtube: the build object from googleapiclient.discovery
    video_ids: list of video IDs
    
    Returns:
    Dataframe with video IDs and associated top level comment in text.
    
    """
    all_comments = []
    
    for video_id in video_ids:
        try:   
            request = youtube.commentThreads().list(
                part="snippet,replies",
                videoId=video_id
            )
            response = request.execute()
        
            comments_in_video = [comment['snippet']['topLevelComment']['snippet']['textOriginal'] for comment in response['items'][0:10]]
            comments_in_video_info = {'video_id': video_id, 'comments': comments_in_video}

            all_comments.append(comments_in_video_info)
            
        except: 
            # When error occurs - most likely because comments are disabled on a video
            print('Could not get comments for video ' + video_id)
        
    return pd.DataFrame(all_comments)     


In [4]:
channel_data = get_channel_stats(youtube, channel_ids)

In [5]:
channel_data

Unnamed: 0,channelName,subscribers,views,totalVideos,playlistId
0,Vox,11600000,3343326311,1650,UULXo7UDZvByw2ixzpQCufnA
1,ClimateAdam,39300,1420910,160,UUCu5wtZ5uOWZp_roz7wHPfg
2,NowThis Earth,213000,42243801,1873,UUFH5dQAkGIqzcFYmM4tNtXw


In [6]:
import pandas as pd

video_df = pd.DataFrame()  # Initialize an empty DataFrame for video data
comments_df = pd.DataFrame()  # Initialize an empty DataFrame for comments data

for c in channel_data['channelName'].unique():
    print("Getting video information from channel: " + c)
    playlist_id = channel_data.loc[channel_data['channelName'] == c, 'playlistId'].iloc[0]
    video_ids = get_video_ids(youtube, playlist_id)
    
    # get video data
    video_data = get_video_details(youtube, video_ids)
    # get comment data
    comments_data = get_comments_in_videos(youtube, video_ids)

    # Append video data and comment data to the respective DataFrames
    video_df = pd.concat([video_df, pd.DataFrame(video_data)], ignore_index=True)
    comments_df = pd.concat([comments_df, pd.DataFrame(comments_data)], ignore_index=True)


Getting video information from channel: Vox
Could not get comments for video V4j606F6mvo
Could not get comments for video QkD8wsiB-6c
Could not get comments for video R2karaKGgkk
Could not get comments for video yeaQUhAOdtk
Could not get comments for video 12HDvYRYp9w
Could not get comments for video R63DdEe_8aM
Could not get comments for video UpmwhkNg5Dw
Could not get comments for video 8_NITp2JgvU
Could not get comments for video 80hc9dV5St0
Could not get comments for video STff_wOQHn4
Could not get comments for video wAQyHqm9STo
Could not get comments for video e9OsIj32w7U
Could not get comments for video o_-1GRDRPfU
Could not get comments for video 5Z101RchIBA
Could not get comments for video sxHu46YKnZg
Could not get comments for video UKlyYvJJF5s
Could not get comments for video TXjU4l5wrkk
Could not get comments for video 9kaSKoBb7ew
Could not get comments for video ZdFFL9wNsaY
Could not get comments for video -_fqI0JNU6g
Could not get comments for video ot3dT7c7jiE
Could not g

HttpError: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/playlistItems?part=contentDetails&playlistId=UUCu5wtZ5uOWZp_roz7wHPfg&maxResults=50&key=AIzaSyBhoEbvc5gPclcRqj4_k7DyW5xR3_nsGiQ&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.". Details: "[{'message': 'The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.', 'domain': 'youtube.quota', 'reason': 'quotaExceeded'}]">

In [7]:
video_df

Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption
0,UWhuHiL8Pug,Vox,Why everyone hates this concrete building,And why brutalism dominates US college campuse...,"[vox.com, vox, explain, brutalism, le corbusie...",2023-11-06T13:00:31Z,61121,3168,,303,PT8M3S,hd,true
1,z-eC9eoaRm4,Vox,How China is designing flood-resistant cities ...,These “sponge city” designs resist floods and ...,"[vox.com, vox, explain, climate, flooding, spo...",2023-10-31T15:49:59Z,366119,17032,,326,PT58S,hd,false
2,CoBeQzc4vQc,Vox,How The Conjuring became the Marvel of horror,The Warrens' case files have helped create a s...,"[Amityville Horror, Conjuring cinematic univer...",2023-10-31T12:00:14Z,152555,6427,,223,PT8M41S,hd,true
3,lmVakNtCYwQ,Vox,Why this 13th-century song shows up everywhere...,Linked to our full video to learn more!,"[vox.com, vox, explain, creepy melody, dies ir...",2023-10-30T18:00:48Z,184288,12327,,192,PT1M,hd,true
4,lv1SpwwJEW8,Vox,"Gaza, explained",Why Palestinians in Gaza have suffered for dec...,"[Gaza Strip, Palestinians, Vox.com, West Bank,...",2023-10-28T01:32:40Z,2731939,83514,,11998,PT15M51S,hd,true
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1640,UwKZVPYssOg,Vox,Stop freaking out about the debt,Why the national debt isn't as scary as you th...,"[public debt, vox, conversations, series, expl...",2014-03-26T17:47:02Z,230225,2087,,661,PT1M36S,hd,true
1641,m6C8soxbN2w,Vox,How is the world getting better? Charles Kenny...,Senior Fellow at the Center for Global Develop...,"[explainer, global development, vox explains, ...",2014-03-24T22:43:43Z,29057,456,,28,PT5M49S,hd,false
1642,eA151sBcCag,Vox,Is another mass extinction on the horizon?,There have been five mass extinctions in Earth...,"[us, ezra klein, international, gas, whale, vi...",2014-03-20T20:43:17Z,178809,2115,,223,PT18M15S,hd,false
1643,D8n8gYVdThg,Vox,How Obamacare's individual mandate works,Vox explains how does the individual mandate w...,"[current events, president obama, Individual M...",2014-03-19T16:23:52Z,165278,1746,,286,PT2M12S,hd,true


In [8]:
comments_df

Unnamed: 0,video_id,comments
0,UWhuHiL8Pug,[Architecture is fascinating .. everything in ...
1,z-eC9eoaRm4,[Interesting design.. No wonder china is flood...
2,CoBeQzc4vQc,[The filmmakers behind The Conjuring really le...
3,lmVakNtCYwQ,"[Making christmas, 🎵 di-es i-rae 🎵\n\n\n\n\nMa..."
4,lv1SpwwJEW8,[For additional context on the UN’s partition ...
...,...,...
59,orakE9t1tpo,"[There is one, slightly fantastical way the US..."
60,Ml-ZP-_e_o4,[One other kind of building with obvious cladd...
61,0o6ezu_h6iE,"[The image you see at 1:24 of a real, live “Sm..."
62,xXpB9FNmEOs,"[And the winner is..... VICE + HBO!!!!, So, no..."


In [12]:
# Write video data to CSV file for future references
video_df.to_csv('video_data_top10_channels.csv')
comments_df.to_csv('comments_data_top10_channels.csv')

In [13]:
video_df.isnull().any()

video_id          False
channelTitle      False
title             False
description       False
tags               True
publishedAt       False
viewCount          True
likeCount          True
favouriteCount     True
commentCount       True
duration          False
definition        False
caption           False
dtype: bool

In [14]:
video_df.publishedAt.sort_values().value_counts()

publishedAt
2020-02-27T13:00:00Z    2
2014-03-17T21:19:31Z    1
2019-12-18T13:00:03Z    1
2020-01-08T13:01:42Z    1
2020-01-08T13:01:14Z    1
                       ..
2017-04-18T13:29:21Z    1
2017-04-17T13:25:56Z    1
2017-04-14T14:48:18Z    1
2017-04-13T16:11:38Z    1
2023-11-06T13:00:31Z    1
Name: count, Length: 1644, dtype: int64

In [16]:
cols = ['viewCount', 'likeCount', 'commentCount']
video_df[cols] = video_df[cols].apply(pd.to_numeric, errors='coerce', axis=1)

In [17]:
# Create publish day (in the week) column
video_df['publishedAt'] =  video_df['publishedAt'].apply(lambda x: parser.parse(x)) 
video_df['pushblishDayName'] = video_df['publishedAt'].apply(lambda x: x.strftime("%A")) 

In [18]:
# convert duration to seconds
video_df['durationSecs'] = video_df['duration'].apply(lambda x: isodate.parse_duration(x))
video_df['durationSecs'] = video_df['durationSecs'].astype('timedelta64[s]')

In [19]:
# Add number of tags
video_df['tagsCount'] = video_df['tags'].apply(lambda x: 0 if x is None else len(x))

In [20]:

# Comments and likes per 1000 view ratio
video_df['likeRatio'] = video_df['likeCount']/ video_df['viewCount'] * 1000
video_df['commentRatio'] = video_df['commentCount']/ video_df['viewCount'] * 1000

In [21]:
# Title character length
video_df['titleLength'] = video_df['title'].apply(lambda x: len(x))

In [22]:
video_df.head()

Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption,pushblishDayName,durationSecs,tagsCount,likeRatio,commentRatio,titleLength
0,UWhuHiL8Pug,Vox,Why everyone hates this concrete building,And why brutalism dominates US college campuse...,"[vox.com, vox, explain, brutalism, le corbusie...",2023-11-06 13:00:31+00:00,61121.0,3168.0,,303.0,PT8M3S,hd,True,Monday,0 days 00:08:03,20,51.831613,4.95738,41
1,z-eC9eoaRm4,Vox,How China is designing flood-resistant cities ...,These “sponge city” designs resist floods and ...,"[vox.com, vox, explain, climate, flooding, spo...",2023-10-31 15:49:59+00:00,366119.0,17032.0,,326.0,PT58S,hd,False,Tuesday,0 days 00:00:58,13,46.520394,0.890421,52
2,CoBeQzc4vQc,Vox,How The Conjuring became the Marvel of horror,The Warrens' case files have helped create a s...,"[Amityville Horror, Conjuring cinematic univer...",2023-10-31 12:00:14+00:00,152555.0,6427.0,,223.0,PT8M41S,hd,True,Tuesday,0 days 00:08:41,29,42.129068,1.461768,45
3,lmVakNtCYwQ,Vox,Why this 13th-century song shows up everywhere...,Linked to our full video to learn more!,"[vox.com, vox, explain, creepy melody, dies ir...",2023-10-30 18:00:48+00:00,184288.0,12327.0,,192.0,PT1M,hd,True,Monday,0 days 00:01:00,16,66.889868,1.041848,54
4,lv1SpwwJEW8,Vox,"Gaza, explained",Why Palestinians in Gaza have suffered for dec...,"[Gaza Strip, Palestinians, Vox.com, West Bank,...",2023-10-28 01:32:40+00:00,2731939.0,83514.0,,11998.0,PT15M51S,hd,True,Saturday,0 days 00:15:51,29,30.569497,4.391753,15


In [23]:
video_df.to_csv('video_data_top10_channels.csv')