In [1]:
# importing the necessary libraries
import pandas as pd
import numpy as np
from dateutil import parser
import isodate

# Google API
from googleapiclient.discovery import build

In [2]:
# Change this for personal experimentation
api_key = 'AIzaSyBhoEbvc5gPclcRqj4_k7DyW5xR3_nsGiQ'

channel_ids = ['UCOQEnlUWNYMdHb0MgmcCpmg',
              'UCNhN8uvdNTY9O8liLP-94mg',
              ]

youtube = build('youtube', 'v3', developerKey=api_key)

In [3]:
def get_channel_stats(youtube, channel_ids):
    """
    Get channel statistics: title, subscriber count, view count, video count, upload playlist
    Params:
    
    youtube: the build object from googleapiclient.discovery
    channels_ids: list of channel IDs
    
    Returns:
    Dataframe containing the channel statistics for all channels in the provided list: title, subscriber count, view count, video count, upload playlist
    
    """
    all_data = []
    request = youtube.channels().list(
                part='snippet,contentDetails,statistics',
                id=','.join(channel_ids))
    response = request.execute() 
    
    for i in range(len(response['items'])):
        data = dict(channelName = response['items'][i]['snippet']['title'],
                    subscribers = response['items'][i]['statistics']['subscriberCount'],
                    views = response['items'][i]['statistics']['viewCount'],
                    totalVideos = response['items'][i]['statistics']['videoCount'],
                    playlistId = response['items'][i]['contentDetails']['relatedPlaylists']['uploads'])
        all_data.append(data)
    
    return pd.DataFrame(all_data)

def get_video_ids(youtube, playlist_id):
    """
    Get list of video IDs of all videos in the given playlist
    Params:
    
    youtube: the build object from googleapiclient.discovery
    playlist_id: playlist ID of the channel
    
    Returns:
    List of video IDs of all videos in the playlist
    
    """
    
    request = youtube.playlistItems().list(
                part='contentDetails',
                playlistId = playlist_id,
                maxResults = 50)
    response = request.execute()
    
    video_ids = []
    
    for i in range(len(response['items'])):
        video_ids.append(response['items'][i]['contentDetails']['videoId'])
        
    next_page_token = response.get('nextPageToken')
    more_pages = True
    
    while more_pages:
        if next_page_token is None:
            more_pages = False
        else:
            request = youtube.playlistItems().list(
                        part='contentDetails',
                        playlistId = playlist_id,
                        maxResults = 50,
                        pageToken = next_page_token)
            response = request.execute()
    
            for i in range(len(response['items'])):
                video_ids.append(response['items'][i]['contentDetails']['videoId'])
            
            next_page_token = response.get('nextPageToken')
        
    return video_ids

def get_video_details(youtube, video_ids):
    """
    Get video statistics of all videos with given IDs
    Params:
    
    youtube: the build object from googleapiclient.discovery
    video_ids: list of video IDs
    
    Returns:
    Dataframe with statistics of videos, i.e.:
        'channelTitle', 'title', 'description', 'tags', 'publishedAt'
        'viewCount', 'likeCount', 'favoriteCount', 'commentCount'
        'duration', 'definition', 'caption'
    """
        
    all_video_info = []
    
    for i in range(0, len(video_ids), 50):
        request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=','.join(video_ids[i:i+50])
        )
        response = request.execute() 

        for video in response['items']:
            stats_to_keep = {'snippet': ['channelTitle', 'title', 'description', 'tags', 'publishedAt'],
                             'statistics': ['viewCount', 'likeCount', 'favouriteCount', 'commentCount'],
                             'contentDetails': ['duration', 'definition', 'caption']
                            }
            video_info = {}
            video_info['video_id'] = video['id']

            for k in stats_to_keep.keys():
                for v in stats_to_keep[k]:
                    try:
                        video_info[v] = video[k][v]
                    except:
                        video_info[v] = None

            all_video_info.append(video_info)
            
    return pd.DataFrame(all_video_info)

def get_comments_in_videos(youtube, video_ids):
    """
    Get top level comments as text from all videos with given IDs (only the first 10 comments due to quote limit of Youtube API)
    Params:
    
    youtube: the build object from googleapiclient.discovery
    video_ids: list of video IDs
    
    Returns:
    Dataframe with video IDs and associated top level comment in text.
    
    """
    all_comments = []
    
    for video_id in video_ids:
        try:   
            request = youtube.commentThreads().list(
                part="snippet,replies",
                videoId=video_id
            )
            response = request.execute()
        
            comments_in_video = [comment['snippet']['topLevelComment']['snippet']['textOriginal'] for comment in response['items'][0:10]]
            comments_in_video_info = {'video_id': video_id, 'comments': comments_in_video}

            all_comments.append(comments_in_video_info)
            
        except: 
            # When error occurs - most likely because comments are disabled on a video
            print('Could not get comments for video ' + video_id)
        
    return pd.DataFrame(all_comments)     


In [4]:
channel_data = get_channel_stats(youtube, channel_ids)

In [5]:
channel_data

Unnamed: 0,channelName,subscribers,views,totalVideos,playlistId
0,UN Climate Change:Learn,6590,197562,118,UUNhN8uvdNTY9O8liLP-94mg
1,ClimateScience - Solve Climate Change,6400,421965,47,UUOQEnlUWNYMdHb0MgmcCpmg


In [6]:
import pandas as pd

video_df = pd.DataFrame()  # Initialize an empty DataFrame for video data
comments_df = pd.DataFrame()  # Initialize an empty DataFrame for comments data

for c in channel_data['channelName'].unique():
    print("Getting video information from channel: " + c)
    playlist_id = channel_data.loc[channel_data['channelName'] == c, 'playlistId'].iloc[0]
    video_ids = get_video_ids(youtube, playlist_id)
    
    # get video data
    video_data = get_video_details(youtube, video_ids)
    # get comment data
    comments_data = get_comments_in_videos(youtube, video_ids)

    # Append video data and comment data to the respective DataFrames
    video_df = pd.concat([video_df, pd.DataFrame(video_data)], ignore_index=True)
    comments_df = pd.concat([comments_df, pd.DataFrame(comments_data)], ignore_index=True)


Getting video information from channel: UN Climate Change:Learn
Could not get comments for video aTD6XozFll8
Could not get comments for video yx_VZya_7VA
Getting video information from channel: ClimateScience - Solve Climate Change
Could not get comments for video nWn90yKQFHs
Could not get comments for video u7N60AwiddI
Could not get comments for video rndxwsxrBJ0


In [7]:
video_df

Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption
0,m9D-ZM1OBFs,UN Climate Change:Learn,Malawian women in agriculture,In some African countries it is very common th...,"[#sustainabledevelopment, #sustainability, #sd...",2023-08-29T10:37:28Z,339,14,,0,PT21S,hd,false
1,o1YZynJ3p54,UN Climate Change:Learn,Youth Day - UN Climate Change: Learn #youthday...,Today is #YouthDay and we’ve got a message for...,"[environment, motivational, vision, solutions,...",2023-08-12T12:12:43Z,400,6,,0,PT1M,hd,false
2,qswkjcDHm1I,UN Climate Change:Learn,Ecosystem-based Adaptation (EbA),The importance of planning with #nature for cl...,"[#naturebasedsolutions, #climatechange #nature...",2023-08-10T07:55:14Z,133,8,,0,PT1M,hd,false
3,vjN63yduW3U,UN Climate Change:Learn,"Waste collectors in Agbogbloshie, Ghana. #shorts","📹 Waste collectors in Agbogbloshie, Ghana, loo...","[#ewaste, #recycle, recycling, #sustainability...",2023-05-25T07:55:11Z,2410,37,,0,PT18S,hd,false
4,IHzmJuyunOM,UN Climate Change:Learn,International Climate Change Negotiations_cour...,Climate negotiations define long-term cooperat...,"[climate change, climate action, green, enviro...",2023-01-27T12:41:14Z,497,4,,1,PT38S,hd,false
...,...,...,...,...,...,...,...,...,...,...,...,...,...
162,WWX5oOrUd9A,ClimateScience - Solve Climate Change,ClimateScience Olympiad,Register at https://climatescience.org/olympia...,,2021-02-16T10:40:03Z,13543,231,,21,PT1M19S,hd,true
163,XEcgieSBvjw,ClimateScience - Solve Climate Change,Carbon Tax: Would paying for CO2 emissions hur...,Can we make companies pay for climate change? ...,,2021-02-02T19:02:41Z,6190,218,,45,PT2M31S,hd,true
164,Bxd8y03zETI,ClimateScience - Solve Climate Change,How Scientists KNOW Climate Change is Real | C...,How do we know climate change is real? You’ll ...,,2021-01-18T09:30:12Z,16377,295,,42,PT3M35S,hd,true
165,myZAvqqp9Jc,ClimateScience - Solve Climate Change,Climate Change: How does it really work? | Cli...,"You’ve seen it in the news, heard it on the ra...",,2020-12-29T16:14:25Z,87578,759,,70,PT3M7S,hd,true


In [8]:
comments_df

Unnamed: 0,video_id,comments
0,m9D-ZM1OBFs,[]
1,o1YZynJ3p54,[]
2,qswkjcDHm1I,[]
3,vjN63yduW3U,[]
4,IHzmJuyunOM,[With joint efforts we can reduce the effects ...
...,...,...
157,WWX5oOrUd9A,[A general question: how is global CO2 level d...
158,XEcgieSBvjw,[This is all BS .... all this tax does is make...
159,Bxd8y03zETI,"[😂Bullshit, Warming is science. The climate ""c..."
160,myZAvqqp9Jc,[Man's CO2 contributions to our atmosphere are...


In [9]:
# Write video data to CSV file for future references
video_df.to_csv('video_data_top10_channels.csv')
comments_df.to_csv('comments_data_top10_channels.csv')

In [10]:
video_df.isnull().any()

video_id          False
channelTitle      False
title             False
description       False
tags               True
publishedAt       False
viewCount         False
likeCount         False
favouriteCount     True
commentCount       True
duration          False
definition        False
caption           False
dtype: bool

In [11]:
video_df.publishedAt.sort_values().value_counts()

publishedAt
2014-01-09T17:00:26Z    1
2021-05-17T09:43:15Z    1
2021-06-17T13:09:13Z    1
2021-06-18T13:06:16Z    1
2021-06-18T13:06:21Z    1
                       ..
2019-02-01T10:53:23Z    1
2019-03-06T16:21:15Z    1
2019-04-25T08:12:40Z    1
2019-05-22T10:18:09Z    1
2023-08-29T10:37:28Z    1
Name: count, Length: 167, dtype: int64

In [12]:
cols = ['viewCount', 'likeCount', 'commentCount']
video_df[cols] = video_df[cols].apply(pd.to_numeric, errors='coerce', axis=1)

In [13]:
# Create publish day (in the week) column
video_df['publishedAt'] =  video_df['publishedAt'].apply(lambda x: parser.parse(x)) 
video_df['pushblishDayName'] = video_df['publishedAt'].apply(lambda x: x.strftime("%A")) 

In [14]:
# convert duration to seconds
video_df['durationSecs'] = video_df['duration'].apply(lambda x: isodate.parse_duration(x))
video_df['durationSecs'] = video_df['durationSecs'].astype('timedelta64[s]')

In [15]:
# Add number of tags
video_df['tagsCount'] = video_df['tags'].apply(lambda x: 0 if x is None else len(x))

In [16]:

# Comments and likes per 1000 view ratio
video_df['likeRatio'] = video_df['likeCount']/ video_df['viewCount'] * 1000
video_df['commentRatio'] = video_df['commentCount']/ video_df['viewCount'] * 1000

In [17]:
# Title character length
video_df['titleLength'] = video_df['title'].apply(lambda x: len(x))

In [18]:
video_df.head()

Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption,pushblishDayName,durationSecs,tagsCount,likeRatio,commentRatio,titleLength
0,m9D-ZM1OBFs,UN Climate Change:Learn,Malawian women in agriculture,In some African countries it is very common th...,"[#sustainabledevelopment, #sustainability, #sd...",2023-08-29 10:37:28+00:00,339.0,14.0,,0.0,PT21S,hd,False,Tuesday,0 days 00:00:21,18,41.297935,0.0,29
1,o1YZynJ3p54,UN Climate Change:Learn,Youth Day - UN Climate Change: Learn #youthday...,Today is #YouthDay and we’ve got a message for...,"[environment, motivational, vision, solutions,...",2023-08-12 12:12:43+00:00,400.0,6.0,,0.0,PT1M,hd,False,Saturday,0 days 00:01:00,38,15.0,0.0,84
2,qswkjcDHm1I,UN Climate Change:Learn,Ecosystem-based Adaptation (EbA),The importance of planning with #nature for cl...,"[#naturebasedsolutions, #climatechange #nature...",2023-08-10 07:55:14+00:00,133.0,8.0,,0.0,PT1M,hd,False,Thursday,0 days 00:01:00,17,60.150376,0.0,32
3,vjN63yduW3U,UN Climate Change:Learn,"Waste collectors in Agbogbloshie, Ghana. #shorts","📹 Waste collectors in Agbogbloshie, Ghana, loo...","[#ewaste, #recycle, recycling, #sustainability...",2023-05-25 07:55:11+00:00,2410.0,37.0,,0.0,PT18S,hd,False,Thursday,0 days 00:00:18,20,15.352697,0.0,48
4,IHzmJuyunOM,UN Climate Change:Learn,International Climate Change Negotiations_cour...,Climate negotiations define long-term cooperat...,"[climate change, climate action, green, enviro...",2023-01-27 12:41:14+00:00,497.0,4.0,,1.0,PT38S,hd,False,Friday,0 days 00:00:38,41,8.04829,2.012072,56


In [19]:
video_df.to_csv('video_data_top10_channels.csv')