In [22]:
import pandas as pd
import numpy as np
from dateutil import parser
import isodate
import time

# Data visualization libraries
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
sns.set(style="darkgrid", color_codes=True)

# Google API
from googleapiclient.discovery import build

In [23]:
# NLP libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\coron\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\coron\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# 2. Data creation with Youtube API

I first created a project on Google Developers Console, then requested an authorization credential (API key). Afterwards, I enabled Youtube API for my application, so that I can send API requests to Youtube API services. Then, I went on Youtube and checked the channel ID of each of the channels that I would like to include in my research scope (using their URLs). Then I created the functions for getting the channel statistics via the API.

In [24]:
api_key = 'AIzaSyAhAjl9v2pxr5M6i-HIIgXhM-sQmtj93E8' 

channel_ids = ['UCbCmjCuTUZos6Inko4u57UQ',]
#MR_BEAST=UCX6OQ3DkcsbYNE6H8uQQuVA
#PEWDIEPIE=UC-lHJZR3Gqxm24_Vd_AJ5Yw
#C0C0MELON=UCbCmjCuTUZos6Inko4u57UQ

youtube = build('youtube', 'v3', developerKey=api_key)

In [25]:
def get_channel_stats(youtube, channel_ids):
    """
    Get channel statistics: title, subscriber count, view count, video count, upload playlist
    Params:
    
    youtube: the build object from googleapiclient.discovery
    channels_ids: list of channel IDs
    
    Returns:
    Dataframe containing the channel statistics for all channels in the provided list: title, subscriber count, view count, video count, upload playlist
    
    """
    all_data = []
    request = youtube.channels().list(
                part='snippet,contentDetails,statistics',
                id=','.join(channel_ids))
    response = request.execute() 
    
    for i in range(len(response['items'])):
        data = dict(channelName = response['items'][i]['snippet']['title'],
                    subscribers = response['items'][i]['statistics']['subscriberCount'],
                    views = response['items'][i]['statistics']['viewCount'],
                    totalVideos = response['items'][i]['statistics']['videoCount'],
                    playlistId = response['items'][i]['contentDetails']['relatedPlaylists']['uploads'])
        all_data.append(data)
    
    return pd.DataFrame(all_data)

def get_video_ids(youtube, playlist_id):
    """
    Get list of video IDs of all videos in the given playlist
    Params:
    
    youtube: the build object from googleapiclient.discovery
    playlist_id: playlist ID of the channel
    
    Returns:
    List of video IDs of all videos in the playlist
    
    """
    
    request = youtube.playlistItems().list(
                part='contentDetails',
                playlistId = playlist_id,
                maxResults = 50)
    response = request.execute()
    
    video_ids = []
    
    for i in range(len(response['items'])):
        video_ids.append(response['items'][i]['contentDetails']['videoId'])
        
    next_page_token = response.get('nextPageToken')
    more_pages = True
    
    while more_pages:
        if next_page_token is None:
            more_pages = False
        else:
            request = youtube.playlistItems().list(
                        part='contentDetails',
                        playlistId = playlist_id,
                        maxResults = 50,
                        pageToken = next_page_token)
            response = request.execute()
    
            for i in range(len(response['items'])):
                video_ids.append(response['items'][i]['contentDetails']['videoId'])
            
            next_page_token = response.get('nextPageToken')
        
    return video_ids

def get_video_details(youtube, video_ids):
    """
    Get video statistics of all videos with given IDs
    Params:
    
    youtube: the build object from googleapiclient.discovery
    video_ids: list of video IDs
    
    Returns:
    Dataframe with statistics of videos, i.e.:
        'channelTitle', 'title', 'description', 'tags', 'publishedAt'
        'viewCount', 'likeCount', 'favoriteCount', 'commentCount'
        'duration', 'definition', 'caption'
    """
        
    all_video_info = []
    
    for i in range(0, len(video_ids), 50):
        request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=','.join(video_ids[i:i+50])
        )
        response = request.execute() 

        for video in response['items']:
            stats_to_keep = {'snippet': ['channelTitle', 'title', 'description', 'tags', 'publishedAt'],
                             'statistics': ['viewCount', 'likeCount', 'favouriteCount', 'commentCount'],
                             'contentDetails': ['duration', 'definition', 'caption']
                            }
            video_info = {}
            video_info['video_id'] = video['id']

            for k in stats_to_keep.keys():
                for v in stats_to_keep[k]:
                    try:
                        video_info[v] = video[k][v]
                    except:
                        video_info[v] = None

            all_video_info.append(video_info)
            
    return pd.DataFrame(all_video_info)

def get_comments_in_videos(youtube, video_ids):
    """
    Get top level comments as text from all videos with given IDs (only the first 10 comments due to quote limit of Youtube API)
    Params:
    
    youtube: the build object from googleapiclient.discovery
    video_ids: list of video IDs
    
    Returns:
    Dataframe with video IDs and associated top level comment in text.
    
    """
    all_comments = []
    
    for video_id in video_ids:
        try:   
            request = youtube.commentThreads().list(
                part="snippet,replies",
                videoId=video_id
            )
            response = request.execute()
        
            comments_in_video = [comment['snippet']['topLevelComment']['snippet']['textOriginal'] for comment in response['items'][0:10]]
            comments_in_video_info = {'video_id': video_id, 'comments': comments_in_video}

            all_comments.append(comments_in_video_info)
            
        except: 
            # When error occurs - most likely because comments are disabled on a video
            print('Could not get comments for video ' + video_id)
        
    return pd.DataFrame(all_comments)     

### Get channel statistics

Using the `get_channel_stats` function defined below, now we are going to obtain the channel statistics for the 9 channels in scope.

In [26]:
channel_data = get_channel_stats(youtube, channel_ids)

Now I can print out the data and take a look at the channel statistics overview.

In [27]:
channel_data

Unnamed: 0,channelName,subscribers,views,totalVideos,playlistId
0,Cocomelon - Nursery Rhymes,166000000,170365245506,1021,UUbCmjCuTUZos6Inko4u57UQ


I noticed the count columns in `channel_data` is currently in string format, so I will convert them into numeric so that we can visualize and do numeric operations on them.

In [28]:
# Convert count columns to numeric columns
numeric_cols = ['subscribers', 'views', 'totalVideos']
channel_data[numeric_cols] = channel_data[numeric_cols].apply(pd.to_numeric, errors='coerce')

Let's take a look at the number of subscribers per channel to have a view of how popular the channels are when compared with one another.

### Get video statistics for all the channels

In the next step, we will obtain the video statistics for all the channels.

In [29]:
# Create a dataframe with video statistics and comments from all channels
Total = channel_data['totalVideos'].sum()

video_df = pd.DataFrame()
comments_df = pd.DataFrame()
tic = time.perf_counter()
for c in channel_data['channelName'].unique():
    print("Getting video information from channel: " + c)
    playlist_id = channel_data.loc[channel_data['channelName']== c, 'playlistId'].iloc[0]
    video_ids = get_video_ids(youtube, playlist_id)
    
    # get video data
    video_data = get_video_details(youtube, video_ids)
    # get comment data
    comments_data = get_comments_in_videos(youtube, video_ids)

    # append video data together and comment data toghether
    video_df = video_df._append(video_data, ignore_index=True)
    comments_df = comments_df._append(comments_data, ignore_index=True)
toc = time.perf_counter()
print(f"Obtained statistics in {toc - tic:0.4f} seconds for {Total} videos")

Getting video information from channel: Cocomelon - Nursery Rhymes
Could not get comments for video P7dqib_1D74
Could not get comments for video WPiMSL8RZok
Could not get comments for video E6nD3mIA-ro
Could not get comments for video 5n_VLmrXFwo
Could not get comments for video yTe-SP4qRQQ
Could not get comments for video 75hzv1-EELc
Could not get comments for video r-ai2Co9cEo
Could not get comments for video LsIxm0w_-ss
Could not get comments for video 0GAhfV_5Ic0
Could not get comments for video XUfz66gv1WQ
Could not get comments for video jf3DffO0WVY
Could not get comments for video lVlj0QxpEJo
Could not get comments for video DEIDOw_sFI4
Could not get comments for video DidWsh298E0
Could not get comments for video hz6dDpxDI1A
Could not get comments for video zvPpy8omGWQ
Could not get comments for video C3O8mBxCArI
Could not get comments for video ev59AUFhM_c
Could not get comments for video qgCJwb8ZNL4
Could not get comments for video bDEqQi6x09M
Could not get comments for video 

In [30]:
video_df

Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption
0,P7dqib_1D74,Cocomelon - Nursery Rhymes,Finger Family Halloween + Trick or Treat Song ...,It's almost Halloween! Everybody do the MUMMY ...,"[Halloween Costume for kids, JJ, abckidtv, bab...",2023-10-21T07:00:10Z,751726,5415,,0,PT1H1M51S,hd,false
1,WPiMSL8RZok,Cocomelon - Nursery Rhymes,Let's Catch Autumn Leaves! @CoComelon #Shorts ...,Happy Fall from @CoComelon! Let's catch the fa...,"[cocomelon, abckidtv, nursery rhymes, children...",2023-10-20T07:00:13Z,417066,11877,,0,PT28S,hd,false
2,E6nD3mIA-ro,Cocomelon - Nursery Rhymes,Mermaid at the Beach Song | CoComelon Nursery ...,Can you swim like a mermaid and chomp like a b...,"[JJ, abckidtv, alexa play cocomelon, baby song...",2023-10-17T07:00:06Z,5665817,21400,,0,PT3M,hd,false
3,5n_VLmrXFwo,Cocomelon - Nursery Rhymes,Halloween Costume Song + Wheels on the Bus Hal...,Get your Halloween costumes READY! Happy Hallo...,"[Halloween Costume for kids, JJ, abckidtv, bab...",2023-10-14T07:00:07Z,3729699,19015,,0,PT47M23S,hd,false
4,yTe-SP4qRQQ,Cocomelon - Nursery Rhymes,Puppy Pretend Time 🐶! @CoComelon#Shorts #puppy...,JJ and Nina pretend to be puppies! First you h...,"[cocomelon, abckidtv, nursery rhymes, children...",2023-10-13T07:00:25Z,3123687,84313,,0,PT21S,hd,false
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1017,tVpgEiBcw7M,Cocomelon - Nursery Rhymes,"Learn the ABCs: ""P"" is for Pig and Penguin","Featuring the letter ""P""! \nThis series goes t...","[baby songs, sing-along, abckidtv, preschool, ...",2007-06-20T03:41:46Z,9765764,5077,,0,PT1M31S,sd,false
1018,7W6fEFixi5U,Cocomelon - Nursery Rhymes,"Learn the ABCs: ""L"" is for Lion and Ladybug","Featuring the letter ""L""! \nThis series goes t...","[baby songs, kids education, kindergarten, chi...",2007-06-20T03:34:33Z,24753403,21503,,0,PT1M48S,sd,false
1019,cgC8BC1OINQ,Cocomelon - Nursery Rhymes,"Learn the ABCs: ""K"" is for Kangaroo","Featuring the letter ""K""! \nThis series goes t...","[cocomelon, kids entertainment, kindergarten, ...",2007-06-20T01:31:32Z,8865285,4567,,0,PT2M13S,sd,false
1020,0fw3l1z9CUQ,Cocomelon - Nursery Rhymes,ABC Song with Cute Ending,This ABC Song is one of the most popular ABC s...,"[toddler, children songs, abckidtv, kids video...",2006-09-02T01:34:53Z,298119514,332847,,0,PT45S,sd,false


In [31]:
comments_df

In [32]:
# Write video data to CSV file for future references
video_df.to_csv('video_data.csv')
comments_df.to_csv('comments_data.csv')

## Preprocessing & Feature engineering

To be able to make use of the data for analysis, we need to perform a few pre-processing steps. Firstly, I would like reformat some columns, especially the date and time columns such as "pushlishedAt" and "duration". In addition, I also think it is necessary to enrich the data with some new features that might be useful for understanding the videos' characteristics.

### Check for empty values

In [33]:
video_df.isnull().any()

video_id          False
channelTitle      False
title             False
description       False
tags               True
publishedAt       False
viewCount         False
likeCount         False
favouriteCount     True
commentCount      False
duration          False
definition        False
caption           False
dtype: bool

In [34]:
video_df.publishedAt.sort_values().value_counts()

publishedAt
2006-09-01T23:39:09Z    1
2021-09-04T07:00:02Z    1
2021-08-05T07:00:08Z    1
2021-08-07T07:00:08Z    1
2021-08-10T07:00:30Z    1
                       ..
2018-08-21T07:00:05Z    1
2018-08-24T07:00:10Z    1
2018-08-28T07:00:02Z    1
2018-09-04T07:00:02Z    1
2023-10-21T07:00:10Z    1
Name: count, Length: 1022, dtype: int64

Next, we need to check if the data type of the columns are correct. I have checked the data types and indeed some count columns such as view count and comment count are currently not in correct data type. In this step, we convert these count columns into integer.

In [35]:
cols = ['viewCount', 'likeCount', 'commentCount']
video_df[cols] = video_df[cols].apply(pd.to_numeric, errors='coerce', axis=1)

### Enriching data

I want to enrich the data for further analyses, for example:

- create published date column with another column showing the day in the week the video was published, which will be useful for later analysis.

- convert video duration to seconds instead of the current default string format

- calculate number of tags for each video

- calculate comments and likes per 1000 view ratio

- calculate title character length

In [36]:
# Create publish day (in the week) column
video_df['publishedAt'] =  video_df['publishedAt'].apply(lambda x: parser.parse(x)) 
video_df['pushblishDayName'] = video_df['publishedAt'].apply(lambda x: x.strftime("%A")) 

In [37]:
# convert duration to seconds
video_df['durationSecs'] = video_df['duration'].apply(lambda x: isodate.parse_duration(x))
video_df['durationSecs'] = video_df['durationSecs'].astype('timedelta64[s]')

In [38]:
# Add number of tags
video_df['tagsCount'] = video_df['tags'].apply(lambda x: 0 if x is None else len(x))

In [39]:
# Comments and likes per 1000 view ratio
video_df['likeRatio'] = video_df['likeCount']/ video_df['viewCount'] * 1000
video_df['commentRatio'] = video_df['commentCount']/ video_df['viewCount'] * 1000

In [40]:
# Title character length
video_df['titleLength'] = video_df['title'].apply(lambda x: len(x))

Let's look at the video dataset at this point to see if everything went well. It looks good - now we will proceed to exploratory analysis part.

In [41]:
df = video_df.drop('favouriteCount', axis=1)
df.head()

Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,commentCount,duration,definition,caption,pushblishDayName,durationSecs,tagsCount,likeRatio,commentRatio,titleLength
0,P7dqib_1D74,Cocomelon - Nursery Rhymes,Finger Family Halloween + Trick or Treat Song ...,It's almost Halloween! Everybody do the MUMMY ...,"[Halloween Costume for kids, JJ, abckidtv, bab...",2023-10-21 07:00:10+00:00,751726,5415,0,PT1H1M51S,hd,False,Saturday,0 days 01:01:51,31,7.203423,0.0,90
1,WPiMSL8RZok,Cocomelon - Nursery Rhymes,Let's Catch Autumn Leaves! @CoComelon #Shorts ...,Happy Fall from @CoComelon! Let's catch the fa...,"[cocomelon, abckidtv, nursery rhymes, children...",2023-10-20 07:00:13+00:00,417066,11877,0,PT28S,hd,False,Friday,0 days 00:00:28,28,28.477507,0.0,60
2,E6nD3mIA-ro,Cocomelon - Nursery Rhymes,Mermaid at the Beach Song | CoComelon Nursery ...,Can you swim like a mermaid and chomp like a b...,"[JJ, abckidtv, alexa play cocomelon, baby song...",2023-10-17 07:00:06+00:00,5665817,21400,0,PT3M,hd,False,Tuesday,0 days 00:03:00,33,3.777037,0.0,65
3,5n_VLmrXFwo,Cocomelon - Nursery Rhymes,Halloween Costume Song + Wheels on the Bus Hal...,Get your Halloween costumes READY! Happy Hallo...,"[Halloween Costume for kids, JJ, abckidtv, bab...",2023-10-14 07:00:07+00:00,3729699,19015,0,PT47M23S,hd,False,Saturday,0 days 00:47:23,34,5.098267,0.0,84
4,yTe-SP4qRQQ,Cocomelon - Nursery Rhymes,Puppy Pretend Time 🐶! @CoComelon#Shorts #puppy...,JJ and Nina pretend to be puppies! First you h...,"[cocomelon, abckidtv, nursery rhymes, children...",2023-10-13 07:00:25+00:00,3123687,84313,0,PT21S,hd,False,Friday,0 days 00:00:21,28,26.991501,0.0,57


In [42]:
#updated csv
# Write video data to CSV file for future references
video_df.to_csv('video_data_UP.csv')