In [64]:
import pandas as pd
import numpy as np
from dateutil import parser
import isodate
import time

# Data visualization libraries
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
sns.set(style="darkgrid", color_codes=True)

# Google API
from googleapiclient.discovery import build

In [72]:
# NLP libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\coron\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\coron\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# 2. Data creation with Youtube API

I first created a project on Google Developers Console, then requested an authorization credential (API key). Afterwards, I enabled Youtube API for my application, so that I can send API requests to Youtube API services. Then, I went on Youtube and checked the channel ID of each of the channels that I would like to include in my research scope (using their URLs). Then I created the functions for getting the channel statistics via the API.

In [79]:
api_key = 'AIzaSyBLc4kT4cab8zMHrkMzBHM6p2Uz20loIAI' 

channel_ids = ['UCX6OQ3DkcsbYNE6H8uQQuVA',]
#MR_BEAST=UCX6OQ3DkcsbYNE6H8uQQuVA
#PEWDIEPIE=UC-lHJZR3Gqxm24_Vd_AJ5Yw
#C0C0MELON=UCbCmjCuTUZos6Inko4u57UQ

youtube = build('youtube', 'v3', developerKey=api_key)

In [80]:
def get_channel_stats(youtube, channel_ids):
    """
    Get channel statistics: title, subscriber count, view count, video count, upload playlist
    Params:
    
    youtube: the build object from googleapiclient.discovery
    channels_ids: list of channel IDs
    
    Returns:
    Dataframe containing the channel statistics for all channels in the provided list: title, subscriber count, view count, video count, upload playlist
    
    """
    all_data = []
    request = youtube.channels().list(
                part='snippet,contentDetails,statistics',
                id=','.join(channel_ids))
    response = request.execute() 
    
    for i in range(len(response['items'])):
        data = dict(channelName = response['items'][i]['snippet']['title'],
                    subscribers = response['items'][i]['statistics']['subscriberCount'],
                    views = response['items'][i]['statistics']['viewCount'],
                    totalVideos = response['items'][i]['statistics']['videoCount'],
                    playlistId = response['items'][i]['contentDetails']['relatedPlaylists']['uploads'])
        all_data.append(data)
    
    return pd.DataFrame(all_data)

def get_video_ids(youtube, playlist_id):
    """
    Get list of video IDs of all videos in the given playlist
    Params:
    
    youtube: the build object from googleapiclient.discovery
    playlist_id: playlist ID of the channel
    
    Returns:
    List of video IDs of all videos in the playlist
    
    """
    
    request = youtube.playlistItems().list(
                part='contentDetails',
                playlistId = playlist_id,
                maxResults = 50)
    response = request.execute()
    
    video_ids = []
    
    for i in range(len(response['items'])):
        video_ids.append(response['items'][i]['contentDetails']['videoId'])
        
    next_page_token = response.get('nextPageToken')
    more_pages = True
    
    while more_pages:
        if next_page_token is None:
            more_pages = False
        else:
            request = youtube.playlistItems().list(
                        part='contentDetails',
                        playlistId = playlist_id,
                        maxResults = 50,
                        pageToken = next_page_token)
            response = request.execute()
    
            for i in range(len(response['items'])):
                video_ids.append(response['items'][i]['contentDetails']['videoId'])
            
            next_page_token = response.get('nextPageToken')
        
    return video_ids

def get_video_details(youtube, video_ids):
    """
    Get video statistics of all videos with given IDs
    Params:
    
    youtube: the build object from googleapiclient.discovery
    video_ids: list of video IDs
    
    Returns:
    Dataframe with statistics of videos, i.e.:
        'channelTitle', 'title', 'description', 'tags', 'publishedAt'
        'viewCount', 'likeCount', 'favoriteCount', 'commentCount'
        'duration', 'definition', 'caption'
    """
        
    all_video_info = []
    
    for i in range(0, len(video_ids), 50):
        request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=','.join(video_ids[i:i+50])
        )
        response = request.execute() 

        for video in response['items']:
            stats_to_keep = {'snippet': ['channelTitle', 'title', 'description', 'tags', 'publishedAt'],
                             'statistics': ['viewCount', 'likeCount', 'favouriteCount', 'commentCount'],
                             'contentDetails': ['duration', 'definition', 'caption']
                            }
            video_info = {}
            video_info['video_id'] = video['id']

            for k in stats_to_keep.keys():
                for v in stats_to_keep[k]:
                    try:
                        video_info[v] = video[k][v]
                    except:
                        video_info[v] = None

            all_video_info.append(video_info)
            
    return pd.DataFrame(all_video_info)

def get_comments_in_videos(youtube, video_ids):
    """
    Get top level comments as text from all videos with given IDs (only the first 10 comments due to quote limit of Youtube API)
    Params:
    
    youtube: the build object from googleapiclient.discovery
    video_ids: list of video IDs
    
    Returns:
    Dataframe with video IDs and associated top level comment in text.
    
    """
    all_comments = []
    
    for video_id in video_ids:
        try:   
            request = youtube.commentThreads().list(
                part="snippet,replies",
                videoId=video_id
            )
            response = request.execute()
        
            comments_in_video = [comment['snippet']['topLevelComment']['snippet']['textOriginal'] for comment in response['items'][0:10]]
            comments_in_video_info = {'video_id': video_id, 'comments': comments_in_video}

            all_comments.append(comments_in_video_info)
            
        except: 
            # When error occurs - most likely because comments are disabled on a video
            print('Could not get comments for video ' + video_id)
        
    return pd.DataFrame(all_comments)     

### Get channel statistics

Using the `get_channel_stats` function defined below, now we are going to obtain the channel statistics for the 9 channels in scope.

In [81]:
channel_data = get_channel_stats(youtube, channel_ids)

Now I can print out the data and take a look at the channel statistics overview.

In [82]:
channel_data

Unnamed: 0,channelName,subscribers,views,totalVideos,playlistId
0,MrBeast,203000000,35244839851,761,UUX6OQ3DkcsbYNE6H8uQQuVA


I noticed the count columns in `channel_data` is currently in string format, so I will convert them into numeric so that we can visualize and do numeric operations on them.

In [83]:
# Convert count columns to numeric columns
numeric_cols = ['subscribers', 'views', 'totalVideos']
channel_data[numeric_cols] = channel_data[numeric_cols].apply(pd.to_numeric, errors='coerce')

Let's take a look at the number of subscribers per channel to have a view of how popular the channels are when compared with one another.

### Get video statistics for all the channels

In the next step, we will obtain the video statistics for all the channels.

In [84]:
# Create a dataframe with video statistics and comments from all channels
Total = channel_data['totalVideos'].sum()

video_df = pd.DataFrame()
comments_df = pd.DataFrame()
tic = time.perf_counter()
for c in channel_data['channelName'].unique():
    print("Getting video information from channel: " + c)
    playlist_id = channel_data.loc[channel_data['channelName']== c, 'playlistId'].iloc[0]
    video_ids = get_video_ids(youtube, playlist_id)
    
    # get video data
    video_data = get_video_details(youtube, video_ids)
    # get comment data
    comments_data = get_comments_in_videos(youtube, video_ids)

    # append video data together and comment data toghether
    video_df = video_df._append(video_data, ignore_index=True)
    comments_df = comments_df._append(comments_data, ignore_index=True)
toc = time.perf_counter()
print(f"Obtained statistics in {toc - tic:0.4f} seconds for {Total} videos")

Getting video information from channel: MrBeast
Could not get comments for video AS5CxLCWq-Q
Obtained statistics in 158.2259 seconds for 761 videos


In [86]:
video_df

Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption
0,jGAJCAuV3pQ,MrBeast,World’s Most Expensive Coffee,,,2023-10-19T17:00:03Z,20347931,1418403,,9226,PT33S,hd,false
1,c0v8OPItCOg,MrBeast,"$100,000,000 Bathroom",,,2023-10-17T17:00:00Z,33835270,2325420,,8142,PT50S,hd,false
2,3ryID_SwU5E,MrBeast,"$1 Vs $100,000,000 House!",I can’t believe how expensive the last house i...,,2023-10-14T16:00:00Z,107015289,4119510,,121498,PT17M36S,hd,true
3,IemzxkkzI5s,MrBeast,I Tipped A Pizza Delivery Driver A Car,,,2023-10-12T17:30:03Z,93048737,6723184,,13569,PT52S,hd,false
4,3OFj6l2tQ9s,MrBeast,World's Most Dangerous Trap!,I can’t believe how crazy the last room is\nGr...,,2023-10-07T16:00:00Z,131231167,5107893,,147172,PT19M48S,hd,true
...,...,...,...,...,...,...,...,...,...,...,...,...,...
756,7qj3nuF9Dzw,MrBeast,Most Epic minecraft skin EVER (Psy),Psy in minecraft!!! drop a like for psy's mo...,"[psy, minecraft, epic, skin, most, ever]",2013-01-13T01:59:21Z,815035,32191,,3078,PT31S,hd,false
757,Y74b7WlcEpk,MrBeast,More birds IN MINECRAFT!!,Basically what this mod does is adds more bird...,"[birds, minecraft, in, more, must, see, epic]",2013-01-12T23:35:45Z,945776,37543,,3368,PT2M6S,hd,false
758,Z8nEEdXTaX0,MrBeast,Boxy item mod Minecraft. EPIC,At the begining i said i was mrbeast6000.... i...,"[boxy, item, mod, minecraft, epic]",2013-01-12T22:34:11Z,1122431,44400,,4042,PT1M30S,hd,false
759,jP82d277Cc8,MrBeast,Harry Potter Mod In Minecraft! EPIC MUST SEE M...,One of the coolest mods i have ever seen\n\nMo...,"[Harry Potter minecraft, minecraft, minecraft ...",2012-03-09T23:29:03Z,3998009,,,8010,PT3M59S,hd,false


In [87]:
comments_df

Unnamed: 0,video_id,comments
0,jGAJCAuV3pQ,"[😅, Jeff, Who thing they should do richest pri..."
1,c0v8OPItCOg,"[Cuánto vendes el último😊, A egggg😂😂, QUE 🧍🏻‍♀..."
2,3ryID_SwU5E,[Make sure you/your parents give away Feastabl...
3,IemzxkkzI5s,[El saj legendario es canon y demasiado mas op...
4,3OFj6l2tQ9s,"[Subscribe so we can hit 200M!, no replies?, r..."
...,...,...
755,7qj3nuF9Dzw,"[Don’t ask, Omg, I've gone down enough, Imagín..."
756,Y74b7WlcEpk,"[I just got home from baseball practice lol, I..."
757,Z8nEEdXTaX0,[I bought a new mic but it turned out to be wo...
758,jP82d277Cc8,[I remember filming this with my horrible lapt...


In [88]:
# Write video data to CSV file for future references
video_df.to_csv('video_data.csv')
comments_df.to_csv('comments_data.csv')

## Preprocessing & Feature engineering

To be able to make use of the data for analysis, we need to perform a few pre-processing steps. Firstly, I would like reformat some columns, especially the date and time columns such as "pushlishedAt" and "duration". In addition, I also think it is necessary to enrich the data with some new features that might be useful for understanding the videos' characteristics.

### Check for empty values

In [89]:
video_df.isnull().any()

video_id          False
channelTitle      False
title             False
description       False
tags               True
publishedAt       False
viewCount         False
likeCount          True
favouriteCount     True
commentCount       True
duration          False
definition        False
caption           False
dtype: bool

In [90]:
video_df.publishedAt.sort_values().value_counts()

publishedAt
2012-02-20T22:42:32Z    1
2018-04-04T22:19:00Z    1
2018-03-01T23:09:03Z    1
2018-03-03T23:21:02Z    1
2018-03-08T00:13:02Z    1
                       ..
2015-07-21T18:51:04Z    1
2015-07-22T17:59:43Z    1
2015-07-23T10:00:00Z    1
2015-07-24T10:00:00Z    1
2023-10-19T17:00:03Z    1
Name: count, Length: 761, dtype: int64

Next, we need to check if the data type of the columns are correct. I have checked the data types and indeed some count columns such as view count and comment count are currently not in correct data type. In this step, we convert these count columns into integer.

In [91]:
cols = ['viewCount', 'likeCount', 'commentCount']
video_df[cols] = video_df[cols].apply(pd.to_numeric, errors='coerce', axis=1)

### Enriching data

I want to enrich the data for further analyses, for example:

- create published date column with another column showing the day in the week the video was published, which will be useful for later analysis.

- convert video duration to seconds instead of the current default string format

- calculate number of tags for each video

- calculate comments and likes per 1000 view ratio

- calculate title character length

In [92]:
# Create publish day (in the week) column
video_df['publishedAt'] =  video_df['publishedAt'].apply(lambda x: parser.parse(x)) 
video_df['pushblishDayName'] = video_df['publishedAt'].apply(lambda x: x.strftime("%A")) 

In [93]:
# convert duration to seconds
video_df['durationSecs'] = video_df['duration'].apply(lambda x: isodate.parse_duration(x))
video_df['durationSecs'] = video_df['durationSecs'].astype('timedelta64[s]')

In [94]:
# Add number of tags
video_df['tagsCount'] = video_df['tags'].apply(lambda x: 0 if x is None else len(x))

In [95]:
# Comments and likes per 1000 view ratio
video_df['likeRatio'] = video_df['likeCount']/ video_df['viewCount'] * 1000
video_df['commentRatio'] = video_df['commentCount']/ video_df['viewCount'] * 1000

In [96]:
# Title character length
video_df['titleLength'] = video_df['title'].apply(lambda x: len(x))

Let's look at the video dataset at this point to see if everything went well. It looks good - now we will proceed to exploratory analysis part.

In [97]:
df = video_df.drop('favouriteCount', axis=1)
df.head()

Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,commentCount,duration,definition,caption,pushblishDayName,durationSecs,tagsCount,likeRatio,commentRatio,titleLength
0,jGAJCAuV3pQ,MrBeast,World’s Most Expensive Coffee,,,2023-10-19 17:00:03+00:00,20347931.0,1418403.0,9226.0,PT33S,hd,False,Thursday,0 days 00:00:33,0,69.70748,0.453412,29
1,c0v8OPItCOg,MrBeast,"$100,000,000 Bathroom",,,2023-10-17 17:00:00+00:00,33835270.0,2325420.0,8142.0,PT50S,hd,False,Tuesday,0 days 00:00:50,0,68.727692,0.240636,21
2,3ryID_SwU5E,MrBeast,"$1 Vs $100,000,000 House!",I can’t believe how expensive the last house i...,,2023-10-14 16:00:00+00:00,107015289.0,4119510.0,121498.0,PT17M36S,hd,True,Saturday,0 days 00:17:36,0,38.494593,1.135333,25
3,IemzxkkzI5s,MrBeast,I Tipped A Pizza Delivery Driver A Car,,,2023-10-12 17:30:03+00:00,93048737.0,6723184.0,13569.0,PT52S,hd,False,Thursday,0 days 00:00:52,0,72.254436,0.145827,38
4,3OFj6l2tQ9s,MrBeast,World's Most Dangerous Trap!,I can’t believe how crazy the last room is\nGr...,,2023-10-07 16:00:00+00:00,131231167.0,5107893.0,147172.0,PT19M48S,hd,True,Saturday,0 days 00:19:48,0,38.922865,1.121471,28


In [98]:
#updated csv
# Write video data to CSV file for future references
video_df.to_csv('video_data_UP.csv')