In [1]:
import pandas as pd
import numpy as np
from dateutil import parser
import isodate
import time

# Data visualization libraries
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
sns.set(style="darkgrid", color_codes=True)

# Google API
from googleapiclient.discovery import build

In [2]:
# NLP libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/daniel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/daniel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
api_key = 'AIzaSyAhAjl9v2pxr5M6i-HIIgXhM-sQmtj93E8' 

channel_ids = ['UC4-79UOlP48-QNGgCko5p2g',]
#MR_BEAST=UCX6OQ3DkcsbYNE6H8uQQuVA
#PEWDIEPIE=UC-lHJZR3Gqxm24_Vd_AJ5Yw
#C0C0MELON=UCbCmjCuTUZos6Inko4u57UQ
# DaFuq!?Boom! = UCsSsgPaZ2GSmO6il8Cb5iGA
# mrbeast2 = UC4-79UOlP48-QNGgCko5p2g

youtube = build('youtube', 'v3', developerKey=api_key)

In [4]:
def get_channel_stats(youtube, channel_ids):
    """
    Get channel statistics: title, subscriber count, view count, video count, upload playlist
    Params:
    
    youtube: the build object from googleapiclient.discovery
    channels_ids: list of channel IDs
    
    Returns:
    Dataframe containing the channel statistics for all channels in the provided list: title, subscriber count, view count, video count, upload playlist
    
    """
    all_data = []
    request = youtube.channels().list(
                part='snippet,contentDetails,statistics',
                id=','.join(channel_ids))
    response = request.execute() 
    
    for i in range(len(response['items'])):
        data = dict(channelName = response['items'][i]['snippet']['title'],
                    subscribers = response['items'][i]['statistics']['subscriberCount'],
                    views = response['items'][i]['statistics']['viewCount'],
                    totalVideos = response['items'][i]['statistics']['videoCount'],
                    playlistId = response['items'][i]['contentDetails']['relatedPlaylists']['uploads'])
        all_data.append(data)
    
    return pd.DataFrame(all_data)

def get_video_ids(youtube, playlist_id):
    """
    Get list of video IDs of all videos in the given playlist
    Params:
    
    youtube: the build object from googleapiclient.discovery
    playlist_id: playlist ID of the channel
    
    Returns:
    List of video IDs of all videos in the playlist
    
    """
    
    request = youtube.playlistItems().list(
                part='contentDetails',
                playlistId = playlist_id,
                maxResults = 50)
    response = request.execute()
    
    video_ids = []
    
    for i in range(len(response['items'])):
        video_ids.append(response['items'][i]['contentDetails']['videoId'])
        
    next_page_token = response.get('nextPageToken')
    more_pages = True
    
    while more_pages:
        if next_page_token is None:
            more_pages = False
        else:
            request = youtube.playlistItems().list(
                        part='contentDetails',
                        playlistId = playlist_id,
                        maxResults = 50,
                        pageToken = next_page_token)
            response = request.execute()
    
            for i in range(len(response['items'])):
                video_ids.append(response['items'][i]['contentDetails']['videoId'])
            
            next_page_token = response.get('nextPageToken')
        
    return video_ids

def get_video_details(youtube, video_ids):
    """
    Get video statistics of all videos with given IDs
    Params:
    
    youtube: the build object from googleapiclient.discovery
    video_ids: list of video IDs
    
    Returns:
    Dataframe with statistics of videos, i.e.:
        'channelTitle', 'title', 'description', 'tags', 'publishedAt'
        'viewCount', 'likeCount', 'favoriteCount', 'commentCount'
        'duration', 'definition', 'caption'
    """
        
    all_video_info = []
    
    for i in range(0, len(video_ids), 50):
        request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=','.join(video_ids[i:i+50])
        )
        response = request.execute() 

        for video in response['items']:
            stats_to_keep = {'snippet': ['channelTitle', 'title', 'description', 'tags', 'publishedAt'],
                             'statistics': ['viewCount', 'likeCount', 'favouriteCount', 'commentCount'],
                             'contentDetails': ['duration', 'definition', 'caption']
                            }
            video_info = {}
            video_info['video_id'] = video['id']

            for k in stats_to_keep.keys():
                for v in stats_to_keep[k]:
                    try:
                        video_info[v] = video[k][v]
                    except:
                        video_info[v] = None

            all_video_info.append(video_info)
            
    return pd.DataFrame(all_video_info)

def get_comments_in_videos(youtube, video_ids):
    """
    Get top level comments as text from all videos with given IDs (only the first 10 comments due to quote limit of Youtube API)
    Params:
    
    youtube: the build object from googleapiclient.discovery
    video_ids: list of video IDs
    
    Returns:
    Dataframe with video IDs and associated top level comment in text.
    
    """
    all_comments = []
    
    for video_id in video_ids:
        try:   
            request = youtube.commentThreads().list(
                part="snippet,replies",
                videoId=video_id
            )
            response = request.execute()
        
            comments_in_video = [comment['snippet']['topLevelComment']['snippet']['textOriginal'] for comment in response['items'][0:10]]
            comments_in_video_info = {'video_id': video_id, 'comments': comments_in_video}

            all_comments.append(comments_in_video_info)
            
        except: 
            # When error occurs - most likely because comments are disabled on a video
            print('Could not get comments for video ' + video_id)
        
    return pd.DataFrame(all_comments)     

### Get channel statistics

Using the `get_channel_stats` function defined below, now we are going to obtain the channel statistics for the 9 channels in scope.

In [5]:
channel_data = get_channel_stats(youtube, channel_ids)

Now I can print out the data and take a look at the channel statistics overview.

In [6]:
channel_data

Unnamed: 0,channelName,subscribers,views,totalVideos,playlistId
0,MrBeast 2,33900000,5726096120,114,UU4-79UOlP48-QNGgCko5p2g


I noticed the count columns in `channel_data` is currently in string format, so I will convert them into numeric so that we can visualize and do numeric operations on them.

In [7]:
# Convert count columns to numeric columns
numeric_cols = ['subscribers', 'views', 'totalVideos']
channel_data[numeric_cols] = channel_data[numeric_cols].apply(pd.to_numeric, errors='coerce')

Let's take a look at the number of subscribers per channel to have a view of how popular the channels are when compared with one another.

### Get video statistics for all the channels

In the next step, we will obtain the video statistics for all the channels.

In [8]:
# Create a dataframe with video statistics and comments from all channels
Total = channel_data['totalVideos'].sum()

video_df = pd.DataFrame()
comments_df = pd.DataFrame()
tic = time.perf_counter()
for c in channel_data['channelName'].unique():
    print("Getting video information from channel: " + c)
    playlist_id = channel_data.loc[channel_data['channelName']== c, 'playlistId'].iloc[0]
    video_ids = get_video_ids(youtube, playlist_id)
    
    # get video data
    video_data = get_video_details(youtube, video_ids)
    # get comment data
    comments_data = get_comments_in_videos(youtube, video_ids)

    # append video data together and comment data toghether
    video_df = video_df._append(video_data, ignore_index=True)
    comments_df = comments_df._append(comments_data, ignore_index=True)
toc = time.perf_counter()
print(f"Obtained statistics in {toc - tic:0.4f} seconds for {Total} videos")

Getting video information from channel: MrBeast 2
Could not get comments for video bl-bNgiy13Q
Obtained statistics in 27.8710 seconds for 114 videos


In [9]:
video_df

Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption
0,RKT7xqPDy74,MrBeast 2,"I Asked Mark Cuban For $1,000,000",,,2023-10-22T17:00:00Z,13735868,990258,,3927,PT19S,hd,false
1,SHOV58tdNiI,MrBeast 2,Miranda Cosgrove Said What?,,,2023-10-15T17:00:00Z,57338629,3562307,,21805,PT15S,hd,false
2,SGzWIoLAzmY,MrBeast 2,"Feeding A Cat $10 Vs $10,000 Sushi",,,2023-08-31T17:00:00Z,145417200,7194440,,16227,PT31S,hd,false
3,sHX6pzhdCP0,MrBeast 2,"$30,000 Relay Race",Play Stumble Guys Now!:\n\nhttps://stumbleguys...,,2023-08-13T17:20:26Z,6501124,471223,,2527,PT42S,hd,false
4,ykRq_rp7NhM,MrBeast 2,I Got Naruto to Subscribe to Me,,,2023-05-20T20:00:00Z,127882331,6737523,,15606,PT20S,hd,true
...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,wdznN3-h_5Y,MrBeast 2,"Launching a $40,000 Firework!","We launched a $40,000 firework and it was insa...",,2020-09-11T19:14:35Z,11567603,675796,,12118,PT24S,hd,false
111,h-EL3eeSZeU,MrBeast 2,Unlimited Money Machine,#Shorts,,2020-08-24T23:31:38Z,62731027,2863394,,29403,PT22S,hd,false
112,RBB9GSMkLHc,MrBeast 2,How To Never Lose Rock Paper Scissors Again!,#Shorts,,2020-08-23T22:07:27Z,43022260,1933883,,323125,PT14S,hd,false
113,xdmuHe_ZDIw,MrBeast 2,World's Longest Whopper Train! LOL,We did the funny... subscribe for more videos!...,,2020-08-21T21:48:56Z,72401695,1141783,,8835,PT15S,hd,false


In [10]:
comments_df

Unnamed: 0,video_id,comments
0,RKT7xqPDy74,"[mala decisi√≥n, ü§£ü§£no lo se ...POR DIVERSIONü§£ü§£ü§£..."
1,SHOV58tdNiI,"[Pora, ü§£ü§£ü§£ü§£ü§£ü§£üëçüëëüåª, Porra üòî, üéâüéâüéâüéâüéâüòÇoi, ü§£üòÇü§£üòÉüòÇü§£üòÇü§£üòÇ..."
2,SGzWIoLAzmY,"[""mas vc e minha fam√≠lia üò¢"" q dooooooooo aaaaa..."
3,sHX6pzhdCP0,"[Check my map, skins, and emotes out :^)\nhttp..."
4,ykRq_rp7NhM,"[Imaging feature, –û—Ä–∞ –ú—Ä –±–∏—Å—Ç –∫—É–¥–∞—Å–∞–π –∞—Ö–∞—Ö–∞—Ö–∞..."
...,...,...
109,wdznN3-h_5Y,"[Subscribe for more shorts!, Wow ‚ù§‚ù§, Free Free..."
110,h-EL3eeSZeU,"[‚ù§‚ù§‚ù§‚ù§‚ù§‚ù§‚ù§‚ù§‚ù§‚ù§‚ù§‚ù§, Chandler es como yo xd Con comi..."
111,RBB9GSMkLHc,"[ez win now subscribe, TieüòÖ, Draw, Tie, Tieeee..."
112,xdmuHe_ZDIw,"[‚ù§‚ù§, ü•≥üëå, ‚úåÔ∏èüëåüëåüò±üò±ü§ó, –ß–õ–ï–ù –ü–ò–°–Ø –ö–ê–ö–ê, Whoppers are..."


In [11]:
# Write video data to CSV file for future references
video_df.to_csv('video_data.csv')
comments_df.to_csv('comments_data.csv')

## Preprocessing & Feature engineering

To be able to make use of the data for analysis, we need to perform a few pre-processing steps. Firstly, I would like reformat some columns, especially the date and time columns such as "pushlishedAt" and "duration". In addition, I also think it is necessary to enrich the data with some new features that might be useful for understanding the videos' characteristics.

### Check for empty values

In [12]:
video_df.isnull().any()

video_id          False
channelTitle      False
title             False
description       False
tags               True
publishedAt       False
viewCount         False
likeCount         False
favouriteCount     True
commentCount      False
duration          False
definition        False
caption           False
dtype: bool

In [13]:
video_df.publishedAt.sort_values().value_counts()

2020-07-30T20:00:28Z    1
2022-08-23T20:00:00Z    1
2022-08-13T15:00:02Z    1
2022-07-28T21:23:28Z    1
2022-07-12T18:37:45Z    1
                       ..
2021-05-02T19:00:02Z    1
2021-04-25T20:00:18Z    1
2021-04-20T20:00:02Z    1
2021-04-18T20:01:07Z    1
2023-10-22T17:00:00Z    1
Name: publishedAt, Length: 115, dtype: int64

Next, we need to check if the data type of the columns are correct. I have checked the data types and indeed some count columns such as view count and comment count are currently not in correct data type. In this step, we convert these count columns into integer.

In [14]:
cols = ['viewCount', 'likeCount', 'commentCount']
video_df[cols] = video_df[cols].apply(pd.to_numeric, errors='coerce', axis=1)

### Enriching data

I want to enrich the data for further analyses, for example:

- create published date column with another column showing the day in the week the video was published, which will be useful for later analysis.

- convert video duration to seconds instead of the current default string format

- calculate number of tags for each video

- calculate comments and likes per 1000 view ratio

- calculate title character length

In [15]:
# Create publish day (in the week) column
video_df['publishedAt'] =  video_df['publishedAt'].apply(lambda x: parser.parse(x)) 
video_df['pushblishDayName'] = video_df['publishedAt'].apply(lambda x: x.strftime("%A")) 

In [16]:
# convert duration to seconds
video_df['durationSecs'] = video_df['duration'].apply(lambda x: isodate.parse_duration(x))
video_df['durationSecs'] = video_df['durationSecs'].astype('timedelta64[s]')

In [17]:
# Add number of tags
video_df['tagsCount'] = video_df['tags'].apply(lambda x: 0 if x is None else len(x))

In [18]:
# Comments and likes per 1000 view ratio
video_df['likeRatio'] = video_df['likeCount']/ video_df['viewCount'] * 1000
video_df['commentRatio'] = video_df['commentCount']/ video_df['viewCount'] * 1000

In [19]:
# Title character length
video_df['titleLength'] = video_df['title'].apply(lambda x: len(x))

Let's look at the video dataset at this point to see if everything went well. It looks good - now we will proceed to exploratory analysis part.

In [20]:
df = video_df.drop('favouriteCount', axis=1)
df.head()

Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,commentCount,duration,definition,caption,pushblishDayName,durationSecs,tagsCount,likeRatio,commentRatio,titleLength
0,RKT7xqPDy74,MrBeast 2,"I Asked Mark Cuban For $1,000,000",,,2023-10-22 17:00:00+00:00,13735868,990258,3927,PT19S,hd,False,Sunday,19.0,0,72.092859,0.285894,33
1,SHOV58tdNiI,MrBeast 2,Miranda Cosgrove Said What?,,,2023-10-15 17:00:00+00:00,57338629,3562307,21805,PT15S,hd,False,Sunday,15.0,0,62.127523,0.380285,27
2,SGzWIoLAzmY,MrBeast 2,"Feeding A Cat $10 Vs $10,000 Sushi",,,2023-08-31 17:00:00+00:00,145417200,7194440,16227,PT31S,hd,False,Thursday,31.0,0,49.474478,0.111589,34
3,sHX6pzhdCP0,MrBeast 2,"$30,000 Relay Race",Play Stumble Guys Now!:\n\nhttps://stumbleguys...,,2023-08-13 17:20:26+00:00,6501124,471223,2527,PT42S,hd,False,Sunday,42.0,0,72.483312,0.388702,18
4,ykRq_rp7NhM,MrBeast 2,I Got Naruto to Subscribe to Me,,,2023-05-20 20:00:00+00:00,127882331,6737523,15606,PT20S,hd,True,Saturday,20.0,0,52.685331,0.122034,31


In [21]:
#updated csv
# Write video data to CSV file for future references
video_df.to_csv('video_data_UP.csv')