In [1]:
import pandas as pd
import numpy as np
from dateutil import parser
import isodate
import time

# Data visualization libraries
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
sns.set(style="darkgrid", color_codes=True)

# Google API
from googleapiclient.discovery import build

In [2]:
# NLP libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/daniel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/daniel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
api_key = 'AIzaSyAhAjl9v2pxr5M6i-HIIgXhM-sQmtj93E8' 

channel_ids = ['UC295-Dw_tDNtZXFeAPAW6Aw',]
#MR_BEAST=UCX6OQ3DkcsbYNE6H8uQQuVA
#PEWDIEPIE=UC-lHJZR3Gqxm24_Vd_AJ5Yw
#C0C0MELON=UCbCmjCuTUZos6Inko4u57UQ
# DaFuq!?Boom! = UCsSsgPaZ2GSmO6il8Cb5iGA
# mrbeast2 = UC4-79UOlP48-QNGgCko5p2g
# 5 minute crafts = UC295-Dw_tDNtZXFeAPAW6Aw

youtube = build('youtube', 'v3', developerKey=api_key)

In [4]:
def get_channel_stats(youtube, channel_ids):
    """
    Get channel statistics: title, subscriber count, view count, video count, upload playlist
    Params:
    
    youtube: the build object from googleapiclient.discovery
    channels_ids: list of channel IDs
    
    Returns:
    Dataframe containing the channel statistics for all channels in the provided list: title, subscriber count, view count, video count, upload playlist
    
    """
    all_data = []
    request = youtube.channels().list(
                part='snippet,contentDetails,statistics',
                id=','.join(channel_ids))
    response = request.execute() 
    
    for i in range(len(response['items'])):
        data = dict(channelName = response['items'][i]['snippet']['title'],
                    subscribers = response['items'][i]['statistics']['subscriberCount'],
                    views = response['items'][i]['statistics']['viewCount'],
                    totalVideos = response['items'][i]['statistics']['videoCount'],
                    playlistId = response['items'][i]['contentDetails']['relatedPlaylists']['uploads'])
        all_data.append(data)
    
    return pd.DataFrame(all_data)

def get_video_ids(youtube, playlist_id):
    """
    Get list of video IDs of all videos in the given playlist
    Params:
    
    youtube: the build object from googleapiclient.discovery
    playlist_id: playlist ID of the channel
    
    Returns:
    List of video IDs of all videos in the playlist
    
    """
    
    request = youtube.playlistItems().list(
                part='contentDetails',
                playlistId = playlist_id,
                maxResults = 50)
    response = request.execute()
    
    video_ids = []
    
    for i in range(len(response['items'])):
        video_ids.append(response['items'][i]['contentDetails']['videoId'])
        
    next_page_token = response.get('nextPageToken')
    more_pages = True
    
    while more_pages:
        if next_page_token is None:
            more_pages = False
        else:
            request = youtube.playlistItems().list(
                        part='contentDetails',
                        playlistId = playlist_id,
                        maxResults = 50,
                        pageToken = next_page_token)
            response = request.execute()
    
            for i in range(len(response['items'])):
                video_ids.append(response['items'][i]['contentDetails']['videoId'])
            
            next_page_token = response.get('nextPageToken')
        
    return video_ids

def get_video_details(youtube, video_ids):
    """
    Get video statistics of all videos with given IDs
    Params:
    
    youtube: the build object from googleapiclient.discovery
    video_ids: list of video IDs
    
    Returns:
    Dataframe with statistics of videos, i.e.:
        'channelTitle', 'title', 'description', 'tags', 'publishedAt'
        'viewCount', 'likeCount', 'favoriteCount', 'commentCount'
        'duration', 'definition', 'caption'
    """
        
    all_video_info = []
    
    for i in range(0, len(video_ids), 50):
        request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=','.join(video_ids[i:i+50])
        )
        response = request.execute() 

        for video in response['items']:
            stats_to_keep = {'snippet': ['channelTitle', 'title', 'description', 'tags', 'publishedAt'],
                             'statistics': ['viewCount', 'likeCount', 'favouriteCount', 'commentCount'],
                             'contentDetails': ['duration', 'definition', 'caption']
                            }
            video_info = {}
            video_info['video_id'] = video['id']

            for k in stats_to_keep.keys():
                for v in stats_to_keep[k]:
                    try:
                        video_info[v] = video[k][v]
                    except:
                        video_info[v] = None

            all_video_info.append(video_info)
            
    return pd.DataFrame(all_video_info)

def get_comments_in_videos(youtube, video_ids):
    """
    Get top level comments as text from all videos with given IDs (only the first 10 comments due to quote limit of Youtube API)
    Params:
    
    youtube: the build object from googleapiclient.discovery
    video_ids: list of video IDs
    
    Returns:
    Dataframe with video IDs and associated top level comment in text.
    
    """
    all_comments = []
    
    for video_id in video_ids:
        try:   
            request = youtube.commentThreads().list(
                part="snippet,replies",
                videoId=video_id
            )
            response = request.execute()
        
            comments_in_video = [comment['snippet']['topLevelComment']['snippet']['textOriginal'] for comment in response['items'][0:10]]
            comments_in_video_info = {'video_id': video_id, 'comments': comments_in_video}

            all_comments.append(comments_in_video_info)
            
        except: 
            # When error occurs - most likely because comments are disabled on a video
            print('Could not get comments for video ' + video_id)
        
    return pd.DataFrame(all_comments)     

### Get channel statistics

Using the `get_channel_stats` function defined below, now we are going to obtain the channel statistics for the 9 channels in scope.

In [5]:
channel_data = get_channel_stats(youtube, channel_ids)

Now I can print out the data and take a look at the channel statistics overview.

In [6]:
channel_data

Unnamed: 0,channelName,subscribers,views,totalVideos,playlistId
0,5-Minute Crafts,80400000,26724491109,6688,UU295-Dw_tDNtZXFeAPAW6Aw


I noticed the count columns in `channel_data` is currently in string format, so I will convert them into numeric so that we can visualize and do numeric operations on them.

In [7]:
# Convert count columns to numeric columns
numeric_cols = ['subscribers', 'views', 'totalVideos']
channel_data[numeric_cols] = channel_data[numeric_cols].apply(pd.to_numeric, errors='coerce')

Let's take a look at the number of subscribers per channel to have a view of how popular the channels are when compared with one another.

### Get video statistics for all the channels

In the next step, we will obtain the video statistics for all the channels.

In [8]:
# Create a dataframe with video statistics and comments from all channels
Total = channel_data['totalVideos'].sum()

video_df = pd.DataFrame()
comments_df = pd.DataFrame()
tic = time.perf_counter()
for c in channel_data['channelName'].unique():
    print("Getting video information from channel: " + c)
    playlist_id = channel_data.loc[channel_data['channelName']== c, 'playlistId'].iloc[0]
    video_ids = get_video_ids(youtube, playlist_id)
    
    # get video data
    video_data = get_video_details(youtube, video_ids)
    # get comment data
    comments_data = get_comments_in_videos(youtube, video_ids)

    # append video data together and comment data toghether
    video_df = video_df._append(video_data, ignore_index=True)
    comments_df = comments_df._append(comments_data, ignore_index=True)
toc = time.perf_counter()
print(f"Obtained statistics in {toc - tic:0.4f} seconds for {Total} videos")

Getting video information from channel: 5-Minute Crafts
Could not get comments for video y838QNcu7EI
Could not get comments for video lIdBkW_Aoc8
Could not get comments for video l-E4vVnST2E
Could not get comments for video Nf_XgSMTzxk
Could not get comments for video wpNGbGVxaNQ
Could not get comments for video q1CVwHFYXPY
Could not get comments for video KFrKFwOlU24
Could not get comments for video HyZew0syq-4
Could not get comments for video A0kea4RT6AY
Could not get comments for video S9sV83Yr3N4
Could not get comments for video -DpqX0it7Vc
Could not get comments for video QTlcC3kJjLA
Could not get comments for video IzW0B6qQKWI
Could not get comments for video SAT0_6hF0E8
Could not get comments for video RaRoELK_yX8
Could not get comments for video hZSH3cGFSgM
Could not get comments for video -ykQjuwofUU
Could not get comments for video xzygCwBPrUM
Could not get comments for video 8EBfCXCzaog
Could not get comments for video ORFFrm4ifLY
Could not get comments for video YzcpGFhR_4U

Could not get comments for video LdNYUN8ErU4
Could not get comments for video 2unXc2i_m6g
Could not get comments for video IHf438Zsiyo
Could not get comments for video niJMY28BHNw
Could not get comments for video UBSWRZhkqYw
Could not get comments for video cVCcS1qL_7U
Could not get comments for video 9Z8K5G-ugIw
Could not get comments for video c4Wpo-DJ52M
Could not get comments for video Dd2V9mfvJ38
Could not get comments for video qxpZew_i9UM
Could not get comments for video Rumaxw23G2I
Could not get comments for video gy0WM6T5rgc
Could not get comments for video dTgLkl2asZQ
Could not get comments for video PnYTzjejhhs
Could not get comments for video nd11xjWon5Y
Could not get comments for video buxZquyWizw
Could not get comments for video N8d_CTR-t50
Could not get comments for video hC4zjHrW8gk
Could not get comments for video HgGwcheB1kw
Could not get comments for video ZenlQ_B7uOQ
Could not get comments for video Od99jv-evGQ
Could not get comments for video -gPWT3AAaxc
Could not 

Could not get comments for video Ohd7jGr26xk
Could not get comments for video KobgZiEQScw
Could not get comments for video 3nf1qBln2gk
Could not get comments for video D_IUYLh4viY
Could not get comments for video Z0WyI2obU9Y
Could not get comments for video znManE9_Xk4
Could not get comments for video x_wi_XTdmcE
Obtained statistics in 1757.9248 seconds for 6688 videos


In [9]:
video_df

Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption
0,_lpfFbYnbOM,5-Minute Crafts,Illuminate Your World with Rock Art! 🌟✨ #shorts,Get Your Rock Painting Kit Now at Walmart! ➡️ ...,"[5 minute craft, 5 minutes craft, 5-minute cra...",2023-11-17T17:00:35Z,11513,638,,4,PT31S,hd,false
1,NAW3bdt3xh8,5-Minute Crafts,Easy School Crafts for Creative Students 📚✂️,Hi friends! 🧩🍭 Welcome to our creative corner!...,"[5 minute craft, 5 minutes craft, 5-minute cra...",2023-11-17T13:00:19Z,25981,309,,25,PT1H49S,hd,false
2,fgUr6kfnv9c,5-Minute Crafts,DIY Jewelry Ideas ✨💍Unique Accessories You Can...,Hi everyone! 🤩🤗 Welcome to a world of creativi...,"[5 minute craft, 5 minutes craft, 5-minute cra...",2023-11-17T11:00:40Z,15873,212,,13,PT8M59S,hd,false
3,B_x5rqgxqmc,5-Minute Crafts,🌟✂️ 5-Minute Crafts Historic Event! ✂️🌟 #short...,🌟✂️ 5-Minute Crafts Historic Event! ✂️🌟\r\n🛒 N...,"[5 minute craft, 5 minutes craft, 5-minute cra...",2023-11-16T15:00:09Z,70549,1511,,7,PT15S,hd,false
4,tyLhe5QnebM,5-Minute Crafts,Delicious Snack and Treat Ideas That Are Quick...,Hi everyone! 💗🍭 Join us in this mouthwatering ...,"[5 minute craft, 5 minutes craft, 5-minute cra...",2023-11-16T13:00:48Z,48395,437,,30,PT1H2S,hd,false
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6683,RA9GjoRhcSE,5-Minute Crafts,Amazing life hack; how to recycle old crayons ...,Amazing life hack; how to recycle old crayons ...,"[how to, cryons, crayons, short video, turoria...",2016-11-17T14:52:05Z,89421,1078,,57,PT55S,hd,false
6684,KtetRv4kfog,5-Minute Crafts,How to DIY a magnetic memo holder l 5-MINUTE C...,How to DIY a magnetic memo holder l 5-MINUTE C...,"[fridge, home decor, how to, tutorial, homedec...",2016-11-17T11:39:53Z,75419,1535,,94,PT52S,hd,false
6685,AEp338eSyDM,5-Minute Crafts,How to turn your old T-shirt into a pillowcase...,How to turn a T-shirt into a pillowcase (NO SE...,"[tshirt, lifehack, life hack, DIY, home decor,...",2016-11-17T11:05:23Z,157822,2549,,167,PT1M2S,hd,false
6686,JmFQavE0sPM,5-Minute Crafts,3 tricks to clean without harsh chemicals l 5-...,3 tricks to clean without harsh chemicals l 5-...,"[5minutecrafts, crafts, chemicals, lifehack, l...",2016-11-16T21:20:07Z,105074,2113,,216,PT1M21S,hd,false


In [10]:
comments_df

Unnamed: 0,video_id,comments
0,_lpfFbYnbOM,"[Finally saying something, I dont know, Nice, ..."
1,NAW3bdt3xh8,[You could easily make all of these crafts fro...
2,fgUr6kfnv9c,"[Hello friends, I’m gaymer, but I don’t suppor..."
3,B_x5rqgxqmc,[🌟✂️ 5-Minute Crafts Historic Event! ✂️🌟\r\n🛒 ...
4,tyLhe5QnebM,[i love this video so much ❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤...
...,...,...
6312,RA9GjoRhcSE,"[Back When They Were Good. 😔😔😔, you can make a..."
6313,KtetRv4kfog,"[Marathon Watching 7/18/22 Cool Craft!, Nice.,..."
6314,AEp338eSyDM,"[Back when 5 min crafts were useful :'D, This ..."
6315,JmFQavE0sPM,"[Who's watching this video in 2023, When 5 min..."


In [11]:
# Write video data to CSV file for future references
video_df.to_csv('video_data.csv')
comments_df.to_csv('comments_data.csv')

## Preprocessing & Feature engineering

To be able to make use of the data for analysis, we need to perform a few pre-processing steps. Firstly, I would like reformat some columns, especially the date and time columns such as "pushlishedAt" and "duration". In addition, I also think it is necessary to enrich the data with some new features that might be useful for understanding the videos' characteristics.

### Check for empty values

In [12]:
video_df.isnull().any()

video_id          False
channelTitle      False
title             False
description       False
tags               True
publishedAt       False
viewCount         False
likeCount         False
favouriteCount     True
commentCount       True
duration          False
definition        False
caption           False
dtype: bool

In [13]:
video_df.publishedAt.sort_values().value_counts()

2017-11-24T08:30:00Z    2
2019-08-17T12:30:00Z    2
2016-11-16T20:31:25Z    1
2020-08-01T00:30:03Z    1
2020-08-03T12:30:00Z    1
                       ..
2018-10-21T12:30:00Z    1
2018-10-21T07:30:00Z    1
2018-10-20T23:30:01Z    1
2018-10-20T18:30:01Z    1
2023-11-17T17:00:35Z    1
Name: publishedAt, Length: 6686, dtype: int64

Next, we need to check if the data type of the columns are correct. I have checked the data types and indeed some count columns such as view count and comment count are currently not in correct data type. In this step, we convert these count columns into integer.

In [14]:
cols = ['viewCount', 'likeCount', 'commentCount']
video_df[cols] = video_df[cols].apply(pd.to_numeric, errors='coerce', axis=1)

### Enriching data

I want to enrich the data for further analyses, for example:

- create published date column with another column showing the day in the week the video was published, which will be useful for later analysis.

- convert video duration to seconds instead of the current default string format

- calculate number of tags for each video

- calculate comments and likes per 1000 view ratio

- calculate title character length

In [15]:
# Create publish day (in the week) column
video_df['publishedAt'] =  video_df['publishedAt'].apply(lambda x: parser.parse(x)) 
video_df['pushblishDayName'] = video_df['publishedAt'].apply(lambda x: x.strftime("%A")) 

In [16]:
# convert duration to seconds
video_df['durationSecs'] = video_df['duration'].apply(lambda x: isodate.parse_duration(x))
video_df['durationSecs'] = video_df['durationSecs'].astype('timedelta64[s]')

In [17]:
# Add number of tags
video_df['tagsCount'] = video_df['tags'].apply(lambda x: 0 if x is None else len(x))

In [18]:
# Comments and likes per 1000 view ratio
video_df['likeRatio'] = video_df['likeCount']/ video_df['viewCount'] * 1000
video_df['commentRatio'] = video_df['commentCount']/ video_df['viewCount'] * 1000

In [19]:
# Title character length
video_df['titleLength'] = video_df['title'].apply(lambda x: len(x))

Let's look at the video dataset at this point to see if everything went well. It looks good - now we will proceed to exploratory analysis part.

In [20]:
df = video_df.drop('favouriteCount', axis=1)
df.head()

Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,commentCount,duration,definition,caption,pushblishDayName,durationSecs,tagsCount,likeRatio,commentRatio,titleLength
0,_lpfFbYnbOM,5-Minute Crafts,Illuminate Your World with Rock Art! 🌟✨ #shorts,Get Your Rock Painting Kit Now at Walmart! ➡️ ...,"[5 minute craft, 5 minutes craft, 5-minute cra...",2023-11-17 17:00:35+00:00,11513.0,638.0,4.0,PT31S,hd,False,Friday,31.0,20,55.415617,0.347433,47
1,NAW3bdt3xh8,5-Minute Crafts,Easy School Crafts for Creative Students 📚✂️,Hi friends! 🧩🍭 Welcome to our creative corner!...,"[5 minute craft, 5 minutes craft, 5-minute cra...",2023-11-17 13:00:19+00:00,25981.0,309.0,25.0,PT1H49S,hd,False,Friday,3649.0,32,11.893307,0.962242,44
2,fgUr6kfnv9c,5-Minute Crafts,DIY Jewelry Ideas ✨💍Unique Accessories You Can...,Hi everyone! 🤩🤗 Welcome to a world of creativi...,"[5 minute craft, 5 minutes craft, 5-minute cra...",2023-11-17 11:00:40+00:00,15873.0,212.0,13.0,PT8M59S,hd,False,Friday,539.0,32,13.356013,0.819001,59
3,B_x5rqgxqmc,5-Minute Crafts,🌟✂️ 5-Minute Crafts Historic Event! ✂️🌟 #short...,🌟✂️ 5-Minute Crafts Historic Event! ✂️🌟\r\n🛒 N...,"[5 minute craft, 5 minutes craft, 5-minute cra...",2023-11-16 15:00:09+00:00,70549.0,1511.0,7.0,PT15S,hd,False,Thursday,15.0,20,21.417738,0.099222,80
4,tyLhe5QnebM,5-Minute Crafts,Delicious Snack and Treat Ideas That Are Quick...,Hi everyone! 💗🍭 Join us in this mouthwatering ...,"[5 minute craft, 5 minutes craft, 5-minute cra...",2023-11-16 13:00:48+00:00,48395.0,437.0,30.0,PT1H2S,hd,False,Thursday,3602.0,35,9.029858,0.619899,66


In [21]:
#updated csv
# Write video data to CSV file for future references
video_df.to_csv('video_data_UP.csv')