In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the read-only input directory,
# then print one path per line (same order as os.walk yields them).
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import plotly.express as px # Plotly
import plotly.graph_objects as go

import datetime
import json 
import pandas as pd 
from pandas.io.json import json_normalize #package for flattening json in pandas df

In [3]:
# Load Data
# Read the Canadian (CA) trending-video CSV into `df`; the Canada-specific
# cells below all work from this frame.
df = pd.read_csv('/kaggle/input/youtube-trending-video-dataset/CA_youtube_trending_data.csv')

In [4]:
# Number of rows and columns, displayed as a (rows, columns) tuple
df.shape

In [5]:
# Data types of each column and non-missing rows
# (df.info() prints its summary directly rather than returning a value)
df.info()

In [6]:
# Parse trending_date to datetime
# The parsed values are stored in a new 'dt_trending' column; the original
# 'trending_date' column is left untouched.
df['dt_trending'] = pd.to_datetime(df['trending_date'])
df['dt_trending']

In [7]:
# Parse publishedAt to datetime
# The parsed values are stored in a new 'dt_publish' column; the original
# 'publishedAt' column is left untouched.
df['dt_publish'] = pd.to_datetime(df['publishedAt'])
df['dt_publish']

In [8]:
# Add category titles from Json to DF
# NOTE(review): the US category file is used for the Canadian data; presumably
# the category ids are shared across regions - confirm before swapping files.
cats = pd.read_json('/kaggle/input/youtube-trending-video-dataset/US_category_id.json')
cat_map = pd.json_normalize(cats['items'])[['id', 'snippet.title']]

# Category id -> category title. Built in one step instead of through the side
# effect of a row-wise apply().
py_map = dict(zip(cat_map['id'], cat_map['snippet.title']))

# Category Mapping
print(py_map)

# Vectorised lookup instead of a Python-level apply() over every row.
# The ids are converted to str because py_map is keyed by the JSON 'id' values
# (the lookup has always been done with str(categoryId)).
category_keys = df['categoryId'].astype(str)
unknown_ids = sorted(set(category_keys) - set(py_map))
if unknown_ids:
    # Stay fail-fast: a category id without a title is an error, not a NaN.
    raise KeyError('categoryId values missing from category file: %s' % unknown_ids)

df['cat_titles'] = category_keys.map(py_map)
df['cat_titles']

In [9]:
# Export DF for Tableau Visualizations
# Written with a relative path and to_csv's default settings, so the DataFrame
# index is included as the first (unnamed) column of the file.
df.to_csv('canada_trending_yt.csv')

In [10]:
# Browse through data (first five rows)
df.head()

In [11]:
# Weekdays
# `new_df` is an alias of `df` (not a copy), so the two helper columns below
# are added to `df` as well.
new_df = df
new_df['weekday'] = df['dt_publish'].dt.weekday  # 0 = Monday ... 6 = Sunday
new_df['hour'] = df['dt_publish'].dt.hour        # hour of the day, 0-23

# Take an explicit copy: columns are added to heatmap_df later, and writing
# into a plain column slice raises pandas' SettingWithCopyWarning.
heatmap_df = new_df[['weekday', 'hour']].copy()

heatmap_df

In [12]:
# Determine Time of Day
def categ_hours(row):
    """Label a row's publish hour as 'Morning', 'Evening' or 'Night'.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with an 'hour' entry.

    Returns:
        'Morning' when 4 <= hour < 12, 'Evening' when 12 <= hour <= 20,
        and 'Night' for every other value.
    """
    hour = row['hour']
    if 4 <= hour < 12:
        return 'Morning'
    if 12 <= hour <= 20:
        return 'Evening'
    return 'Night'
    
# Label every row with its time-of-day bucket.
# assign() returns a new frame instead of writing into what may be a slice of
# another DataFrame, which avoids pandas' SettingWithCopyWarning. categ_hours
# is passed directly; the wrapping lambda added nothing.
heatmap_df = heatmap_df.assign(time_of_day=heatmap_df.apply(categ_hours, axis=1))
heatmap_df['time_of_day']

In [13]:
# Transform heatmap_df to numpy data
# Count rows per (weekday, time_of_day) pair.
series = heatmap_df.groupby(['weekday', 'time_of_day']).size()

# Pivot the counts into a 3 x 7 float matrix: one row per time-of-day bucket
# (Morning, Evening, Night) and one column per weekday (0 = Monday ... 6).
# reindex() fixes the row/column order and fills any combination that never
# occurs with 0, where indexing series[j][i] directly would raise a KeyError.
time_of_day_rows = ['Morning', 'Evening', 'Night']
weekday_columns = list(range(7))
data = (
    series.unstack('weekday', fill_value=0)
    .reindex(index=time_of_day_rows, columns=weekday_columns, fill_value=0)
    .to_numpy(dtype=float)
)

data

In [32]:
# Plot the heatmap
# Tick labels: columns of `data` are weekdays, rows are time-of-day buckets.
weekday_labels = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
time_of_day_labels = ['Morning', 'Evening', 'Night']
# NOTE(review): the colour values are row counts per weekday / time of day, so
# "Productivity" may be a misleading colour-bar label - consider renaming it.
axis_titles = {"x": "Day of Week", "y": "Time of Day", "color": "Productivity"}

fig = px.imshow(
    data,
    labels=axis_titles,
    x=weekday_labels,
    y=time_of_day_labels,
    title="Publish Date of Trending Videos in Canada",
)
# Put the weekday labels above the heatmap instead of below it.
fig.update_xaxes(side="top")
fig.show()

### Publish Date Numerical Distribution

The heatmap shows that most videos were published in the Evening timeframe (noon through the 8 pm hour), with the largest counts on Tuesday and Sunday evenings. Within the morning hours, most videos were published on Friday and the fewest on Thursday or Saturday morning. For the 'graveyard' hours, most trending videos were published Monday night. One could infer that publishing a video on a Tuesday evening or a Sunday evening gives a creator the best chance of getting a trending video.

In [15]:
# Distribution of View Count
# Horizontal box plot of view_count; the x axis is logarithmic (log_x=True).

fig = px.box(df, x="view_count", title="View Counts of Trending Videos in Canada", log_x=True)
fig.show()

### Distribution of View Counts of Trending Videos

The box plot shows that trending videos normally have between 500k and 2.3M views. Videos with over 5M views tend to be an exception to the rule. We can posit that creators need their video to gain at least 500k views before it appears on the Trending Videos page.

In [33]:
# Like, Dislike, Comment Box Plots
engagement_columns = ["likes", "dislikes", "comment_count"]

# One box trace per engagement metric, each named after its column.
fig = go.Figure()
for col in engagement_columns:
    fig.add_trace(go.Box(y=df[col].values, name=col))

# The three metrics are plotted on a shared logarithmic y axis.
fig.update_yaxes(type="log")

title_font = {
    "family": "Courier New, monospace",
    "size": 18,
    "color": "RebeccaPurple",
}
fig.update_layout(
    title="Comparing User Engagement Across Trending Videos",
    font=title_font,
)
fig.show()

### User Engagement on Trending Videos

The numerical distributions show that trending videos gain more likes than comments or dislikes. The scale of each engagement method is very different. The median number of likes is 52K, while the median number of dislikes is 0.8K. Comments sit slightly above dislikes, with a median of 3.3K. These medians suggest that creators need high like and comment counts with low dislikes to become trending.

In [34]:
# Correlation Matrix of Numerical Columns
# px.scatter_matrix draws a pairwise scatter plot for every combination of the
# four columns; no correlation coefficients are computed here.
fig = px.scatter_matrix(df[["view_count", "likes", "dislikes", "comment_count"]])
fig.show()

### Correlation Matrix of Numerical Columns

The data generally shows a positive correlation. Some of the stronger correlations seem to exist between comment_count and likes. The view_count vs likes graph in the top row seems to suggest a lower bound on the number of views as the number of likes increases. Also in the top row, the view_count vs comment_count graph shows two characteristics. First, there seems to be one set of videos that gains high views with little to no comments, as can be seen in the cluster of points along the y-axis. Second, there is a positive correlation between comments and views: the number of comments grows as the number of views increases.

In [18]:
# Load US + Great Britain (English speaking countries)
# Same dataset folder as the Canadian file; one CSV per country.
us_df = pd.read_csv('/kaggle/input/youtube-trending-video-dataset/US_youtube_trending_data.csv')
gb_df = pd.read_csv('/kaggle/input/youtube-trending-video-dataset/GB_youtube_trending_data.csv')

In [19]:
# Convert trending dates to datetime
# Both frames get the same 'dt_trending' column, parsed from 'trending_date'.
for country_df in (us_df, gb_df):
    country_df['dt_trending'] = pd.to_datetime(country_df['trending_date'])

In [20]:
# Remove duplicate videos that appear twice in one day
def max_views_per_day(frame):
    """Collapse a trending frame to one row per (trending day, video_id).

    Args:
        frame: DataFrame with a datetime 'dt_trending' column plus 'video_id'
            and 'view_count' columns.

    Returns:
        DataFrame with columns 'dt_trending' (calendar date), 'video_id' and
        'view_count', where view_count is the maximum recorded for that video
        on that day.
    """
    trending_day = frame['dt_trending'].dt.date
    return (
        frame.groupby([trending_day, 'video_id'])
        .agg({'view_count': 'max'})
        .reset_index()
    )


# One helper call per country replaces three copy-pasted groupby chains.
df_days1 = max_views_per_day(df)     # Canada
df_days2 = max_views_per_day(us_df)  # United States
df_days3 = max_views_per_day(gb_df)  # Great Britain
df_days1

In [21]:
# Total view count per trending day, per country (input for the line graph)
tmp1, tmp2, tmp3 = (
    daily_views.groupby('dt_trending').agg({'view_count': 'sum'})
    for daily_views in (df_days1, df_days2, df_days3)
)

# Stack the three per-country tables; the concat keys become a 'country' index
# level, which reset_index() then turns into a regular column.
comb_df = pd.concat(
    [tmp1, tmp2, tmp3],
    keys=["Canada", "United States", "Great Britain"],
    names=['country', 'dt_trending'],
).reset_index()
comb_df

In [22]:
# One line per country: summed view_count (y) against trending date (x).
fig = px.line(comb_df, x="dt_trending", y="view_count", color='country', title="Trending Videos View Count by Country")
fig.show()

### Trending Videos View Count for English-speaking Countries

The US seems to dominate views on trending videos until sometime in March, when Canada starts to become the dominant source. Great Britain, for the most part, sees fewer views than either country. Also, Great Britain seems more closely synced with Canadian viewers than with US viewers, as can be seen in the Sep. 16th spike and the July 14th / July 15th spikes. US viewers seem to have different tastes than Canadian and British viewers. Canada also has the largest viewership spike on any given day, at 1.19B views on July 4th.

In [23]:
# Top 5 Videos July 4th in Canada
july_4th = datetime.date(2021, 7, 4)
trended_on_july_4th = df['dt_trending'].dt.date == july_4th

# Highest view counts first; after sorting, drop_duplicates keeps the
# top-ranked row for each video_id, and head(5) takes the five best.
top5_df = (
    df[trended_on_july_4th]
    .sort_values('view_count', ascending=False)
    .drop_duplicates(subset='video_id')
    .head(5)
)
top5_df

In [24]:
# The top trending video on July 4th in Canada, the day with the most views out of the three countries
from IPython.display import YouTubeVideo

# top5_df is sorted by view_count in descending order, so position 0 holds the
# most-viewed video; its video_id is handed to YouTubeVideo for display.
YouTubeVideo(top5_df['video_id'].iloc[0]) # so dumb :(

All I can say is that this video somehow broke the Canadian YT viewership. It's kinda funny but mostly dumb. Humor cannot be explained :(

In [25]:
# Scatterplot PCA'd of view count, likes, dislikes, comment_count; cat_titles as color
from sklearn.preprocessing import StandardScaler

features = ['view_count', 'likes', 'dislikes', 'comment_count']

# Feature matrix (one column per engagement metric) and the category labels.
x = df[features].to_numpy()
y = df[['cat_titles']].to_numpy()

# Standardise every feature column before PCA.
scaler = StandardScaler()
x = scaler.fit_transform(x)
x

In [26]:
from sklearn.decomposition import PCA
# Project the standardised 4-column feature matrix `x` onto 2 components.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
# Wrap the projected array in a DataFrame with one named column per component.
principalDf = pd.DataFrame(data = principalComponents
             , columns = ['principal component 1', 'principal component 2'])
principalDf

In [27]:
# Attach the category title to each projected point. concat(axis=1) aligns the
# two frames on their index, so this assumes `df` still has the same default
# 0..n-1 index as principalDf - TODO confirm if df is ever filtered or re-indexed.
finalDf = pd.concat([principalDf, df[['cat_titles']]], axis = 1)
finalDf

In [28]:
# Scatter of the two principal components, coloured by category title.
fig = px.scatter(finalDf, x="principal component 1", y="principal component 2", color="cat_titles", title="Video Categories by View Count, Likes, Dislikes, and Comment Count")
fig.show()

### PCA of Categories from User Engagement

The PCA attempts to show what a 4D graph of user engagement points would look like in 2D. If we were to try to predict the category of a given trending video, we could probably do it for some of the Music category, but most of the other categories overlap. This shows that there isn't enough information to "accurately" assign a given trending video to a category based on user engagement alone.

In [29]:
# Choose a subset of categories to remove
cat_to_remove = ["Music"]

# Boolean mask of the rows whose category is in the removal list; keep the rest.
is_removed = finalDf['cat_titles'].isin(cat_to_remove)
selectedDf = finalDf[~is_removed]
selectedDf

In [30]:
# Same scatter as before, drawn from the frame with the Music rows removed.
fig = px.scatter(selectedDf, x="principal component 1", y="principal component 2", color="cat_titles", title='Video Categories (excluding Music) by View Count, Likes, Dislikes, and Comment Count')
fig.show()

### PCA of User Engagement w/o the Music Category

By removing the Music category, we get a closer look at the concentration of the other categories. As we posited above, user engagement alone does not seem to separate the categories clearly enough to build a classifier that automatically categorizes trending videos.

In [31]:
# Share of the total variance captured by the two components, as a percentage
# (sum of the per-component explained-variance ratios, formatted to 2 decimals).
'Amount of information retained after PCA: %.2f%%' % (pca.explained_variance_ratio_.sum()*100)

An 89% retention of information shows that the PCA was able to keep 89% of the original variance present in the original 4D graph. With the amount of overlap, it seems doubtful that the 11% lost could help in separating the categories. 