In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.colors as colors
import plotly.io as pio


In [3]:
def save_figure(fig, filename, images_dir='/Users/jdubindaclub/Desktop/stat386/my386blog/assets/images'):
    """Write a plotly figure to ``<images_dir>/<filename>`` via ``pio.write_image``.

    Parameters
    ----------
    fig
        The figure object handed unchanged to ``pio.write_image``.
    filename : str
        Name of the file to create inside ``images_dir`` (the calls in this
        notebook pass names such as ``'golf.eda1.png'``).
    images_dir : str, optional
        Directory that receives the image. The default is the path this helper
        previously had hard-coded, so existing two-argument calls behave exactly
        as before; pass a different directory to run the notebook on another
        machine.
    """
    filepath = f'{images_dir}/{filename}'
    pio.write_image(fig, filepath)

In [4]:
# Read the three channel CSV files (in the same order as before) into the frames
# that the rest of the notebook works with.
dfgood, dfbob, dfgm = (
    pd.read_csv(csv_name)
    for csv_name in ('youtube_data.csv', 'youtube_bob_data.csv', 'youtube_gm_data.csv')
)

# General Metrics

In [5]:
# Row count of each frame, one per line, in the order dfgood, dfbob, dfgm.
print(len(dfgood), len(dfbob), len(dfgm), sep='\n')

326
214
502


In [6]:
# Bar chart of the number of rows (videos) in each channel's frame.
fig = go.Figure(
    data=[
        go.Bar(
            x=['GGG', 'BDS', 'GM'],
            y=[len(frame) for frame in (dfgood, dfbob, dfgm)],
        )
    ]
)
# Semi-transparent black bars with a thin dark-blue outline.
fig.update_traces(
    marker=dict(color='black', line=dict(color='rgb(8,48,107)', width=1.5)),
    opacity=0.6,
)
fig.update_layout(title_text='Number of Videos Posted per Channel')
fig.update_xaxes(title_text='Channel')
fig.update_yaxes(title_text='Number of Videos')
save_figure(fig, 'golf.eda1.png')
fig.show()

In [7]:
# Keep only the videos dated in 2021 for each channel.
# The year is compared after parsing rather than substring-matching '2021' in the raw
# text: a '2021' elsewhere in the string can no longer match, and the cell keeps working
# when it is re-run after the 'date' column has been converted to datetime (the .str
# accessor raises on a datetime column).
dfgood21 = dfgood[pd.to_datetime(dfgood['date']).dt.year == 2021]
dfbob21 = dfbob[pd.to_datetime(dfbob['date']).dt.year == 2021]
dfgm21 = dfgm[pd.to_datetime(dfgm['date']).dt.year == 2021]

# Bar chart of the number of 2021 videos per channel, drawn in the first default
# plotly colour.
fig = go.Figure(data=[go.Bar(
    x=['GGG', 'BDS', 'GM'],
    y=[len(dfgood21), len(dfbob21), len(dfgm21)],
    marker={'color': colors.DEFAULT_PLOTLY_COLORS[0]}
    )])
fig.update_layout(title_text='Number of Videos Posted per Channel in 2021')
fig.update_xaxes(title_text='Channel')
fig.update_yaxes(title_text='Number of Videos')
save_figure(fig, 'golf.eda2.png')
fig.show()

# Over Time

In [8]:
# Parse each frame's 'date' column with pandas.to_datetime, order the rows
# chronologically and renumber the index from 0.
dfgood = (
    dfgood
    .assign(date=pd.to_datetime(dfgood['date']))
    .sort_values(by='date')
    .reset_index(drop=True)
)
dfbob = (
    dfbob
    .assign(date=pd.to_datetime(dfbob['date']))
    .sort_values(by='date')
    .reset_index(drop=True)
)
# GM only: after sorting, rows dated on or before 2020-08-01 are dropped
# (the notebook does not record why this cutoff was chosen).
dfgm = (
    dfgm
    .assign(date=pd.to_datetime(dfgm['date']))
    .sort_values(by='date')
    .loc[lambda frame: frame['date'] > '2020-08-01']
    .reset_index(drop=True)
)

In [9]:
# Line chart of per-video view counts against upload date, one line per channel.
fig = go.Figure(
    data=[
        go.Scatter(x=frame['date'], y=frame['views'], mode='lines', name=label)
        for frame, label in ((dfgood, 'GGG'), (dfbob, 'BDS'), (dfgm, 'GM'))
    ]
)
fig.update_layout(title_text='Number of Views per Video Over Time')
fig.update_xaxes(title_text='Date')
fig.update_yaxes(title_text='Number of Views')
# Deliberately not saved: 'golf.eda3.png' is written by the three-panel figure later on.
#save_figure(fig, 'golf.eda3.png')
fig.show()

In [10]:
# Three stacked line charts (views, likes and comments per video over time) with one
# line per channel in every panel.
fig = make_subplots(
    rows=3,
    cols=1,
    subplot_titles=("Views per Video", "Likes per Video", "Comments per Video"),
    vertical_spacing=0.1,
)
# One fixed colour per channel (GGG, BDS, GM order) so a channel looks the same in every
# panel. `color` stays a notebook-level name: the rolling-average cell reuses it.
color = [colors.DEFAULT_PLOTLY_COLORS[idx] for idx in (2, 4, 9)]
for metric_row, (metric_col, yaxis_title) in enumerate(
    (
        ('views', 'Number of Views'),
        ('likes', 'Number of Likes'),
        ('comments', 'Number of Comments'),
    ),
    start=1,
):
    for channel_frame, channel_label, channel_color in zip(
        (dfgood, dfbob, dfgm), ('GGG', 'BDS', 'GM'), color
    ):
        fig.add_trace(
            go.Scatter(
                x=channel_frame['date'],
                y=channel_frame[metric_col],
                mode='lines',
                name=channel_label,
                line_color=channel_color,
                # Only the top panel contributes legend entries, so each channel is
                # listed once instead of three times.
                showlegend=(metric_row == 1),
            ),
            row=metric_row,
            col=1,
        )
    fig.update_yaxes(title_text=yaxis_title, row=metric_row, col=1)
fig.update_layout(title_text='Metrics: Views, Likes, and Comments', height=1000, width=1000)
fig.update_xaxes(title_text='Date')
save_figure(fig, 'golf.eda3.png')
fig.show()

In [11]:
# Add rolling 7-day averages of views, likes and comments to each channel's frame.
# The window is time-based ('7D' measured on the 'date' column), so every value is the
# mean over the videos dated within the 7 days up to and including that video. A plain
# `.rolling(7)` would average the last 7 rows (videos) regardless of how far apart they
# were posted, which is not what the "7 Day" title below promises.
# Requires 'date' to be a sorted datetime column, as prepared by the date-parsing cell.
for channel_frame in (dfgood, dfbob, dfgm):
    for metric in ('views', 'likes', 'comments'):
        channel_frame[f'rolling_{metric}'] = (
            channel_frame.rolling('7D', on='date')[metric].mean()
        )

# Three stacked line charts of the rolling averages, one line per channel in each panel.
# `color` is the per-channel colour list defined in the previous plotting cell.
fig = make_subplots(rows=3, cols=1, subplot_titles=("Views", "Likes", "Comments"), vertical_spacing=0.1)
for metric_row, (metric, yaxis_title) in enumerate(
    (
        ('views', 'Number of Views'),
        ('likes', 'Number of Likes'),
        ('comments', 'Number of Comments'),
    ),
    start=1,
):
    for channel_frame, channel_label, channel_color in zip(
        (dfgood, dfbob, dfgm), ('GGG', 'BDS', 'GM'), color
    ):
        fig.add_trace(
            go.Scatter(
                x=channel_frame['date'],
                y=channel_frame[f'rolling_{metric}'],
                mode='lines',
                name=channel_label,
                line_color=channel_color,
            ),
            row=metric_row,
            col=1,
        )
    fig.update_yaxes(title_text=yaxis_title, row=metric_row, col=1)
fig.update_layout(title_text='Rolling 7 Day Average of Metrics: Views, Likes, and Comments', height=1000, width=1000)
fig.update_xaxes(title_text='Date')
# Show legend entries for the top panel only so each channel is listed once.
fig.update_traces(showlegend=False)
fig.update_traces(showlegend=True, row=1, col=1)
save_figure(fig, 'golf.eda4.png')
fig.show()

# Correlation

In [17]:
# Scatter of video duration against view count for GGG. The correlation returned by
# Series.corr is appended, unrounded, to the figure title.
# NOTE(review): an earlier note on this cell asked for the points to use color[0];
# no colour is applied here, so the plotly default is used.
cor_good = dfgood['duration'].corr(dfgood['views'])
fig = px.scatter(data_frame=dfgood, x='duration', y='views')
fig.update_layout(
    dict(
        title=dict(text=f'Views vs. Duration: GGG {cor_good}'),
        height=500,
        width=500,
    )
)
fig.update_xaxes(title=dict(text='Duration'))
fig.update_yaxes(title=dict(text='Number of Views'))
save_figure(fig, 'golf.eda5.png')
fig.show()

In [19]:
# Scatter of video duration against view count for BDS. The correlation returned by
# Series.corr is appended, unrounded, to the figure title.
cor_bob = dfbob['duration'].corr(dfbob['views'])
fig = px.scatter(data_frame=dfbob, x='duration', y='views')
fig.update_layout(
    dict(
        title=dict(text=f'Views vs. Duration: BDS {cor_bob}'),
        height=500,
        width=500,
    )
)
fig.update_xaxes(title=dict(text='Duration'))
fig.update_yaxes(title=dict(text='Number of Views'))
save_figure(fig, 'golf.eda7.png')
fig.show()

In [18]:
# Scatter of video duration against view count for GM. The correlation returned by
# Series.corr is appended, unrounded, to the figure title.
cor_gm = dfgm['duration'].corr(dfgm['views'])
fig = px.scatter(data_frame=dfgm, x='duration', y='views')
fig.update_layout(
    dict(
        title=dict(text=f'Views vs. Duration: GM {cor_gm}'),
        height=500,
        width=500,
    )
)
fig.update_xaxes(title=dict(text='Duration'))
fig.update_yaxes(title=dict(text='Number of Views'))
save_figure(fig, 'golf.eda6.png')
fig.show()