In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.colors as colors
import plotly.io as pio

In [3]:
# Read the three per-channel exports in one pass; the order of the paths
# matches the order of the names on the left-hand side.
dfgood, dfbob, dfgm = (
    pd.read_csv(csv_path)
    for csv_path in ('youtube_data.csv', 'youtube_bob_data.csv', 'youtube_gm_data.csv')
)

In [6]:
# Row count of each channel frame, one per line, followed by the combined
# total as the cell's displayed value.
print(*map(len, (dfgood, dfbob, dfgm)), sep='\n')
sum(map(len, (dfgood, dfbob, dfgm)))

326
214
502


1042

In [8]:
# Tag every row with a short code for the channel it came from, so the
# frames can still be told apart after they are combined.
for frame, label in ((dfgood, 'GG'), (dfbob, 'BDS'), (dfgm, 'GM')):
    frame['source'] = label

In [10]:
# Stack the three channel frames into a single frame with a fresh 0..n-1 index.
channel_frames = [dfgood, dfbob, dfgm]
df = pd.concat(channel_frames, ignore_index=True)
df

Unnamed: 0,date,title,duration,views,likes,comments,source
0,2022-06-08 14:00:11+00:00,First Ever Outlaw Golf Challenge | Good Good,1889,615200,18865,1327,GG
1,2022-03-16 04:33:56+00:00,Good Good 1v1 Bracket Elimination Golf Challen...,1238,499863,17874,990,GG
2,2022-03-20 17:11:00+00:00,"Last Golfer To Make A Bogey Wins $1,000",1266,1561737,40287,1084,GG
3,2022-03-27 18:15:11+00:00,2v2v2 Hole In One Golf Challenge | Good Good,732,437085,15842,460,GG
4,2022-04-29 04:09:10+00:00,Who Wins The First Pursell Farms Classic?! | 3...,2021,489276,17824,971,GG
...,...,...,...,...,...,...,...
1037,2019-01-04 01:43:14+00:00,We Created a GOLF COURSE in MY HOUSE And This ...,1021,29022,510,113,GM
1038,2019-01-09 23:26:50+00:00,Barefoot Golf Challenge *Frigid Temperatures*,719,14786,408,147,GM
1039,2020-07-12 22:00:12+00:00,Shot Tracers On EVERY SHOT! | Sunday Match #27...,2010,316680,11268,1122,GM
1040,2018-10-17 01:08:26+00:00,Golf Ball on the Back Of the Club Challenge Wi...,700,7518,168,30,GM


In [12]:
# Parse the 'date' column into real datetimes, then order the rows
# chronologically (oldest upload first).
df = (
    df
    .assign(date=pd.to_datetime(df['date']))
    .sort_values(by='date')
)
df

Unnamed: 0,date,title,duration,views,likes,comments,source
1006,2015-06-14 16:36:44+00:00,Garrett's Round Of 72 (FULL 18),830,11120,192,18,GM
951,2016-12-28 01:57:04+00:00,Golf Trick Shot Compilation,228,6809,177,8,GM
952,2017-07-03 05:06:02+00:00,"Garrett's Round of -4, 68 (Bogey Free) Hurrica...",895,278710,2424,137,GM
1003,2018-01-28 20:47:56+00:00,PGA Show Demo Day Ground 0,747,2432,63,9,GM
691,2018-07-20 23:58:39+00:00,How To Juggle a Golf Ball,244,31837,808,21,GM
...,...,...,...,...,...,...,...
449,2023-03-23 17:00:38+00:00,The Most Ridiculous Round Of Golf In History,2020,436953,16778,1887,BDS
627,2023-03-23 20:41:46+00:00,I Challenged a High School Golf Team to A Matc...,1566,179296,6127,386,GM
547,2023-03-26 14:38:48+00:00,We Did NOT Expect The Match To Go This Way… Ga...,1489,198454,7332,1421,GM
407,2023-03-27 16:53:22+00:00,At least he’s consistent… #bobdoessports,18,35889,828,35,BDS


In [14]:
# Add a column that buckets each video by the hour it was posted, taken from
# the 'date' column (year, month, day, hour, minute, second).
# Four groups: morning (< 12), afternoon (< 17), evening (< 20), night (rest).
# NOTE(review): the timestamps carry a +00:00 offset, so these are UTC hours,
# not the uploader's local time -- confirm that is the intended bucketing.

posted_hour = df['date'].dt.hour
df['time_of_day'] = np.select(
    [posted_hour < 12, posted_hour < 17, posted_hour < 20],  # first match wins
    ['morning', 'afternoon', 'evening'],
    default='night',
)
df

Unnamed: 0,date,title,duration,views,likes,comments,source,time_of_day
1006,2015-06-14 16:36:44+00:00,Garrett's Round Of 72 (FULL 18),830,11120,192,18,GM,afternoon
951,2016-12-28 01:57:04+00:00,Golf Trick Shot Compilation,228,6809,177,8,GM,morning
952,2017-07-03 05:06:02+00:00,"Garrett's Round of -4, 68 (Bogey Free) Hurrica...",895,278710,2424,137,GM,morning
1003,2018-01-28 20:47:56+00:00,PGA Show Demo Day Ground 0,747,2432,63,9,GM,night
691,2018-07-20 23:58:39+00:00,How To Juggle a Golf Ball,244,31837,808,21,GM,night
...,...,...,...,...,...,...,...,...
449,2023-03-23 17:00:38+00:00,The Most Ridiculous Round Of Golf In History,2020,436953,16778,1887,BDS,evening
627,2023-03-23 20:41:46+00:00,I Challenged a High School Golf Team to A Matc...,1566,179296,6127,386,GM,night
547,2023-03-26 14:38:48+00:00,We Did NOT Expect The Match To Go This Way… Ga...,1489,198454,7332,1421,GM,afternoon
407,2023-03-27 16:53:22+00:00,At least he’s consistent… #bobdoessports,18,35889,828,35,BDS,afternoon


In [23]:
# Box plot of views per posting-time bucket, split by channel, with every
# individual video drawn as a point.
# category_orders pins the x-axis to chronological order; without it the
# buckets appear in whatever order they first occur in the date-sorted frame.
fig = px.box(
    df,
    x='time_of_day',
    y='views',
    color='source',
    points='all',
    category_orders={'time_of_day': ['morning', 'afternoon', 'evening', 'night']},
    height=1000,
    width=1500,
)
fig.show()

In [17]:
# Scatter of video length against view count, one colour per channel.
fig = px.scatter(
    data_frame=df,
    x='duration',
    y='views',
    color='source',
)
fig.show()

In [20]:
# Bubble chart of views over time: marker size scales with views, colour
# identifies the channel.
fig = px.scatter(df, x='date', y='views', color='source', size='views', size_max=60)
fig.show()

# Same data as three stacked subplots, one per channel, sharing the x-axis.
# Each pair is (code stored in df['source'], display name). The display name
# is used for both the subplot title and the legend entry so the two agree.
channels = [('GG', 'Good Good Golf'), ('BDS', 'Bob Does Sports'), ('GM', 'GM_Golf')]
fig = make_subplots(
    rows=len(channels),
    cols=1,
    shared_xaxes=True,
    subplot_titles=[label for _, label in channels],
)
for row, (code, label) in enumerate(channels, start=1):
    # Slice the combined frame rather than the raw per-channel frames: only
    # df has its 'date' column parsed to datetimes and sorted.
    channel_df = df[df['source'] == code]
    fig.add_trace(
        go.Scatter(x=channel_df['date'], y=channel_df['views'], mode='markers', name=label),
        row=row,
        col=1,
    )
fig.update_layout(height=800, width=800, title_text="Views over Time")
fig.show()

In [None]:
# Correlation heatmap of the numeric columns of the combined frame.
# The original referenced `temp`, which no cell in this notebook defines, so
# it failed with NameError on a fresh kernel.
# NOTE(review): `df` is assumed to be the frame `temp` stood for -- confirm.
# include='number' is used because the combined frame also holds a datetime
# column, which is not meaningful in a correlation matrix.
sns.heatmap(df.select_dtypes(include='number').corr()).set_title('Correlation Plot')