In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

In [2]:
# Load the two CSV exports dated 2023-03-17 into DataFrames:
# `channels` holds the channel-level data, `videos` the video-level data.
channels_csv = 'channels_data_2023-03-17.csv'
videos_csv = 'video_data_2023-03-17.csv'

channels = pd.read_csv(channels_csv)
videos = pd.read_csv(videos_csv)

In [3]:
# Preview the first five rows to check the columns loaded as expected.
channels.head(n=5)

Unnamed: 0,channel_id,title,description,published_at,uploads_id,view_count,subscriber_count,video_count
0,UCkyBtxQh0H5yOU5RjtHNr-w,Vania Ice,Vania Ice is a Burundian Female Artist Singer/...,2018-12-01 06:09:35+00:00,UUkyBtxQh0H5yOU5RjtHNr-w,425724,13200,8
1,UCMrF7OdgLb18wjk8U-wB5sg,Trey Zo & Rappy Boy,#dudeclickmadeboom\nTwo blood brothers doing m...,2016-02-19 23:54:25+00:00,UUMrF7OdgLb18wjk8U-wB5sg,2755721,36900,8
2,UCRuEBttZI9Q3oWwyj9F0byg,El Pro Burundi,,2015-11-03 09:57:14+00:00,UURuEBttZI9Q3oWwyj9F0byg,781401,25100,18
3,UCqFJ9aC_2uHG6c9YRnAW5zA,Big Fizzo Official,Welcome to the official YouTube channel of Big...,2016-04-08 20:27:08+00:00,UUqFJ9aC_2uHG6c9YRnAW5zA,6971548,112000,60
4,UChFNdXDc35jvpqY_BZUVpwg,Thizzy official,Artiste/chanteur,2016-02-22 15:52:06+00:00,UUhFNdXDc35jvpqY_BZUVpwg,91726,3260,15


# Questions this data answers

In [24]:
# Descriptive statistics for the channel-level data. Every answer is stored in
# a module-level name and printed.

# Question 1: What is the average number of views per channel?
avg_views_per_channel = channels['view_count'].mean()
print(f"The average number of views per channel: {avg_views_per_channel}")

# Question 2: What is the average number of subscribers per channel?
avg_subscribers_per_channel = channels['subscriber_count'].mean()
print(f"The average number of subscribers per channel: {avg_subscribers_per_channel}")

# Question 3: Which channel has the highest number of subscribers?
max_subscribers_channel = channels.loc[channels['subscriber_count'].idxmax(), 'title']
print(f"The channel with the highest number of subscribers is: {max_subscribers_channel}")

# Question 4: Which channel has the highest number of views?
max_views_channel = channels.loc[channels['view_count'].idxmax(), 'title']
print(f"The channel with the highest number of views is: {max_views_channel}")

# Question 5: How many channels have less than 10,000 subscribers?
num_channels_less_than_10k = (channels['subscriber_count'] < 10000).sum()
print(f"The number of channels with less than 10,000 subscribers is: {num_channels_less_than_10k}")

# Question 6: What is the average video count per channel?
avg_video_count_per_channel = channels['video_count'].mean()
print(f"The average video count per channel is: {avg_video_count_per_channel}")

# Question 7: What is the total number of views for all channels combined?
total_views = channels['view_count'].sum()
print(f"The total number of views for all channels combined is: {total_views}")

# Question 8: Is there a correlation between the number of subscribers and the number of views?
corr = channels['subscriber_count'].corr(channels['view_count'])
print(f"The correlation between subscriber count and view count is: {corr}")

# Question 9: Is there a relationship between the age of a channel (based on
# "published_at") and the number of subscribers or views?

# Parse the timestamps directly as timezone-aware UTC. `utc=True` converts any
# offset to UTC (rather than stripping the offset and relabelling the wall-clock
# time), and it is safe to re-run on an already-converted column.
channels['published_at'] = pd.to_datetime(channels['published_at'], utc=True)

# Channel age in years, measured from the moment this cell runs. Pearson
# correlation is unchanged when a constant is added to every age, so the run
# date barely affects the results below (only through whole-day truncation).
now = pd.Timestamp.now(tz='UTC')
ages = pd.DataFrame({'age': (now - channels['published_at']).dt.days / 365.25})

# Calculate the correlation between age and subscriber count
corr_sub = ages['age'].corr(channels['subscriber_count'])
print(f"The correlation between age and subscriber count is: {corr_sub}")

# Calculate the correlation between age and view count
corr_view = ages['age'].corr(channels['view_count'])
print(f"The correlation between age and view count is: {corr_view}")

The average number of views per channel: 3029300.066666667
The average number of subscribers per channel: 47031.0
The channel with the highest number of subscribers is: Sat-B
The channel with the highest number of views is: Sat-B
The number of channels with less than 10,000 subscribers is: 4
The average video count per channel is: 37.266666666666666
The total number of views for all channels combined is: 90879002
The correlation between subscriber count and view count is: 0.9703261979698973
The correlation between age and subscriber count is: 0.3138557226457533
The correlation between age and view count is: 0.35493297332893164


## Visualization



### Compare the number of views, subscribers, and videos for each channel.

In [32]:
# Grouped bar chart comparing views, subscribers and videos for each channel.
# `go` is the plotly.graph_objects alias imported in the first cell.

# All three traces share one categorical x-axis, and Plotly orders categories
# by first appearance in the trace data, so a single sort (by views) decides
# the order of the channels on the axis.
df_views = channels.sort_values(by='view_count', ascending=False)

# Create data for the bar plots
views_data = go.Bar(x=df_views['title'], y=df_views['view_count'], name='Views')
subscribers_data = go.Bar(x=df_views['title'], y=df_views['subscriber_count'], name='Subscribers')
videos_data = go.Bar(x=df_views['title'], y=df_views['video_count'], name='Videos')

# Create layout for the plot
layout = go.Layout(
    title='Comparison of Views, Subscribers, and Videos per Channel',
    xaxis_title='Channel',
    yaxis_title='Count (log scale)',
)

# Create the figure with the three bar traces side by side per channel
fig = go.Figure(data=[views_data, subscribers_data, videos_data], layout=layout)
fig.update_layout(barmode='group')
# The three metrics differ by orders of magnitude; on a linear axis the
# subscriber and video bars are invisible next to the view counts.
fig.update_yaxes(type='log')
fig.show()

In [35]:
# Relationship between views and subscribers per channel; marker size encodes
# the number of videos. `px` is the plotly.express alias imported in the first
# cell, so it is not re-imported here.
fig = px.scatter(
    channels,
    x="view_count",
    y="subscriber_count",
    size="video_count",
    hover_data=['title'],
    title="Relationship between Views and Subscribers",
    # `labels` renames the columns on the axes and in the hover box alike.
    labels={
        "view_count": "View Count",
        "subscriber_count": "Subscriber Count",
        "video_count": "Video Count",
    },
)
fig.show()

In [38]:
# Relationship between the number of videos a channel has uploaded and its
# total views. The title is set once here; previously a second, different
# title passed to update_layout silently replaced the one given to px.scatter.
fig = px.scatter(
    channels,
    x='video_count',
    y='view_count',
    hover_data=['title'],
    title="Relationship between Views and Video Counts",
    # `labels` renames the columns on the axes and in the hover box alike.
    labels={"video_count": "Video Count", "view_count": "View Count"},
)

fig.show()

In [49]:
# Draw one 20-bin histogram per channel metric, in the order views, videos,
# subscribers. `fig` is left pointing at the last (subscribers) figure.
histogram_titles = {
    "view_count": "Distribution of Views",
    "video_count": "Distribution of Videos",
    "subscriber_count": "Distribution of Subscribers",
}

for metric_column, chart_title in histogram_titles.items():
    fig = px.histogram(channels, x=metric_column, nbins=20, title=chart_title)
    fig.show()

In [48]:
# Histogram of channel view counts. The result must be bound to `fig`
# (it was previously assigned to `ig` by typo), otherwise fig.show() re-renders
# whatever figure an earlier cell left behind.
fig = px.histogram(channels, x="view_count", nbins=20, title="Distribution of Views")
fig.show()


In [50]:
# Views, subscribers and videos of each channel, ordered by the date the
# channel was published. Each point on a line is one channel.

# Sort the data by the published date
df_sorted = channels.sort_values(by='published_at')

# Create data for the line plots. Markers plus hover text make the individual
# channels identifiable along each line.
views_data = go.Scatter(x=df_sorted['published_at'], y=df_sorted['view_count'], name='Views',
                        mode='lines+markers', hovertext=df_sorted['title'])
subscribers_data = go.Scatter(x=df_sorted['published_at'], y=df_sorted['subscriber_count'], name='Subscribers',
                              mode='lines+markers', hovertext=df_sorted['title'])
videos_data = go.Scatter(x=df_sorted['published_at'], y=df_sorted['video_count'], name='Videos',
                         mode='lines+markers', hovertext=df_sorted['title'])

# Create layout for the plot
layout = go.Layout(title='Trend of Views, Subscribers, and Videos Over Time',
                   xaxis_title='Published Date', yaxis_title='Count (log scale)')

# Create the figure holding the three line traces
fig = go.Figure(data=[views_data, subscribers_data, videos_data], layout=layout)
# The three metrics differ by orders of magnitude; on a linear axis the
# subscriber and video lines collapse onto the x-axis.
fig.update_yaxes(type='log')
fig.show()

In [51]:
# Correlation heatmap of the numeric channel metrics.

# Correlation is only defined for numeric data. Restricting the frame first
# avoids pandas either raising on, or silently dropping, the text columns
# (which one happens depends on the pandas version).
corr_matrix = channels.select_dtypes(include='number').corr()

# Create the heatmap data. The axis labels come from the correlation matrix
# itself so they always line up with the cells; using channels.columns would
# label the numeric-only matrix with the names of unrelated columns.
heatmap_data = go.Heatmap(
    z=corr_matrix.values,
    x=corr_matrix.columns,
    y=corr_matrix.index,
    colorscale='RdBu',
    # Pin the range to [-1, 1] so the diverging colour scale is centred on 0.
    zmin=-1,
    zmax=1,
)

# Set the layout for the heatmap
layout = go.Layout(
    title='Correlation Heatmap',
)

# Create the figure
fig = go.Figure(data=[heatmap_data], layout=layout)

# Show the figure
fig.show()





In [53]:
# Inspect a random sample of 20 videos. The fixed seed returns the same rows
# on every run, and leaving the frame as the cell's last expression gives the
# notebook's rich table display instead of wrapped plain text.
videos.sample(n=20, random_state=42)

         video_id                                              title  \
748   6pahH5844bs  Mt number one - Coronavirus (Official Audio) f...   
874   UmaQOy21hvc  Sat-B - Joto Feat. Miss Erica & Lacia [IWACU] ...   
1065  -SagpyQ5UeE                 FLOW TIME with RED BHUL || S01-E06   
835   2zwoIURkoy0  Sat-B - Baby Girl ft Herbert Skillz (Official ...   
1138  9BReTYOTTVI                  #FASHA_AVURWE - Francisca UWINEZA   
593   oVmkyzptKv0         Vichou Love - Ur uwambere (Official Audio)   
689   Tx3zICOYVFU                                         Olga 🇷🇼🇷🇼🔥   
349   KacoVvCULCM                          Aba niga2 n'1baby S2 Ep 5   
84    jVKtFNCtEKg                                 Ni wewe ( Teaser )   
1091  OZdOl_HUAeU         B Face Performance at OctoberFest | Part 1   
653   TadwhT-8ZdM          Vichou Peace&Love - Mawe (Official Video)   
876   AQVG0eAsXNE  Sat-B - Tanganyika Girl Feat. Magic Washington...   
1030  JMRg37jBK64     B Face Live Performance at Jamaa Festival 