In [2]:
import pandas as pd
import os
from googleapiclient.discovery import build
import isodate
from sqlalchemy import create_engine,text
import plotly.express as px

# Import Data

In [5]:
# Location of the SQLite database, relative to this notebook
db_string = "sqlite:///../db/youtube.db"

# Build the SQLAlchemy engine for that database ...
engine = create_engine(db_string)
# ... and open the connection that the query cells below read from
conn = engine.connect()

In [6]:
# Load every row of the `channel` table into a DataFrame and preview it
query = text('SELECT * FROM channel')
channels_df = pd.read_sql_query(sql=query, con=conn)
channels_df.head()

Unnamed: 0,channel_id,channel_name,description,subscriber_count,view_count,video_count,playlist_id,start_date,country
0,UC8butISFwT-Wl7EV0hUK0BQ,freeCodeCamp.org,Learn to code for free.,9040000,661570512,1585,UU8butISFwT-Wl7EV0hUK0BQ,2014-12-16,US
1,UCEBpSZhI1X8WaP-kY_2LLcg,365 Data Science,At 365 Data Science we make #DataScience acces...,307000,13782817,224,UUEBpSZhI1X8WaP-kY_2LLcg,2017-08-07,BG
2,UCHXa4OpASJEwrHrLeIzw7Yg,Nicholas Renotte,"Sup!\n\nWelcome to the channel. So, if you're ...",235000,15263747,304,UUHXa4OpASJEwrHrLeIzw7Yg,2019-01-26,AU
3,UCDybamfye5An6p-j1t2YMsg,Mo Chen,"👋 Hey there, my name is Mo Chen and I work as ...",82100,2658864,88,UUDybamfye5An6p-j1t2YMsg,2022-12-25,GB
4,UCkRFwipiIqBTakN-mkZ-GcQ,Ayush Singh,,70200,2075280,24,UUkRFwipiIqBTakN-mkZ-GcQ,2022-06-25,IN


In [7]:
# Load every row of the `video` table into a DataFrame and preview it
query = text('SELECT * FROM video')
videos_df = pd.read_sql_query(sql=query, con=conn)
videos_df.head()

Unnamed: 0,channel_id,video_id,video_title,description,tags,published,view_count,like_count,favourite_count,comment_count,duration,definition,caption,category_id
0,UC8butISFwT-Wl7EV0hUK0BQ,YdWkUdMxMvM,Career Change to Code - The Complete Guide,This course is for those considering transitio...,,2024-02-07 15:49:07.000000,3252.0,374.0,0,27.0,12191,hd,False,27
1,UC8butISFwT-Wl7EV0hUK0BQ,5rNk7m_zlAg,Spring Boot & Spring Data JPA – Complete Course,Learn how to use Spring Boot and Spring Data J...,,2024-02-06 15:25:40.000000,24118.0,1434.0,0,223.0,45737,hd,False,27
2,UC8butISFwT-Wl7EV0hUK0BQ,5ZdHfJVAY-s,Build 25 React Projects – Tutorial,Master React by building 25 different projects...,,2024-02-05 15:30:28.000000,50388.0,2988.0,0,103.0,34614,hd,False,27
3,UC8butISFwT-Wl7EV0hUK0BQ,OwjKN9_NqPI,Oh My Zsh Creator Robby Russell – freeCodeCamp...,"In this week's episode of the podcast, freeCod...",,2024-02-02 15:26:29.000000,14435.0,284.0,0,22.0,7673,hd,False,27
4,UC8butISFwT-Wl7EV0hUK0BQ,e2nkq3h1P68,Learn Accessibility - Full a11y Tutorial,Learn how to write accessible HTML by solving ...,,2024-02-01 15:38:37.000000,23669.0,968.0,0,17.0,5586,hd,False,27


# Data Pre-Processing
## Remove shorts and longs

In [8]:
# Keep only videos whose duration falls in the inclusive range [60, 1800].
# NOTE(review): duration is presumably in seconds (i.e. 1-30 minutes) - confirm
# against the ingestion step that populated the `video` table.
videos_df_filtered = videos_df.loc[videos_df['duration'].between(60, 1800)]

# Analysis

## Engagement Metrics
### Questions:
- What are the key engagement metrics that correlate with higher view counts? 
- Does an increase in comments per view lead to more views?
- Do metrics such as likes, comments, shares, and watch time lead to increased views?
- Aggregated at a channel level, does increased engagement lead to more subscriptions?

### Steps:
**Identifying Key Engagement Metrics Correlating with Higher View Counts:**
- Visualize the relationships between engagement metrics and view counts using scatter plots, heatmaps, or pair plots to gain insights into how they correlate.
- Explore the correlation between each engagement metric (e.g., like count, comment count, favorite count, etc.) and the view count. You can calculate correlation coefficients (e.g., Pearson correlation) to measure the strength and direction of the relationship.

**Investigating if Increased Engagement Metric per View Leads to More Views:**
- Visualize the relationships between engagement metrics per view and view counts using scatter plots, heatmaps, or pair plots to gain insights into how they correlate.
- Explore the correlation between each per-view engagement metric (e.g., likes per view, comments per view, favorites per view, etc.) and the view count. You can calculate correlation coefficients (e.g., Pearson correlation) to measure the strength and direction of the relationship.
- Additionally, you can perform feature importance analysis using techniques like Random Forest or Gradient Boosting to identify which engagement metrics have the most significant impact on view counts.
- Consider using regression analysis to model the relationship between comments per view and view count, controlling for other factors if necessary.
- Consider incorporating interaction terms or polynomial features to capture potential nonlinear relationships between metrics and view counts.

**Investigating if at a channel level, Increased Engagement Metric per Video Leads to More Subs:**
- Aggregate the metrics (likes, comments, shares, watch time) for each channel (e.g., as an average per video) and investigate their impact on subscriber count.
- Visualize the relationships between the aggregated engagement metrics and subscriber counts using scatter plots, heatmaps, or pair plots to gain insights into how they correlate.
- Perform a multivariate analysis to investigate the combined impact of these metrics on subscriber counts. You can use techniques like multiple linear regression or generalized linear models.

In [9]:
# Show a single channel row as a reminder of the available columns
channels_df.head(n=1)

Unnamed: 0,channel_id,channel_name,description,subscriber_count,view_count,video_count,playlist_id,start_date,country
0,UC8butISFwT-Wl7EV0hUK0BQ,freeCodeCamp.org,Learn to code for free.,9040000,661570512,1585,UU8butISFwT-Wl7EV0hUK0BQ,2014-12-16,US


In [10]:
# Show a single filtered video row as a reminder of the available columns
videos_df_filtered.head(n=1)

Unnamed: 0,channel_id,video_id,video_title,description,tags,published,view_count,like_count,favourite_count,comment_count,duration,definition,caption,category_id
57,UC8butISFwT-Wl7EV0hUK0BQ,9He4UBLyk8Y,Front End Developer Roadmap 2024,Learn what technologies you should learn first...,,2023-10-19 14:18:42.000000,507722.0,17091.0,0,493.0,729,hd,False,27


#### Visualize the relationships between engagement metrics and view counts using scatter plots, heatmaps, or pair plots to gain insights into how they correlate.

In [11]:
# Likes vs. views for the duration-filtered videos (one marker per row)
fig = px.scatter(
    data_frame=videos_df_filtered,
    x='like_count',
    y='view_count',
    title='Relationship between Like Count and View Count',
    labels={'like_count': 'Like Count', 'view_count': 'View Count'},
)
fig.show()


In [12]:
# Comments vs. views for the duration-filtered videos (one marker per row).
# The title and the `labels` mapping must name the column actually plotted on
# the x-axis (`comment_count`); a `like_count` key would be ignored and leave
# the axis showing the raw column name.
fig = px.scatter(videos_df_filtered, x='comment_count', y='view_count',
                 title='Relationship between Comment Count and View Count',
                 labels={'comment_count': 'Comment Count', 'view_count': 'View Count'})
fig.show()
