In [13]:
# dependencies
import chardet
import pandas as pd
import os 
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# API
import requests
import json

# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, text, inspect, func


In [14]:
# Load in file
# Store filepath in a variable
YOUTUBE_PATH = "Resources/global_youtube_stats.csv"

In [15]:
with open(YOUTUBE_PATH, 'rb') as csv_file:
    raw_content = csv_file.read()
    chardet_result = chardet.detect(raw_content)
print(chardet_result)

{'encoding': 'ISO-8859-1', 'confidence': 0.7289153199558486, 'language': ''}


In [16]:
# Read and display the CSV with Pandas
df = pd.read_csv(YOUTUBE_PATH, encoding=chardet_result['encoding'])
df.head()

Unnamed: 0,rank,Youtuber,subscribers,video views,category,Title,uploads,Country,Abbreviation,channel_type,...,subscribers_for_last_30_days,created_year,created_month,created_date,Gross tertiary education enrollment (%),Population,Unemployment rate,Urban_population,Latitude,Longitude
0,1,T-Series,245000000,228000000000.0,Music,T-Series,20082,India,IN,Music,...,2000000.0,2006.0,Mar,13.0,28.1,1366418000.0,5.36,471031528.0,20.593684,78.96288
1,2,YouTube Movies,170000000,0.0,Film & Animation,youtubemovies,1,United States,US,Games,...,,2006.0,Mar,5.0,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891
2,3,MrBeast,166000000,28368840000.0,Entertainment,MrBeast,741,United States,US,Entertainment,...,8000000.0,2012.0,Feb,20.0,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891
3,4,Cocomelon - Nursery Rhymes,162000000,164000000000.0,Education,Cocomelon - Nursery Rhymes,966,United States,US,Education,...,1000000.0,2006.0,Sep,1.0,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891
4,5,SET India,159000000,148000000000.0,Shows,SET India,116536,India,IN,Entertainment,...,1000000.0,2006.0,Sep,20.0,28.1,1366418000.0,5.36,471031528.0,20.593684,78.96288


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 995 entries, 0 to 994
Data columns (total 28 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   rank                                     995 non-null    int64  
 1   Youtuber                                 995 non-null    object 
 2   subscribers                              995 non-null    int64  
 3   video views                              995 non-null    float64
 4   category                                 949 non-null    object 
 5   Title                                    995 non-null    object 
 6   uploads                                  995 non-null    int64  
 7   Country                                  873 non-null    object 
 8   Abbreviation                             873 non-null    object 
 9   channel_type                             965 non-null    object 
 10  video_views_rank                         994 non-n

In [6]:
df.dropna(inplace=True)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 554 entries, 0 to 994
Data columns (total 28 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   rank                                     554 non-null    int64  
 1   Youtuber                                 554 non-null    object 
 2   subscribers                              554 non-null    int64  
 3   video views                              554 non-null    float64
 4   category                                 554 non-null    object 
 5   Title                                    554 non-null    object 
 6   uploads                                  554 non-null    int64  
 7   Country                                  554 non-null    object 
 8   Abbreviation                             554 non-null    object 
 9   channel_type                             554 non-null    object 
 10  video_views_rank                         554 non-null  

In [17]:
# pd.set_option('display.max_rows', None)  # Set the option to display all rows
#print(df)

In [9]:
# Assuming your DataFrame is named df
# Rename specific columns
cleaned_df = df.rename(columns={
    'Gross tertiary education enrollment (%)': 'gross education enrollment percentage',
    'Youtuber': 'youtuber',
    'Title': 'title',
    'Abbreviation': 'abbreviation',
    'Country' : 'country',
    'Population': 'population',
    'Unemployment rate': 'unemployment rate',
    'Urban_population' : 'urban_population',
    'Latitude' : 'latitude',
    'Longitude' : 'longitude',
    'video views' : 'video_views'
    })

# Display the DataFrame with renamed columns
cleaned_df.head()

Unnamed: 0,rank,youtuber,subscribers,video_views,category,title,uploads,country,abbreviation,channel_type,...,subscribers_for_last_30_days,created_year,created_month,created_date,gross education enrollment percentage,population,unemployment rate,urban_population,latitude,longitude
0,1,T-Series,245000000,228000000000.0,Music,T-Series,20082,India,IN,Music,...,2000000.0,2006.0,Mar,13.0,28.1,1366418000.0,5.36,471031528.0,20.593684,78.96288
2,3,MrBeast,166000000,28368840000.0,Entertainment,MrBeast,741,United States,US,Entertainment,...,8000000.0,2012.0,Feb,20.0,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891
3,4,Cocomelon - Nursery Rhymes,162000000,164000000000.0,Education,Cocomelon - Nursery Rhymes,966,United States,US,Education,...,1000000.0,2006.0,Sep,1.0,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891
4,5,SET India,159000000,148000000000.0,Shows,SET India,116536,India,IN,Entertainment,...,1000000.0,2006.0,Sep,20.0,28.1,1366418000.0,5.36,471031528.0,20.593684,78.96288
8,9,Like Nastya,106000000,90479060000.0,People & Blogs,Like Nastya Vlog,493,Russia,RU,People,...,100000.0,2016.0,Jan,14.0,81.9,144373500.0,4.59,107683889.0,61.52401,105.318756


In [18]:
# Specify the URI for the cleaned database
db_uri = 'sqlite:///data.db'

# Create an engine using the specified URI
engine = create_engine(db_uri)

# Write the cleaned DataFrame to a SQLite database table named 'my_table'
cleaned_df.to_sql('my_table', engine, if_exists='replace', index=False)

# Confirm that the data has been successfully written to the database
df_from_db = pd.read_sql('my_table', engine)
print(df_from_db)

     rank                    youtuber  subscribers   video_views  \
0       1                    T-Series    245000000  2.280000e+11   
1       3                     MrBeast    166000000  2.836884e+10   
2       4  Cocomelon - Nursery Rhymes    162000000  1.640000e+11   
3       5                   SET India    159000000  1.480000e+11   
4       9                 Like Nastya    106000000  9.047906e+10   
..    ...                         ...          ...           ...   
549   990                   Migos ATL     12400000  6.993406e+09   
550   991               Natan por Aï¿     12300000  9.029610e+09   
551   992    Free Fire India Official     12300000  1.674410e+09   
552   994                 RobTopGames     12300000  3.741235e+08   
553   995                Make Joke Of     12300000  2.129774e+09   

           category                       title  uploads        country  \
0             Music                    T-Series    20082          India   
1     Entertainment              

In [19]:
# INSPECT to confirm existence
# Create the inspector and connect it to the engine
inspector = inspect(engine)

# Collect the names of tables within the database
tables = inspector.get_table_names()

print(tables)

['my_table']


In [42]:
query = """ SELECT rank, Youtuber, subscribers
FROM my_table
ORDER BY subscribers DESC
LIMIT 25 """

# Execute the query and read the results into a DataFrame
df = pd.read_sql(text(query), con=engine)
df.head(25)

Unnamed: 0,rank,youtuber,subscribers
0,1,T-Series,245000000
1,3,MrBeast,166000000
2,4,Cocomelon - Nursery Rhymes,162000000
3,5,SET India,159000000
4,9,Like Nastya,106000000
5,10,Vlad and Niki,98900000
6,11,Zee Music Company,96700000
7,12,WWE,96000000
8,14,BLACKPINK,89800000
9,16,Sony SAB,83000000


In [43]:
#WHAT ARE THE TOP 10/25 YOUTUBE CHANNELS BY THE NUMBER OF SUBSCRIBERS AND VIDEO VIEWS
# 1A Define the SQL query top channels by subscribers
query = """
SELECT  Subscribers, country, channel_type
FROM my_table
GROUP BY channel_type
ORDER BY Subscribers DESC
LIMIT 25
"""

# Execute the query and read the results into a DataFrame
df = pd.read_sql(text(query), con=engine)
df.head(25)

Unnamed: 0,subscribers,country,channel_type
0,245000000,India,Music
1,166000000,United States,Entertainment
2,162000000,United States,Education
3,106000000,Russia,People
4,96000000,United States,Sports
5,59500000,United States,Film
6,57600000,India,News
7,48100000,Chile,Games
8,43500000,Colombia,Howto
9,39200000,India,Comedy


In [44]:
# 1B Define the SQL query top channels by views
query = """
SELECT Rank, Youtuber, video_views, subscribers, country, channel_type
FROM my_table
GROUP BY channel_type
ORDER BY video_views DESC
LIMIT 25
"""

# Execute the query and read the results into a DataFrame
df = pd.read_sql(text(query), con=engine)
df.head(25)

Unnamed: 0,rank,youtuber,video_views,subscribers,country,channel_type
0,1,T-Series,228000000000.0,245000000,India,Music
1,4,Cocomelon - Nursery Rhymes,164000000000.0,162000000,United States,Education
2,9,Like Nastya,90479060000.0,106000000,Russia,People
3,12,WWE,77428470000.0,96000000,United States,Sports
4,30,Movieclips,59316470000.0,59500000,United States,Film
5,3,MrBeast,28368840000.0,166000000,United States,Entertainment
6,64,Shakira,27568760000.0,43500000,Colombia,Howto
7,34,Aaj Tak,25307750000.0,57600000,India,News
8,276,That Little Puff,20289690000.0,23700000,United States,Animals
9,45,JuegaGerman,14631710000.0,48100000,Chile,Games


In [45]:
#WHAT ARE THE TOP 10/25 YOUTUBE CHANNELS BY COUNTRY


# 2 Define the SQL query top channels by country
query = """
SELECT Rank, Youtuber, subscribers, video_views, country, channel_type, category
FROM my_table
GROUP BY country
ORDER BY channel_type DESC
LIMIT 25
"""

# Execute the query and read the results into a DataFrame
df = pd.read_sql(text(query), con=engine)
df.head(25)

Unnamed: 0,rank,youtuber,subscribers,video_views,country,channel_type,category
0,424,FIFA,19400000,5529132000.0,Switzerland,Sports,Sports
1,479,How Ridiculous,18000000,9601137000.0,Australia,Sports,Sports
2,77,shfa2 - ï¿½ï¿½,39700000,23884820000.0,United Arab Emirates,People,People & Blogs
3,9,Like Nastya,106000000,90479060000.0,Russia,People,People & Blogs
4,72,Kimberly Loaiza,41300000,5603112000.0,Mexico,People,People & Blogs
5,51,A4,46300000,22936630000.0,Cuba,People,Gaming
6,40,Ed Sheeran,53500000,30367680000.0,United Kingdom,Music,Music
7,260,netd mï¿½ï¿,24100000,56106090000.0,Turkey,Music,Music
8,14,BLACKPINK,89800000,32144600000.0,South Korea,Music,People & Blogs
9,900,DJ Khaled,13100000,6637821000.0,Samoa,Music,Music


In [46]:
#3 WHAT ARE THE TOP 10/25 YOUTUBE CHANNELS BY THE NUMBER OF VIEWS


query = """SELECT Youtuber, video_views
FROM my_table
ORDER BY video_views DESC
LIMIT 10"""

# Execute the query and read the results into a DataFrame
df = pd.read_sql(text(query), con=engine)
df.head(25)


Unnamed: 0,youtuber,video_views
0,T-Series,228000000000.0
1,Cocomelon - Nursery Rhymes,164000000000.0
2,SET India,148000000000.0
3,Sony SAB,101000000000.0
4,Like Nastya,90479060000.0
5,WWE,77428470000.0
6,Vlad and Niki,77180170000.0
7,Zee TV,73139050000.0
8,Colors TV,61510910000.0
9,Movieclips,59316470000.0


In [47]:
# 4WHAT CATEGORIES WILL GET THE MOST VIEWS
# Define the SQL query top channels by country
query = """
SELECT rank, youtuber, subscribers, channel_type, category, country, SUM(video_views) AS total_views
FROM my_table
GROUP BY category
ORDER BY video_views DESC
LIMIT 25
"""

# Execute the query and read the results into a DataFrame
df = pd.read_sql(text(query), con=engine)
df.head(25)

Unnamed: 0,rank,youtuber,subscribers,channel_type,category,country,total_views
0,1,T-Series,245000000,Music,Music,India,1937407000000.0
1,4,Cocomelon - Nursery Rhymes,162000000,Education,Education,United States,610715500000.0
2,5,SET India,159000000,Entertainment,Shows,India,437905100000.0
3,9,Like Nastya,106000000,People,People & Blogs,Russia,763444700000.0
4,12,WWE,96000000,Sports,Sports,United States,132080900000.0
5,30,Movieclips,59500000,Film,Film & Animation,United States,389996100000.0
6,3,MrBeast,166000000,Entertainment,Entertainment,United States,1813469000000.0
7,34,Aaj Tak,57600000,News,News & Politics,India,212956500000.0
8,276,That Little Puff,23700000,Animals,Pets & Animals,United States,34902630000.0
9,71,Ishtar Music,41400000,Music,Trailers,India,33262720000.0


In [50]:
# 5 WHICH CHANNELS HAVE THE HIGHEST TOTAL VIDEO VIEWS?
# Define the SQL query top channels by country
query = """
SELECT channel_type, SUM(video_views) AS total_views
FROM my_table
GROUP BY Youtuber
ORDER BY total_views DESC;

"""

# Execute the query and read the results into a DataFrame
df = pd.read_sql(text(query), con=engine)
df.head(25)

Unnamed: 0,channel_type,total_views
0,Music,228000000000.0
1,Education,164000000000.0
2,Entertainment,148000000000.0
3,Entertainment,101000000000.0
4,People,90479060000.0
5,Sports,77428470000.0
6,Entertainment,77180170000.0
7,Entertainment,73139050000.0
8,Entertainment,61510910000.0
9,Film,59316470000.0


In [60]:

# 6WHAT ARE THE TOP 10/25 YOUTUBE CHANNELS BY UPLOAD

query = """
SELECT Youtuber, uploads
FROM my_table
ORDER BY uploads DESC
""";

# Execute the query and read the results into a DataFrame
df = pd.read_sql(text(query), con=engine)
df.head(100)

Unnamed: 0,youtuber,uploads
0,ABP NEWS,301308
1,GMA Integrated News,296272
2,TV9 Bharatvarsh,293516
3,Aaj Tak,283775
4,IndiaTV,273255
...,...,...
95,Codiscos,7356
96,Prime Video India,7090
97,Just For Laughs Gags,6916
98,ýýýýýýTwinsFromRussia,6888


In [39]:
#How do channels in different categories compare in terms of average views and earnings, by country?

query = """ SELECT 
    rank,
    Youtuber,
    lowest_monthly_earnings,
    highest_monthly_earnings,
    lowest_yearly_earnings,
    highest_yearly_earnings
FROM my_table
WHERE rank <= 25
ORDER BY highest_yearly_earnings DESC"""
    
# Execute the query and read the results into a DataFrame
df = pd.read_sql(text(query), con=engine)
df.head(25)How do channels in different categories
compare in terms of average views and
earnings, by country?

Unnamed: 0,rank,youtuber,lowest_monthly_earnings,highest_monthly_earnings,lowest_yearly_earnings,highest_yearly_earnings
0,1,T-Series,564600.0,9000000.0,6800000.0,108400000.0
1,4,Cocomelon - Nursery Rhymes,493800.0,7900000.0,5900000.0,94800000.0
2,5,SET India,455900.0,7300000.0,5500000.0,87500000.0
3,22,Zee TV,426800.0,6800000.0,5100000.0,81900000.0
4,16,Sony SAB,414300.0,6600000.0,5000000.0,79600000.0
5,3,MrBeast,337000.0,5400000.0,4000000.0,64700000.0
6,11,Zee Music Company,200900.0,3200000.0,2400000.0,38600000.0
7,12,WWE,178700.0,2900000.0,2100000.0,34300000.0
8,21,HYBE LABELS,149500.0,2400000.0,1800000.0,28700000.0
9,10,Vlad and Niki,145100.0,2300000.0,1700000.0,27900000.0


In [63]:
# Which category has the most channels in the top 100 by subscribers?

query = """ SELECT category, COUNT(*) AS Channel_Count
FROM my_table
WHERE rank <= 100
GROUP BY category
ORDER BY Channel_Count DESC
LIMIT 10"""
    
# Execute the query and read the results into a DataFrame
df = pd.read_sql(text(query), con=engine)
df.head(25)

Unnamed: 0,category,Channel_Count
0,Music,24
1,Entertainment,15
2,People & Blogs,9
3,Education,7
4,Shows,5
5,Gaming,4
6,Film & Animation,3
7,Trailers,2
8,Sports,2
9,Comedy,2


In [65]:
#How do YouTube channels from different countries compare in terms of average subscribers and views?
query = """ SELECT 
    created_year,
    AVG(subscribers) AS avg_subscriber_count,
    AVG(video_views) AS avg_video_views
FROM my_table
GROUP BY created_year
ORDER BY created_year"""
    
# Execute the query and read the results into a DataFrame
df = pd.read_sql(text(query), con=engine)
df.head(25)

Unnamed: 0,created_year,avg_subscriber_count,avg_video_views
0,1970.0,36300000.0,3010785000.0
1,2005.0,26957140.0,17073010000.0
2,2006.0,37693180.0,25768650000.0
3,2007.0,29971430.0,18626690000.0
4,2008.0,25713790.0,16372270000.0
5,2009.0,26410000.0,13622010000.0
6,2010.0,19900000.0,10949200000.0
7,2011.0,22658330.0,12003170000.0
8,2012.0,26195120.0,11686540000.0
9,2013.0,24572340.0,11198540000.0


In [68]:
#What are the oldest YouTube channels in the top 100 by subscribers? globally?
query = """SELECT rank, Youtuber, created_year, created_month, created_date
FROM my_table
WHERE rank <= 100
ORDER BY created_year, created_month, created_date
LIMIT 100"""
    
# Execute the query and read the results into a DataFrame
df = pd.read_sql(text(query), con=engine)
df.head(25)

Unnamed: 0,rank,youtuber,created_year,created_month,created_date
0,22,Zee TV,2005.0,Dec,11.0
1,70,Rihanna,2005.0,Nov,6.0
2,64,Shakira,2005.0,Oct,16.0
3,71,Ishtar Music,2005.0,Sep,22.0
4,30,Movieclips,2006.0,Apr,28.0
5,40,Ed Sheeran,2006.0,Aug,8.0
6,47,Get Movies,2006.0,Dec,16.0
7,38,YRF,2006.0,Jun,7.0
8,51,A4,2006.0,Mar,11.0
9,1,T-Series,2006.0,Mar,13.0


In [69]:
# Which country has the most YouTube channels in the top 100 globally?

query = """ SELECT 
    Country, 
    COUNT(Youtuber) AS channel_count
FROM 
    my_table
GROUP BY 
    Country"""
    
# Execute the query and read the results into a DataFrame
df = pd.read_sql(text(query), con=engine)
df.head(25)

Unnamed: 0,country,channel_count
0,Argentina,11
1,Australia,2
2,Barbados,1
3,Brazil,33
4,Canada,6
5,Chile,3
6,China,1
7,Colombia,9
8,Cuba,1
9,Ecuador,2


In [70]:
query = """ SELECT Youtuber, AVG(subscribers_for_last_30_days) AS avg_subscribers_per_month
FROM my_table
GROUP BY Youtuber"""

# Execute the query and read the results into a DataFrame
df = pd.read_sql(text(query), con=engine)
df.head(25)

Unnamed: 0,youtuber,avg_subscribers_per_month
0,123 GO! Spanish,100000.0
1,5-Minute Crafts PLAY,100000.0
2,50 Cent,100000.0
3,7clouds,300000.0
4,A2 Motivation by Arvind Arora,200000.0
5,A4,10.0
6,ABC News,100000.0
7,ABP NEWS,400000.0
8,ABS-CBN Entertainment,300000.0
9,ABS-CBN News,100000.0


In [27]:
#Which channels gained the most subscribers?
query = """ SELECT Youtuber, subscribers_for_last_30_days
FROM my_table
ORDER BY subscribers_for_last_30_days DESC"""

# Execute the query and read the results into a DataFrame
df = pd.read_sql(text(query), con=engine)
df.head(25)

Unnamed: 0,youtuber,subscribers_for_last_30_days
0,MrBeast,8000000.0
1,DaFuq!?Boom!,6700000.0
2,Jess No Limit,5500000.0
3,T-Series,2000000.0
4,PANDA BOI,2000000.0
5,Ricis Official,1900000.0
6,TheDonato,1900000.0
7,Topper Guild,1900000.0
8,Infobells - Hindi,1600000.0
9,Ishaan Ali 11,1600000.0


In [28]:
#Which channels had the highest video views?
query = """ SELECT Youtuber, video_views
FROM my_table
ORDER BY video_views DESC """

# Execute the query and read the results into a DataFrame
df = pd.read_sql(text(query), con=engine)
df.head(25)

Unnamed: 0,youtuber,video_views
0,T-Series,228000000000.0
1,Cocomelon - Nursery Rhymes,164000000000.0
2,SET India,148000000000.0
3,Sony SAB,101000000000.0
4,Like Nastya,90479060000.0
5,WWE,77428470000.0
6,Vlad and Niki,77180170000.0
7,Zee TV,73139050000.0
8,Colors TV,61510910000.0
9,Movieclips,59316470000.0


In [30]:
#What are the estimated monthly and yearly earnings for the top 25 YouTube channels?
query = """ SELECT Rank, Youtuber, lowest_monthly_earnings, highest_monthly_earnings, lowest_yearly_earnings, highest_yearly_earnings,
       lowest_monthly_earnings * 12 AS estimated_yearly_low,
       highest_monthly_earnings * 12 AS estimated_yearly_high
FROM my_table
ORDER BY Rank
LIMIT 25"""

# Execute the query and read the results into a DataFrame
df = pd.read_sql(text(query), con=engine)
df.head(25)

Unnamed: 0,rank,youtuber,lowest_monthly_earnings,highest_monthly_earnings,lowest_yearly_earnings,highest_yearly_earnings,estimated_yearly_low,estimated_yearly_high
0,1,T-Series,564600.0,9000000.0,6800000.0,108400000.0,6775200.0,108000000.0
1,3,MrBeast,337000.0,5400000.0,4000000.0,64700000.0,4044000.0,64800000.0
2,4,Cocomelon - Nursery Rhymes,493800.0,7900000.0,5900000.0,94800000.0,5925600.0,94800000.0
3,5,SET India,455900.0,7300000.0,5500000.0,87500000.0,5470800.0,87600000.0
4,9,Like Nastya,12200.0,195800.0,146800.0,2300000.0,146400.0,2349600.0
5,10,Vlad and Niki,145100.0,2300000.0,1700000.0,27900000.0,1741200.0,27600000.0
6,11,Zee Music Company,200900.0,3200000.0,2400000.0,38600000.0,2410800.0,38400000.0
7,12,WWE,178700.0,2900000.0,2100000.0,34300000.0,2144400.0,34800000.0
8,14,BLACKPINK,124700.0,2000000.0,1500000.0,23900000.0,1496400.0,24000000.0
9,16,Sony SAB,414300.0,6600000.0,5000000.0,79600000.0,4971600.0,79200000.0


In [31]:
#Which channels have the highest estimated earnings?
query = """ SELECT Youtuber, 
       ((lowest_monthly_earnings + highest_monthly_earnings) / 2) * 12 AS estimated_yearly_earnings
FROM my_table
ORDER BY estimated_yearly_earnings DESC """

# Execute the query and read the results into a DataFrame
df = pd.read_sql(text(query), con=engine)
df.head(25)

Unnamed: 0,youtuber,estimated_yearly_earnings
0,DaFuq!?Boom!,58656000.0
1,T-Series,57387600.0
2,Cocomelon - Nursery Rhymes,50362800.0
3,SET India,46535400.0
4,Zee TV,43360800.0
5,StarPlus,42700800.0
6,Sony SAB,42085800.0
7,GR6 EXPLODE,41452200.0
8,ViralHog,35045400.0
9,MrBeast,34422000.0


In [71]:
#How do YouTube channels from different countries compare in terms of average subscribers and views?


query = """ 
SELECT Country, 
       AVG(subscribers) AS avg_subscribers,
       AVG(video_views) AS avg_video_views
FROM my_table
GROUP BY Country
ORDER BY avg_subscribers DESC, avg_video_views DESC """

# Execute the query and read the results into a DataFrame
df = pd.read_sql(text(query), con=engine)
df.head(25)

Unnamed: 0,country,avg_subscribers,avg_video_views
0,Cuba,46300000.0,22936630000.0
1,El Salvador,46100000.0,10323390000.0
2,Barbados,41900000.0,22477750000.0
3,South Korea,40575000.0,16169080000.0
4,Pakistan,32475000.0,27162910000.0
5,Venezuela,31200000.0,9673649000.0
6,Canada,31016670.0,19049040000.0
7,Kuwait,30500000.0,4521574000.0
8,Chile,29066670.0,9782519000.0
9,India,27539570.0,15392990000.0


In [33]:
#WHAT ARE THE TOP 10/25 YOUTUBE CHANNELS BY THE NUMBER OF SUBSCRIBERS?

query = """SELECT Rank, Youtuber, subscribers
FROM my_table
ORDER BY subscribers DESC
LIMIT 25 """
# Execute the query and read the results into a DataFrame
df = pd.read_sql(text(query), con=engine)
df.head(25)

Unnamed: 0,rank,youtuber,subscribers
0,1,T-Series,245000000
1,3,MrBeast,166000000
2,4,Cocomelon - Nursery Rhymes,162000000
3,5,SET India,159000000
4,9,Like Nastya,106000000
5,10,Vlad and Niki,98900000
6,11,Zee Music Company,96700000
7,12,WWE,96000000
8,14,BLACKPINK,89800000
9,16,Sony SAB,83000000


In [72]:
query = """ SELECT 
    Latitude, 
    Longitude, 
    COUNT(channel_type) AS channel_count
FROM 
    my_table
GROUP BY 
    Latitude, 
    Longitude
ORDER BY 
    channel_count DESC
    """

# Execute the query and read the results into a DataFrame
df = pd.read_sql(text(query), con=engine)
df.head(25)

Unnamed: 0,latitude,longitude,channel_count
0,37.09024,-95.712891,179
1,20.593684,78.96288,139
2,-14.235004,-51.92528,33
3,55.378051,-3.435973,26
4,-0.789275,113.921327,19
5,23.634501,-102.552784,17
6,15.870032,100.992541,13
7,61.52401,105.318756,13
8,-38.416097,-63.616672,11
9,40.463667,-3.74922,11


In [73]:


query_highest_views = """
SELECT Youtuber, SUM("video views") AS total_views
FROM youtube_data
GROUP BY Youtuber
ORDER BY total_views DESC;
"""
# Execute the query and read the results into a DataFrame
df = pd.read_sql(text(query), con=engine)
df.head(25)


Unnamed: 0,latitude,longitude,channel_count
0,37.09024,-95.712891,179
1,20.593684,78.96288,139
2,-14.235004,-51.92528,33
3,55.378051,-3.435973,26
4,-0.789275,113.921327,19
5,23.634501,-102.552784,17
6,15.870032,100.992541,13
7,61.52401,105.318756,13
8,-38.416097,-63.616672,11
9,40.463667,-3.74922,11


In [74]:
query_content_popularity = """SELECT category, SUM("video views") AS total_views, AVG(subscribers_for_last_30_days) AS avg_engagement
FROM youtube_data
GROUP BY category
ORDER BY total_views DESC;
"""
# Execute the query and read the results into a DataFrame
df = pd.read_sql(text(query), con=engine)
df.head(25)


Unnamed: 0,latitude,longitude,channel_count
0,37.09024,-95.712891,179
1,20.593684,78.96288,139
2,-14.235004,-51.92528,33
3,55.378051,-3.435973,26
4,-0.789275,113.921327,19
5,23.634501,-102.552784,17
6,15.870032,100.992541,13
7,61.52401,105.318756,13
8,-38.416097,-63.616672,11
9,40.463667,-3.74922,11


In [76]:

query_population_correlation = """
SELECT Country, Population, COUNT(Youtuber) AS channel_count
FROM youtube_data
GROUP BY Country;
"""
# Execute the query and read the results into a DataFrame
df = pd.read_sql(text(query), con=engine)
df.head(25)


Unnamed: 0,latitude,longitude,channel_count
0,37.09024,-95.712891,179
1,20.593684,78.96288,139
2,-14.235004,-51.92528,33
3,55.378051,-3.435973,26
4,-0.789275,113.921327,19
5,23.634501,-102.552784,17
6,15.870032,100.992541,13
7,61.52401,105.318756,13
8,-38.416097,-63.616672,11
9,40.463667,-3.74922,11
