In [12]:
import pandas as pd

data_path = 'Resources/global_youtube_statistics.csv'

# Attempt to read the file with different encodings
encodings = ['utf-8', 'latin1', 'ISO-8859-1', 'cp1252']
data = None

for encoding in encodings:
    try:
        data = pd.read_csv(data_path, encoding=encoding)
        print(f"Successfully read with encoding: {encoding}")
        break
    except UnicodeDecodeError:
        print(f"Failed to read with encoding: {encoding}")

if data is not None:
    # Display the first few rows of the dataframe
    print(data.head())
else:
    print("Failed to read the file with all attempted encodings.")


Failed to read with encoding: utf-8
Successfully read with encoding: latin1
   rank                    Youtuber  subscribers   video views  \
0     1                    T-Series    245000000  2.280000e+11   
1     2              YouTube Movies    170000000  0.000000e+00   
2     3                     MrBeast    166000000  2.836884e+10   
3     4  Cocomelon - Nursery Rhymes    162000000  1.640000e+11   
4     5                   SET India    159000000  1.480000e+11   

           category                       Title  uploads        Country  \
0             Music                    T-Series    20082          India   
1  Film & Animation               youtubemovies        1  United States   
2     Entertainment                     MrBeast      741  United States   
3         Education  Cocomelon - Nursery Rhymes      966  United States   
4             Shows                   SET India   116536          India   

  Abbreviation   channel_type  ...  subscribers_for_last_30_days  \
0       

In [13]:
# Create a new connection
conn = sqlite3.connect('youtube_data.db')

# Query to check the table schema
query_schema = "PRAGMA table_info(youtube_data);"
schema = pd.read_sql_query(query_schema, conn)

# Display the table schema
print(schema)



    cid                                     name     type  notnull dflt_value  \
0     0                                     rank  INTEGER        0       None   
1     1                                 Youtuber     TEXT        0       None   
2     2                              subscribers  INTEGER        0       None   
3     3                              video views     REAL        0       None   
4     4                                 category     TEXT        0       None   
5     5                                    Title     TEXT        0       None   
6     6                                  uploads  INTEGER        0       None   
7     7                                  Country     TEXT        0       None   
8     8                             Abbreviation     TEXT        0       None   
9     9                             channel_type     TEXT        0       None   
10   10                         video_views_rank     REAL        0       None   
11   11                     

In [14]:
query_categories_views = """
SELECT category, SUM("video views") AS total_views
FROM youtube_data
GROUP BY category
ORDER BY total_views DESC;
"""
categories_views = pd.read_sql_query(query_categories_views, conn)
print(categories_views)


                 category   total_views
0                   Music  3.121478e+12
1           Entertainment  2.527739e+12
2          People & Blogs  1.265791e+12
3                  Gaming  7.176389e+11
4               Education  6.966145e+11
5                  Comedy  5.501128e+11
6        Film & Animation  5.444225e+11
7                   Shows  4.462068e+11
8                    None  3.046434e+11
9         News & Politics  2.703919e+11
10          Howto & Style  2.018175e+11
11                 Sports  1.479863e+11
12   Science & Technology  6.697675e+10
13         Pets & Animals  4.486675e+10
14               Trailers  3.326272e+10
15                 Movies  1.538326e+10
16       Autos & Vehicles  1.500346e+10
17  Nonprofits & Activism  1.086291e+10
18        Travel & Events  3.140883e+09


In [15]:
query_highest_views = """
SELECT Youtuber, SUM("video views") AS total_views
FROM youtube_data
GROUP BY Youtuber
ORDER BY total_views DESC;
"""
highest_views = pd.read_sql_query(query_highest_views, conn)
print(highest_views)


                       Youtuber   total_views
0                      T-Series  2.280000e+11
1    Cocomelon - Nursery Rhymes  1.640000e+11
2                     SET India  1.480000e+11
3                      Sony SAB  1.010000e+11
4           ýýý Kids Diana Show  9.324704e+10
..                          ...           ...
990                        News  0.000000e+00
991                       Music  0.000000e+00
992           Minecraft - Topic  0.000000e+00
993                        Live  0.000000e+00
994                      Gaming  0.000000e+00

[995 rows x 2 columns]


In [16]:
query_top_uploads = """
SELECT Youtuber, uploads
FROM youtube_data
ORDER BY uploads DESC
LIMIT 25;
"""
top_uploads = pd.read_sql_query(query_top_uploads, conn)
print(top_uploads)


                                           Youtuber  uploads
0                                          ABP NEWS   301308
1                               GMA Integrated News   296272
2                                   TV9 Bharatvarsh   293516
3                                           Aaj Tak   283775
4                                           IndiaTV   273255
5                                          KOMPASTV   269050
6                                   Thairath Online   244899
7                                           News 24   211620
8                                      ABS-CBN News   209520
9                                        TEDx Talks   200933
10                            ABS-CBN Entertainment   193890
11                                         Geo News   190093
12                                     News18 India   182742
13                                         Zee News   180092
14                         AlArabiya ï¿½ï¿½ï¿½ï¿½ï¿   169304
15                      

In [17]:
query_country_distribution = """
SELECT Country, COUNT(Youtuber) AS channel_count, SUM("video views") AS total_views
FROM youtube_data
GROUP BY Country
ORDER BY total_views DESC;
"""
country_distribution = pd.read_sql_query(query_country_distribution, conn)
print(country_distribution)


                 Country  channel_count   total_views
0          United States            313  3.690292e+12
1                  India            168  2.281048e+12
2                   None            122  1.185152e+12
3                 Brazil             62  4.812090e+11
4         United Kingdom             43  4.305665e+11
5               Thailand             18  2.643180e+11
6                 Russia             16  2.329963e+11
7            South Korea             17  2.255917e+11
8                 Mexico             33  1.982199e+11
9              Argentina             13  1.944154e+11
10                Canada             15  1.913206e+11
11              Colombia             11  1.541978e+11
12             Indonesia             28  1.510107e+11
13                 Spain             22  1.325207e+11
14           Philippines             12  1.291280e+11
15              Pakistan              6  1.132934e+11
16                Turkey              4  9.239924e+10
17                 Japan    

In [18]:
query_gained_subscribers = """
SELECT Youtuber, subscribers_for_last_30_days
FROM youtube_data
ORDER BY subscribers_for_last_30_days DESC;
"""
gained_subscribers = pd.read_sql_query(query_gained_subscribers, conn)
print(gained_subscribers)


                       Youtuber  subscribers_for_last_30_days
0                       MrBeast                     8000000.0
1                  DaFuq!?Boom!                     6700000.0
2                 Jess No Limit                     5500000.0
3    ZAMZAM ELECTRONICS TRADING                     3400000.0
4                   BeatboxJCOP                     3400000.0
..                          ...                           ...
990            Pari's Lifestyle                           NaN
991             DisneyChannelUK                           NaN
992                        TKOR                           NaN
993                   ANNA KOVA                           NaN
994               Avril Lavigne                           NaN

[995 rows x 2 columns]


In [19]:
query_content_popularity = """
SELECT category, SUM("video views") AS total_views, AVG(subscribers_for_last_30_days) AS avg_engagement
FROM youtube_data
GROUP BY category
ORDER BY total_views DESC;
"""
content_popularity = pd.read_sql_query(query_content_popularity, conn)
print(content_popularity)


                 category   total_views  avg_engagement
0                   Music  3.121478e+12   200919.206612
1           Entertainment  2.527739e+12   415748.503226
2          People & Blogs  1.265791e+12   386439.852273
3                  Gaming  7.176389e+11   218644.728814
4               Education  6.966145e+11   308823.529412
5                  Comedy  5.501128e+11   450005.818182
6        Film & Animation  5.444225e+11   520084.714286
7                   Shows  4.462068e+11   500000.153846
8                    None  3.046434e+11   580051.657143
9         News & Politics  2.703919e+11   268095.238095
10          Howto & Style  2.018175e+11   200000.176471
11                 Sports  1.479863e+11   288888.888889
12   Science & Technology  6.697675e+10   206669.466667
13         Pets & Animals  4.486675e+10   325001.250000
14               Trailers  3.326272e+10   300000.000000
15                 Movies  1.538326e+10   400000.000000
16       Autos & Vehicles  1.500346e+10   400000

In [20]:
query_earnings = """
SELECT Youtuber, highest_monthly_earnings, highest_yearly_earnings
FROM youtube_data
ORDER BY highest_monthly_earnings DESC
LIMIT 25;
"""
earnings = pd.read_sql_query(query_earnings, conn)
print(earnings)


                      Youtuber  highest_monthly_earnings  \
0             ýýýýýýýýý KIMPRO                13600000.0   
1                 DaFuq!?Boom!                 9200000.0   
2                     T-Series                 9000000.0   
3          KL BRO Biju Rithvik                 8100000.0   
4   Cocomelon - Nursery Rhymes                 7900000.0   
5                    SET India                 7300000.0   
6                       Zee TV                 6800000.0   
7                     StarPlus                 6700000.0   
8                     Sony SAB                 6600000.0   
9                  GR6 EXPLODE                 6500000.0   
10  ZAMZAM ELECTRONICS TRADING                 6400000.0   
11               BETER Bï¿½ï¿½                 5900000.0   
12                 BeatboxJCOP                 5500000.0   
13                    ViralHog                 5500000.0   
14                     MrBeast                 5400000.0   
15                 HAR PAL GEO          

In [21]:
query_country_comparison = """
SELECT Country, AVG(subscribers) AS avg_subscribers, AVG("video views") AS avg_views
FROM youtube_data
GROUP BY Country;
"""
country_comparison = pd.read_sql_query(query_country_comparison, conn)
print(country_comparison)


                 Country  avg_subscribers     avg_views
0                   None     2.395246e+07  9.714364e+09
1            Afghanistan     2.040000e+07  1.339700e+10
2                Andorra     1.510000e+07  2.400038e+09
3              Argentina     2.526923e+07  1.495503e+10
4              Australia     1.911111e+07  7.682424e+09
5             Bangladesh     1.390000e+07  1.212958e+10
6               Barbados     4.190000e+07  2.247775e+10
7                 Brazil     1.970645e+07  7.761435e+09
8                 Canada     2.426000e+07  1.275470e+10
9                  Chile     2.906667e+07  9.782519e+09
10                 China     1.760000e+07  2.977742e+09
11              Colombia     2.404545e+07  1.401798e+10
12                  Cuba     4.630000e+07  2.293663e+10
13               Ecuador     1.385000e+07  2.746231e+09
14                 Egypt     1.530000e+07  4.912370e+09
15           El Salvador     4.610000e+07  1.032339e+10
16               Finland     1.320000e+07  2.036

In [22]:
query_population_correlation = """
SELECT Country, Population, COUNT(Youtuber) AS channel_count
FROM youtube_data
GROUP BY Country;
"""
population_correlation = pd.read_sql_query(query_population_correlation, conn)
print(population_correlation)


                 Country    Population  channel_count
0                   None           NaN            122
1            Afghanistan  3.804175e+07              1
2                Andorra           NaN              1
3              Argentina  4.493871e+07             13
4              Australia  2.576660e+07              9
5             Bangladesh  1.673108e+08              1
6               Barbados  2.870250e+05              1
7                 Brazil  2.125594e+08             62
8                 Canada  3.699198e+07             15
9                  Chile  1.895204e+07              3
10                 China  1.397715e+09              1
11              Colombia  5.033944e+07             11
12                  Cuba  1.133348e+07              1
13               Ecuador  1.737366e+07              2
14                 Egypt  1.003881e+08              2
15           El Salvador  6.453553e+06              1
16               Finland  5.520314e+06              1
17                France  6.

In [23]:
query_education_correlation = """
SELECT Country, Grossyeducation_enrollment, SUM("video views") AS total_views
FROM youtube_data
GROUP BY Country;
"""
education_correlation = pd.read_sql_query(query_education_correlation, conn)
print(education_correlation)


DatabaseError: Execution failed on sql '
SELECT Country, Gross_tertiary_education_enrollment, SUM("video views") AS total_views
FROM youtube_data
GROUP BY Country;
': no such column: Gross_tertiary_education_enrollment

In [24]:
query_unemployment_impact = """
SELECT Country, Unemployment_rate, COUNT(Youtuber) AS channel_count, SUM("video views") AS total_views
FROM youtube_data
GROUP BY Country;
"""
unemployment_impact = pd.read_sql_query(query_unemployment_impact, conn)
print(unemployment_impact)


DatabaseError: Execution failed on sql '
SELECT Country, Unemployment_rate, COUNT(Youtuber) AS channel_count, SUM("video views") AS total_views
FROM youtube_data
GROUP BY Country;
': no such column: Unemployment_rate

In [26]:
query_upload_frequency = """
SELECT Youtuber, AVG(uploads) AS avg_uploads, AVG(subscribers_for_last_30_days) AS avg_subscriber_growth
FROM youtube_data
GROUP BY Youtuber;
"""
upload_frequency = pd.read_sql_query(query_upload_frequency, conn)
print(upload_frequency)


                                      Youtuber  avg_uploads  \
0                              #Refugio Mental        846.0   
1                             #Refï¿½ï¿½ï¿½ï¿½        846.0   
2                              123 GO! Spanish        520.0   
3                        1MILLION Dance Studio          0.0   
4                         1theK (ï¿½ï¿½ï¿½ï¿½ï      18950.0   
..                                         ...          ...   
990                ýýýýýýýýýýýýýýýý - Al-Remas       2133.0   
991                         ýýýýýýýýýýýýýýýýýý        608.0   
992  ýýýýýýýýýýýýýýýýýýýý ýýýýýýýýýýýýýýýýýýýý         65.0   
993                      ýýýýýýýýýýýýýýýýýýýýý        364.0   
994                     ýýýýýýýýýýýýýýýýýýýýýý        530.0   

     avg_subscriber_growth  
0                      NaN  
1                      NaN  
2                 100000.0  
3                      NaN  
4                      NaN  
..                     ...  
990                    NaN  
991        

In [27]:
query_category_top_100 = """
SELECT category, COUNT(*) AS channel_count
FROM youtube_data
ORDER BY subscribers DESC
LIMIT 100;
"""
category_top_100 = pd.read_sql_query(query_category_top_100, conn)
print(category_top_100)


  category  channel_count
0    Music            995


In [28]:
query_category_comparison = """
SELECT category, AVG("video views") AS avg_views, AVG(highest_monthly_earnings) AS avg_monthly_earnings
FROM youtube_data
GROUP BY category;
"""
category_comparison = pd.read_sql_query(query_category_comparison, conn)
print(category_comparison)


                 category     avg_views  avg_monthly_earnings
0                    None  6.622682e+09          9.844537e+05
1        Autos & Vehicles  7.501729e+09          1.086350e+06
2                  Comedy  7.972649e+09          6.683122e+05
3               Education  1.548032e+10          6.951778e+05
4           Entertainment  1.048854e+10          6.226426e+05
5        Film & Animation  1.183527e+10          7.327018e+05
6                  Gaming  7.634456e+09          2.719054e+05
7           Howto & Style  5.045439e+09          1.927991e+05
8                  Movies  7.691628e+09          4.547000e+05
9                   Music  1.545286e+10          5.522996e+05
10        News & Politics  1.039969e+10          6.426320e+05
11  Nonprofits & Activism  5.431456e+09          3.904000e+05
12         People & Blogs  9.589327e+09          5.468421e+05
13         Pets & Animals  1.121669e+10          7.944322e+05
14   Science & Technology  3.939809e+09          2.020432e+05
15      

In [29]:
query_geographic_distribution = """
SELECT Latitude, Longitude, COUNT(Youtuber) AS channel_count
FROM youtube_data
GROUP BY Latitude, Longitude;
"""
geographic_distribution = pd.read_sql_query(query_geographic_distribution, conn)
print(geographic_distribution)


     Latitude   Longitude  channel_count
0         NaN         NaN            123
1  -38.416097  -63.616672             13
2  -35.675147  -71.542969              3
3  -25.274398  133.775136              9
4  -14.235004  -51.925280             62
5  -13.759029 -172.104629              1
6   -9.189967  -75.015152              1
7   -1.831239  -78.183406              2
8   -0.789275  113.921327             28
9    1.352083  103.819836              3
10   4.210484  101.975766              1
11   4.570868  -74.297333             11
12   6.423750  -66.589730              1
13  12.879721  121.774017             12
14  13.193887  -59.543198              1
15  13.794185  -88.896530              1
16  14.058324  108.277199              3
17  15.870032  100.992541             18
18  20.593684   78.962880            168
19  21.521757  -77.781167              1
20  23.424076   53.847818              7
21  23.634501 -102.552784             33
22  23.684994   90.356331              1
23  23.885942   

In [30]:
query_urban_population_impact = """
SELECT Country, Urban_population, COUNT(Youtuber) AS channel_count
FROM youtube_data
GROUP BY Country;
"""
urban_population_impact = pd.read_sql_query(query_urban_population_impact, conn)
print(urban_population_impact)


                 Country  Urban_population  channel_count
0                   None               NaN            122
1            Afghanistan         9797273.0              1
2                Andorra               NaN              1
3              Argentina        41339571.0             13
4              Australia        21844756.0              9
5             Bangladesh        60987417.0              1
6               Barbados           89431.0              1
7                 Brazil       183241641.0             62
8                 Canada        30628482.0             15
9                  Chile        16610135.0              3
10                 China       842933962.0              1
11              Colombia        40827302.0             11
12                  Cuba         8739135.0              1
13               Ecuador        11116711.0              2
14                 Egypt        42895824.0              2
15           El Salvador         4694702.0              1
16            

In [31]:
query_oldest_channels = """
SELECT Youtuber, created_year
FROM youtube_data
ORDER BY created_year
LIMIT 100;
"""
oldest_channels = pd.read_sql_query(query_oldest_channels, conn)
print(oldest_channels)


                 Youtuber  created_year
0             Chris Brown           NaN
1   Good Mythical Morning           NaN
2      The Game Theorists           NaN
3            LEGENDA FUNK           NaN
4            Harry Styles           NaN
..                    ...           ...
95                    ERB        2006.0
96        Fueled By Ramen        2006.0
97            CrashCourse        2006.0
98         Chad Wild Clay        2006.0
99           CollegeHumor        2006.0

[100 rows x 2 columns]


In [32]:
query_creation_year_relation = """
SELECT created_year, AVG(subscribers) AS avg_subscribers, AVG("video views") AS avg_views
FROM youtube_data
GROUP BY created_year;
"""
creation_year_relation = pd.read_sql_query(query_creation_year_relation, conn)
print(creation_year_relation)


    created_year  avg_subscribers     avg_views
0            NaN     1.800000e+07  7.240181e+09
1         1970.0     3.630000e+07  3.010785e+09
2         2005.0     2.582917e+07  1.509300e+10
3         2006.0     3.188462e+07  1.705780e+10
4         2007.0     2.490204e+07  1.465191e+10
5         2008.0     2.365217e+07  1.365319e+10
6         2009.0     2.310577e+07  1.225264e+10
7         2010.0     2.067083e+07  9.605543e+09
8         2011.0     2.180000e+07  1.038110e+10
9         2012.0     2.403824e+07  1.071023e+10
10        2013.0     2.452763e+07  9.020372e+09
11        2014.0     2.061837e+07  9.915149e+09
12        2015.0     2.124795e+07  9.231639e+09
13        2016.0     2.310130e+07  9.996402e+09
14        2017.0     1.866618e+07  6.078133e+09
15        2018.0     2.306087e+07  1.199293e+10
16        2019.0     1.863939e+07  8.710054e+09
17        2020.0     2.135000e+07  1.257974e+10
18        2021.0     1.857391e+07  1.109215e+10
19        2022.0     1.364000e+07  4.220

In [33]:
query_country_top_100 = """
SELECT Country, COUNT(Youtuber) AS channel_count
FROM youtube_data
ORDER BY subscribers DESC
LIMIT 100;
"""
country_top_100 = pd.read_sql_query(query_country_top_100, conn)
print(country_top_100)


  Country  channel_count
0   India            995
