In [1]:
# dependencies
import chardet
import pandas as pd
import os 
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# API
import requests
import json

# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, text, inspect, func


In [2]:
# Load in file
# Store filepath in a variable
YOUTUBE_PATH = "Resources/global_youtube_stats.csv"

In [3]:
with open(YOUTUBE_PATH, 'rb') as csv_file:
    raw_content = csv_file.read()
    chardet_result = chardet.detect(raw_content)
print(chardet_result)

{'encoding': 'ISO-8859-1', 'confidence': 0.7289153199558486, 'language': ''}


In [4]:
# Read and display the CSV with Pandas
df = pd.read_csv(YOUTUBE_PATH, encoding=chardet_result['encoding'])
df.head()

Unnamed: 0,rank,Youtuber,subscribers,video views,category,Title,uploads,Country,Abbreviation,channel_type,...,subscribers_for_last_30_days,created_year,created_month,created_date,Gross tertiary education enrollment (%),Population,Unemployment rate,Urban_population,Latitude,Longitude
0,1,T-Series,245000000,228000000000.0,Music,T-Series,20082,India,IN,Music,...,2000000.0,2006.0,Mar,13.0,28.1,1366418000.0,5.36,471031528.0,20.593684,78.96288
1,2,YouTube Movies,170000000,0.0,Film & Animation,youtubemovies,1,United States,US,Games,...,,2006.0,Mar,5.0,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891
2,3,MrBeast,166000000,28368840000.0,Entertainment,MrBeast,741,United States,US,Entertainment,...,8000000.0,2012.0,Feb,20.0,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891
3,4,Cocomelon - Nursery Rhymes,162000000,164000000000.0,Education,Cocomelon - Nursery Rhymes,966,United States,US,Education,...,1000000.0,2006.0,Sep,1.0,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891
4,5,SET India,159000000,148000000000.0,Shows,SET India,116536,India,IN,Entertainment,...,1000000.0,2006.0,Sep,20.0,28.1,1366418000.0,5.36,471031528.0,20.593684,78.96288


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 995 entries, 0 to 994
Data columns (total 28 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   rank                                     995 non-null    int64  
 1   Youtuber                                 995 non-null    object 
 2   subscribers                              995 non-null    int64  
 3   video views                              995 non-null    float64
 4   category                                 949 non-null    object 
 5   Title                                    995 non-null    object 
 6   uploads                                  995 non-null    int64  
 7   Country                                  873 non-null    object 
 8   Abbreviation                             873 non-null    object 
 9   channel_type                             965 non-null    object 
 10  video_views_rank                         994 non-n

In [6]:
df.dropna(inplace=True)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 554 entries, 0 to 994
Data columns (total 28 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   rank                                     554 non-null    int64  
 1   Youtuber                                 554 non-null    object 
 2   subscribers                              554 non-null    int64  
 3   video views                              554 non-null    float64
 4   category                                 554 non-null    object 
 5   Title                                    554 non-null    object 
 6   uploads                                  554 non-null    int64  
 7   Country                                  554 non-null    object 
 8   Abbreviation                             554 non-null    object 
 9   channel_type                             554 non-null    object 
 10  video_views_rank                         554 non-null  

In [8]:
# pd.set_option('display.max_rows', None)  # Set the option to display all rows
#print(df)

In [9]:
# Assuming your DataFrame is named df
# Rename specific columns
cleaned_df = df.rename(columns={
    'Gross tertiary education enrollment (%)': 'gross education enrollment percentage',
    'Youtuber': 'youtuber',
    'Title': 'title',
    'Abbreviation': 'abbreviation',
    'Country' : 'country',
    'Population': 'population',
    'Unemployment rate': 'unemployment rate',
    'Urban_population' : 'urban_population',
    'Latitude' : 'latitude',
    'Longitude' : 'longitude',
    'video views' : 'video_views'
    })

# Display the DataFrame with renamed columns
cleaned_df.head()

Unnamed: 0,rank,youtuber,subscribers,video_views,category,title,uploads,country,abbreviation,channel_type,...,subscribers_for_last_30_days,created_year,created_month,created_date,gross education enrollment percentage,population,unemployment rate,urban_population,latitude,longitude
0,1,T-Series,245000000,228000000000.0,Music,T-Series,20082,India,IN,Music,...,2000000.0,2006.0,Mar,13.0,28.1,1366418000.0,5.36,471031528.0,20.593684,78.96288
2,3,MrBeast,166000000,28368840000.0,Entertainment,MrBeast,741,United States,US,Entertainment,...,8000000.0,2012.0,Feb,20.0,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891
3,4,Cocomelon - Nursery Rhymes,162000000,164000000000.0,Education,Cocomelon - Nursery Rhymes,966,United States,US,Education,...,1000000.0,2006.0,Sep,1.0,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891
4,5,SET India,159000000,148000000000.0,Shows,SET India,116536,India,IN,Entertainment,...,1000000.0,2006.0,Sep,20.0,28.1,1366418000.0,5.36,471031528.0,20.593684,78.96288
8,9,Like Nastya,106000000,90479060000.0,People & Blogs,Like Nastya Vlog,493,Russia,RU,People,...,100000.0,2016.0,Jan,14.0,81.9,144373500.0,4.59,107683889.0,61.52401,105.318756


In [10]:
# Specify the URI for the cleaned database
db_uri = 'sqlite:///data.db'

# Create an engine using the specified URI
engine = create_engine(db_uri)

# Write the cleaned DataFrame to a SQLite database table named 'my_table'
cleaned_df.to_sql('my_table', engine, if_exists='replace', index=False)

# Confirm that the data has been successfully written to the database
df_from_db = pd.read_sql('my_table', engine)
print(df_from_db)

     rank                    youtuber  subscribers   video_views  \
0       1                    T-Series    245000000  2.280000e+11   
1       3                     MrBeast    166000000  2.836884e+10   
2       4  Cocomelon - Nursery Rhymes    162000000  1.640000e+11   
3       5                   SET India    159000000  1.480000e+11   
4       9                 Like Nastya    106000000  9.047906e+10   
..    ...                         ...          ...           ...   
549   990                   Migos ATL     12400000  6.993406e+09   
550   991               Natan por Aï¿     12300000  9.029610e+09   
551   992    Free Fire India Official     12300000  1.674410e+09   
552   994                 RobTopGames     12300000  3.741235e+08   
553   995                Make Joke Of     12300000  2.129774e+09   

           category                       title  uploads        country  \
0             Music                    T-Series    20082          India   
1     Entertainment              

In [11]:
# INSPECT to confirm existence
# Create the inspector and connect it to the engine
inspector = inspect(engine)

# Collect the names of tables within the database
tables = inspector.get_table_names()

print(tables)

['my_table']


In [12]:
# Define the SQL query
query = """
SELECT Rank, Youtuber, Subscribers, video_views, country
FROM my_table
ORDER BY Subscribers DESC
LIMIT 25
"""

# Execute the query and read the results into a DataFrame
df = pd.read_sql(text(query), con=engine)
df.head(25)

Unnamed: 0,rank,youtuber,subscribers,video_views,country
0,1,T-Series,245000000,228000000000.0,India
1,3,MrBeast,166000000,28368840000.0,United States
2,4,Cocomelon - Nursery Rhymes,162000000,164000000000.0,United States
3,5,SET India,159000000,148000000000.0,India
4,9,Like Nastya,106000000,90479060000.0,Russia
5,10,Vlad and Niki,98900000,77180170000.0,United States
6,11,Zee Music Company,96700000,57856290000.0,India
7,12,WWE,96000000,77428470000.0,United States
8,14,BLACKPINK,89800000,32144600000.0,South Korea
9,16,Sony SAB,83000000,101000000000.0,India
