In [4]:
# dependencies
import chardet
import pandas as pd
import os 
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# API
import requests
import json

# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, text, inspect, func


In [5]:
# Location of the source CSV, given relative to the notebook's working directory.
# The file must exist at this path before the loading cells run; opening a
# missing path raises FileNotFoundError.
YOUTUBE_PATH = "Resources/global_youtube_stats.csv"

In [6]:
# Read the raw bytes first, then let chardet guess the text encoding so the
# CSV can be decoded correctly by pandas in the next cell.
with open(YOUTUBE_PATH, 'rb') as binary_handle:
    raw_content = binary_handle.read()

# Detection only needs the bytes, so it runs after the file has been closed.
chardet_result = chardet.detect(raw_content)
print(chardet_result)

FileNotFoundError: [Errno 2] No such file or directory: 'Resources/global_youtube_stats.csv'

In [None]:
# Load the CSV with pandas, decoding it with the encoding chardet reported,
# and preview the first rows.
detected_encoding = chardet_result['encoding']
df = pd.read_csv(YOUTUBE_PATH, encoding=detected_encoding)
df.head()

In [None]:
# Summarize column dtypes and non-null counts of the freshly loaded data
df.info()

In [None]:
# Drop every row that has a missing value in any column.
# Reassigning (rather than inplace=True) is the idiomatic pandas form and
# keeps the cell a plain "new value from old value" step.
# NOTE(review): this discards a row if *any* column is missing - confirm that
# the amount of data lost is acceptable for the analysis.
df = df.dropna()

In [None]:
# Re-check dtypes and non-null counts after the rows with missing values were dropped
df.info()

In [None]:
# pd.set_option('display.max_rows', None)  # Set the option to display all rows
#print(df)

In [None]:
# Mapping from the original CSV headers to the lowercase names used in the
# database table. Columns not listed here keep their original name.
COLUMN_RENAMES = {
    'Gross tertiary education enrollment (%)': 'gross education enrollment percentage',
    'Youtuber': 'youtuber',
    'Title': 'title',
    'Abbreviation': 'abbreviation',
    'Country': 'country',
    'Population': 'population',
    'Unemployment rate': 'unemployment rate',
    'Urban_population': 'urban_population',
    'Latitude': 'latitude',
    'Longitude': 'longitude',
    'video views': 'video_views',
}

# rename() returns a new frame, so `df` itself is left untouched
cleaned_df = df.rename(columns=COLUMN_RENAMES)

# Preview the frame with its renamed columns
cleaned_df.head()

In [None]:
# URI of the SQLite database file that stores the cleaned data
db_uri = 'sqlite:///data.db'

# Create an engine using the specified URI
engine = create_engine(db_uri)

# Write the cleaned DataFrame to a table named 'my_table'. if_exists='replace'
# drops any table left by a previous run, so this cell can be re-run safely.
cleaned_df.to_sql('my_table', engine, if_exists='replace', index=False)

# Read the table back and actually verify the round trip kept every row
df_from_db = pd.read_sql('my_table', engine)
assert len(df_from_db) == len(cleaned_df), "row count changed during the SQLite round trip"

# Show a preview instead of printing the entire table
df_from_db.head()

In [None]:
# Confirm that the table exists in the database.
# Create the inspector and connect it to the engine
inspector = inspect(engine)

# Collect the names of tables within the database
tables = inspector.get_table_names()

# Fail loudly here rather than in a later query if the write did not happen
assert 'my_table' in tables, "my_table was not found in the database"

print(tables)

In [None]:
# 1a. Top 25 channels by subscriber count
query = """
SELECT Rank, Youtuber, Subscribers, country, channel_type
FROM my_table
ORDER BY Subscribers DESC
LIMIT 25
"""

# Store the result under its own name: rebinding `df` here would replace the
# cleaned source frame, and re-running the earlier rename / to_sql cells would
# then overwrite my_table with this 25-row result.
top_subscribers_df = pd.read_sql(text(query), con=engine)

# LIMIT 25 already bounds the result, so display it as-is
top_subscribers_df

In [None]:
# 3. Top 25 channels by video views
# The ranking must sort on video_views; sorting on Subscribers would simply
# repeat the subscriber ranking.
query = """
SELECT Rank, Youtuber, subscribers, video_views, country
FROM my_table
ORDER BY video_views DESC
LIMIT 25
"""

# Store the result under its own name: rebinding `df` here would replace the
# cleaned source frame, and re-running the earlier rename / to_sql cells would
# then overwrite my_table with this 25-row result.
top_video_views_df = pd.read_sql(text(query), con=engine)

# LIMIT 25 already bounds the result, so display it as-is
top_video_views_df

In [None]:
# 5. Channels with the highest video views (top 25, without subscriber counts)
# The ranking must sort on video_views to match what this cell reports.
query = """
SELECT Rank, Youtuber, video_views, country
FROM my_table
ORDER BY video_views DESC
LIMIT 25
"""

# Store the result under its own name: rebinding `df` here would replace the
# cleaned source frame, and re-running the earlier rename / to_sql cells would
# then overwrite my_table with this 25-row result.
high_video_views_df = pd.read_sql(text(query), con=engine)

# LIMIT 25 already bounds the result, so display it as-is
high_video_views_df

In [None]:
# Top 50 channels by highest yearly earnings.
# - country is part of GROUP BY so every selected non-aggregate column is
#   determined by the group instead of being taken from an arbitrary row.
# - Sorting on earnings first makes LIMIT 50 return the top earners; the
#   youtuber name only breaks ties.
# NOTE(review): the previous version sorted by youtuber name first, which made
# LIMIT 50 return the alphabetically last channels - confirm that a ranking by
# earnings is the intended result.
query = """
SELECT youtuber, subscribers, category, country, MAX(highest_yearly_earnings) AS highest_yearly_earnings
FROM my_table
GROUP BY youtuber, subscribers, category, country
ORDER BY highest_yearly_earnings DESC, youtuber
LIMIT 50;
"""

# Store the result under its own name: rebinding `df` here would replace the
# cleaned source frame, and re-running the earlier rename / to_sql cells would
# then overwrite my_table with this query result.
highest_earnings_df = pd.read_sql(text(query), con=engine)

# LIMIT 50 already bounds the result, so display it as-is
highest_earnings_df

In [None]:
# Total video views per channel type.
# Only the grouping key and the aggregate are selected: per-channel columns
# (youtuber, subscribers, category, country) would each come from one arbitrary
# row of the group and would not describe the channel type as a whole.
query = """
SELECT channel_type, SUM(video_views) AS total_views
FROM my_table
GROUP BY channel_type
ORDER BY total_views DESC
LIMIT 25;
"""

# Store the result under its own name: rebinding `df` here would replace the
# cleaned source frame, and re-running the earlier rename / to_sql cells would
# then overwrite my_table with this query result.
views_by_channel_type_df = pd.read_sql(text(query), con=engine)

# LIMIT 25 already bounds the result, so display it as-is
views_by_channel_type_df

In [None]:
# Top 25 channels by number of uploads.
# Each row of my_table is read straight from the table, so no aggregation is
# applied: grouping by the uploads value itself would merge unrelated channels
# that happen to share an upload count and report details from only one of them.
# NOTE(review): assumes one row per channel in my_table - confirm against the
# source data.
query = """
SELECT youtuber, subscribers, category, channel_type, country, uploads
FROM my_table
ORDER BY uploads DESC
LIMIT 25;
"""

# Store the result under its own name: rebinding `df` here would replace the
# cleaned source frame, and re-running the earlier rename / to_sql cells would
# then overwrite my_table with this query result.
top_uploads_df = pd.read_sql(text(query), con=engine)

# LIMIT 25 already bounds the result, so display it as-is
top_uploads_df

In [None]:
# Total video views per youtuber, top 25
query = """ 
SELECT Youtuber, SUM(video_views) AS total_video_views
FROM my_table
GROUP BY Youtuber
ORDER BY total_video_views DESC
LIMIT 25;
"""

# Store the result under its own name: rebinding `df` here would replace the
# cleaned source frame, and re-running the earlier rename / to_sql cells would
# then overwrite my_table with this query result.
total_views_per_youtuber_df = pd.read_sql(text(query), con=engine)

# LIMIT 25 already bounds the result, so display it as-is
total_views_per_youtuber_df

In [None]:
# Dispose of the engine now that all queries have run, releasing its database connections
engine.dispose()