# YouTube & Spotify Top Music Artists From 2018
---

### SCOPE:
#### - Extracted, transformed, and loaded YouTube's Top Trending Videos from December 2017 through May 2018, keeping only the videos categorized as music, and created an "Artist" column to enable joining with Spotify's Top 100 Songs of 2018. Both dataframes were loaded into MySQL.


### PURPOSE:
#### - I chose this project because I'm an avid music listener and a frequent concert-goer, and I wanted to work with data that I was familiar with.

### Data Sources:
#### - https://www.kaggle.com/datasnaek/youtube-new (this link points to an updated version of the dataset; I used an older version of the file, which is included in the resources folder)
#### - https://www.kaggle.com/nadintamer/top-spotify-tracks-of-2018
 


In [1]:
# Import Dependencies 1/2:
import csv
import json
import os
import string
import sys
from datetime import datetime
from urllib.parse import quote

import numpy as np
import pandas as pd
import simplejson

## SQLAlchemy setup: build a function that searches for, creates, and drops a database, in preparation for the loading phase that follows extraction and transformation of the data

In [2]:
# Import Dependencies 2/2:
from sqlalchemy import create_engine, Column, Integer, String, join, Date
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy_utils import database_exists, create_database, drop_database, has_index
import pymysql
Base = declarative_base()

In [3]:
#rds_connection_string = "<inser user name>:<insert password>@127.0.0.1/customer_db"
rds_connection_string = "root:gREATNESS23$@127.0.0.1/" #youtube_spotify_2018_db"

# Can set up an input for the db_name later (optional)
db_name = 'youtube_spotify_2018_db2'

# Setup engine connection string
engine = create_engine(f'mysql://{rds_connection_string}{db_name}')

In [4]:
# 

#db_exist = database_exists(f'mysql://{rds_connection_string}youtube_spotify_2018_db2')


db_url = f'mysql://{rds_connection_string}{db_name}'

In [5]:
# Created a function incorproating SQL Alchemy to search, create, and or drop a database:
def search_create_drop_db(db_name):
    db_exist = database_exists(f'mysql://{rds_connection_string}{db_name}')
    if db_exist == True:
        drop_table_y_or_n = input(f'"{db_name}" database already exists in MySQL. Do you want you drop the table? Enter exactly: "y" or "n".  ')
        if drop_table_y_or_n == 'y':
            drop_database(db_url)
            print(f"Database {db_name} was dropped")
            create_new_db = input(f"Do you want to create another database called: {db_name}?  ")
            if create_new_db == 'y':
                create_database(db_url)
                return(f"The database {db_name} was created. Next You will need to create tables for this database.  ")
            else:
                return("No database was created. Goodbye!  ")
        else:
            return("The database exists. No action was taken. Goodbye!  ")
    else:
        create_database(db_url)
        return(f"The queried database did not exist, and was created as: {db_name} .  ")

search_create_drop_db(db_name)

"youtube_spotify_2018_db2" database already exists in MySQL. Do you want you drop the table? Enter exactly: "y" or "n".  y
Database youtube_spotify_2018_db2 was dropped
Do you want to create another database called: youtube_spotify_2018_db2?  y


'The database youtube_spotify_2018_db2 was created. Next You will need to create tables for this database.  '

In [6]:
# Create tables 'blueprints' using python classes and sql alchemy:

class yt_categories(Base):
    __tablename__ = 'yt_category_titles'
    id = Column(Integer, primary_key=True)
    category_title = Column(String(60))

class yt_statistics_data(Base):
    __tablename__ = 'yt_statistics'
    id = Column(Integer, primary_key=True)
    category_title = Column(String(60))
    trending_date = Column(Date, nullable=False)
    video_title = Column(String(200))
    channel_title = Column(String(100))
    category_id = Column(Integer)
    views = Column(Integer)
    likes = Column(Integer)
    dislikes = Column(Integer)
    view_count = Column(Integer)

class spotify_music_data(Base):
    __tablename__ = 'spotify_music'
    id = Column(Integer, primary_key=True)
    artist = Column(String(100))
    song_name = Column(String(200))
    spotify_unique_id = Column(String(100))


## Extract and transform "YouTube Top Trending Videos" dataset for USA

In [7]:
# YouTube data has two parts: 1) Categories information in JSON format
                            # 2) Top Trending US YouTube Videos in a CSV file

# Part 1) YouTube Categories are seperated in a json file
yt_json_file = './resources/youtube_US_category_id.json'
yt_rawjson_df = pd.read_json(yt_json_file)

In [8]:
# Extract the category id and category titles, and set them into a list

# for i in yt_rawjson_df['items']:
#     #print(i['id'])
#     print(i['id'] + ' | ' + i['snippet']['title'])
    

category_id = [int(i['id']) for i in yt_rawjson_df['items']]
category_title = [str(i['snippet']['title']) for i in yt_rawjson_df['items']]

# Create a dataframe of the category id and title for later use
category_id_title_df = pd.DataFrame({'category_id': category_id, 'category_title': category_title})
category_id_title_df.head()
category_id_title_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 2 columns):
category_id       32 non-null int64
category_title    32 non-null object
dtypes: int64(1), object(1)
memory usage: 592.0+ bytes


In [9]:
# Found that music category is "10"
category_id_title_df.head()

Unnamed: 0,category_id,category_title
0,1,Film & Animation
1,2,Autos & Vehicles
2,10,Music
3,15,Pets & Animals
4,17,Sports


In [10]:
# Load category_id_title_df to MySQL with Pandas
category_id_title_df.to_sql('yt_categories', con=engine)

In [11]:
# Two ways in this notebook to pull the data directly from MySQL database
# Method 1) Use SQL Alchemy Engine - Result: successfully reads from MySQL Database:
engine.execute("SELECT * FROM yt_categories").fetchall()

[(0, 1, 'Film & Animation'),
 (1, 2, 'Autos & Vehicles'),
 (2, 10, 'Music'),
 (3, 15, 'Pets & Animals'),
 (4, 17, 'Sports'),
 (5, 18, 'Short Movies'),
 (6, 19, 'Travel & Events'),
 (7, 20, 'Gaming'),
 (8, 21, 'Videoblogging'),
 (9, 22, 'People & Blogs'),
 (10, 23, 'Comedy'),
 (11, 24, 'Entertainment'),
 (12, 25, 'News & Politics'),
 (13, 26, 'Howto & Style'),
 (14, 27, 'Education'),
 (15, 28, 'Science & Technology'),
 (16, 29, 'Nonprofits & Activism'),
 (17, 30, 'Movies'),
 (18, 31, 'Anime/Animation'),
 (19, 32, 'Action/Adventure'),
 (20, 33, 'Classics'),
 (21, 34, 'Comedy'),
 (22, 35, 'Documentary'),
 (23, 36, 'Drama'),
 (24, 37, 'Family'),
 (25, 38, 'Foreign'),
 (26, 39, 'Horror'),
 (27, 40, 'Sci-Fi/Fantasy'),
 (28, 41, 'Thriller'),
 (29, 42, 'Shorts'),
 (30, 43, 'Shows'),
 (31, 44, 'Trailers')]

In [13]:
# Method 2) Read from MySQL database using Pandas - Result: Success! - This method is a better setup for analysis.
# The index column is automatically generated.
pd.read_sql_query('select * from yt_categories', con=engine).head()

Unnamed: 0,index,category_id,category_title
0,0,1,Film & Animation
1,1,2,Autos & Vehicles
2,2,10,Music
3,3,15,Pets & Animals
4,4,17,Sports


In [None]:
# Part 1) is the YouTube Top US Videos in a CSV
csv_file_yt = "./resources/youtube_USvideos.csv"
yt_rawdata_df = pd.read_csv(csv_file_yt)

In [None]:
# view rows, count and datatypes
yt_rawdata_df.info()

In [None]:
# Rename Columns
yt_cleandata_df = yt_rawdata_df.rename(columns={"video_id":"Video ID", "trending_date":"Trending Date",
                                                "title":"Title", "channel_title":"Channel Title",
                                                "category_id":"category_id", "publish_time":"Publish Time",
                                                "tags":"Tags", "views":"Views",
                                                "likes":"Likes", "dislikes":"Dislikes", 
                                                "comment_count":"Comment Count", "thumbnail_link":"Thumbnail Link",
                                                "comments_disabled":"Comments Disabled", "ratings_disabled":"Ratings Disabled",
                                                "video_error_or_removed":"Video Error Or Removed", "description":"Description"
                                               })
yt_cleandata_df.head()

In [None]:
# Drop Cells with Missing Information
yt_cleandata_df = yt_cleandata_df.dropna(how="any")

In [None]:
# Drop Dulplicates and Sort by Trending Date
yt_cleandata_df.drop_duplicates(['Video ID', 'Trending Date', 'Title', 'Channel Title', 'category_id', 'Publish Time']).sort_values(by=['Trending Date'], ascending=False).head()

In [None]:
# Drop Unwanted Columns

to_drop =['Publish Time', 'Tags', 'Thumbnail Link', 'Comments Disabled', 'Ratings Disabled', 'Video Error Or Removed', 'Description']

yt_cleandata_df.drop(to_drop, inplace=True, axis=1)

In [None]:
# Replace the "." in Trending Date to "-"

yt_cleandata_df['Trending Date'] = [x.replace(".","-") for x in yt_cleandata_df['Trending Date']]
yt_cleandata_df.head()

In [None]:
# Print out All Channel Titles
# for x in yt_cleandata_df['Channel Title'].unique():
#      print(x)

In [None]:
# Inner joined the both YouTube Tables

yt_merged_df = pd.merge(yt_cleandata_df, category_id_title_df, how='inner', on='category_id',
         left_index=False, right_index=False, sort=True)
yt_merged_df.info()

In [None]:
# View how many videos are in each category, particularly for music.
yt_merged_df['category_title'].value_counts()

## Extract and transform "Spotify Top 2018 Songs" dataset

In [None]:
# Spotify 2018 - Top 100 Songs - Raw CSV

csv_file_spotify2018 = "./resources/spotify_top2018.csv"
spotify2018_rawdata_df = pd.read_csv(csv_file_spotify2018)
spotify2018_rawdata_df.head()

In [None]:
spotify2018_rawdata_df.info()

In [None]:
# Set up Spotify DataFrame
spotify_2018_id = spotify2018_rawdata_df['id']
spotify_2018_name = spotify2018_rawdata_df['name']
spotify_2018_artists = spotify2018_rawdata_df['artists']

In [None]:
spotify2018_filtered_df = pd.DataFrame({
            "Artist": spotify_2018_artists,
            "Song Name": spotify_2018_name,
            "Spotify Unique ID": spotify_2018_id
             })
spotify2018_filtered_df.head()

In [None]:
#rds_connection_string = "<inser user name>:<insert password>@127.0.0.1/customer_db"
rds_connection_string = "root:gREATNESS23$@127.0.0.1/" #youtube_spotify_2018_db"
engine = create_engine(f'mysql://{rds_connection_string}')

In [None]:
# Can set up an input for the db_name later (optional)
db_name = 'youtube_spotify_2018_db2'

#db_exist = database_exists(f'mysql://{rds_connection_string}youtube_spotify_2018_db2')
db_exist = database_exists(f'mysql://{rds_connection_string}{db_name}')

db_url = f'mysql://{rds_connection_string}{db_name}'

In [None]:
# Created a function incorproating SQL Alchemy to search, create, and or drop a database:
def search_create_drop_db(db_name):
    if db_exist == True:
        drop_table_y_or_n = input(f'"{db_name}" database already exists in MySQL. Do you want you drop the table? Enter exactly: "y" or "n".  ')
        if drop_table_y_or_n == 'y':
            drop_database(db_url)
            print(f"Database {db_name} was dropped")
            create_new_db = input(f"Do you want to create another database called: {db_name}?  ")
            if create_new_db == 'y':
                create_database(db_url)
                return(f"The database {db_name} was created. Next You will need to create tables for this database.  ")
            else:
                return("No database was created. Goodbye!  ")
        else:
            return("The database exists. No action was taken. Goodbye!  ")
    else:
        create_database(db_url)
        return(f"The queried database did not exist, and was created as: {db_name} .  ")

search_create_drop_db(db_name)

In [None]:
# Create tables using python classes and sql alchemy:

class yt_categories(Base):
    __tablename__ = 'yt_category_titles'
    id = Column(Integer, primary_key=True)
    category_title = Column(String(60))

class yt_statistics_data(Base):
    __tablename__ = 'yt_statistics'
    id = Column(Integer, primary_key=True)
    category_title = Column(String(60))
    trending_date = Column(Date, nullable=False)
    video_title = Column(String(200))
    channel_title = Column(String(100))
    category_id = Column(Integer)
    views = Column(Integer)
    likes = Column(Integer)
    dislikes = Column(Integer)
    view_count = Column(Integer)

class spotify_music_data(Base):
    __tablename__ = 'spotify_music'
    id = Column(Integer, primary_key=True)
    artist = Column(String(100))
    song_name = Column(String(200))
    spotify_unique_id = Column(String(100))
