# Spotify: Songs to Incorporate In Model

In [1]:
import csv
import json
import os
import pickle
import re

import numpy as np
import pandas as pd
from sqlalchemy import create_engine

## Querying Current Song & Artist Listing

In [2]:
# Credentials must never be committed in a notebook. The full SQLAlchemy URL
# (postgresql://<user>:<password>@<host>:<port>/<dbname>) is read from the
# SPOTIFY_DB_URL environment variable instead.
# NOTE(review): the password that used to be hardcoded here has been exposed
# and should be rotated.
db_url = os.environ.get('SPOTIFY_DB_URL')
if not db_url:
    raise RuntimeError(
        'Set the SPOTIFY_DB_URL environment variable to the Postgres '
        'connection URL before running this notebook.'
    )
engine = create_engine(db_url)
# Opens a connection so a bad URL fails here; its repr is the cell output.
# NOTE(review): this connection is never explicitly closed.
engine.connect()

<sqlalchemy.engine.base.Connection at 0x7f48b5531b70>

In [5]:
# Read the complete song and artist tables into DataFrames.
SONG_LIST_SQL = '''
                SELECT * FROM spotify_song_list;
                '''
ARTISTS_SQL = '''
                SELECT * FROM spotify_artists;
                '''

song_df = pd.read_sql(SONG_LIST_SQL, con=engine)
artist_df = pd.read_sql(ARTISTS_SQL, con=engine)

In [6]:
# Summarise the song table: row count, column names, dtypes and null counts.
song_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23888 entries, 0 to 23887
Data columns (total 8 columns):
s_song_id             23888 non-null object
album_release_date    23888 non-null object
artist_id             23888 non-null object
artist_name           23888 non-null object
duration_ms           23888 non-null float64
explicit              23888 non-null bool
linked_album          23888 non-null object
song_title            23888 non-null object
dtypes: bool(1), float64(1), object(6)
memory usage: 1.3+ MB


In [10]:
# Preview the first rows of the artist table, including the genre_N columns.
artist_df.head()

Unnamed: 0,s_artist_id,name,popularity,followers,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,...,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17,genre_18
0,6Ff53KvcvAj5U7Z1vojB5o,*NSYNC,71,747136.0,'boy band','dance pop','europop','pop','post-teen pop',,...,,,,,,,,,,
1,1bDWGdIC2hardyt55nlQgG,"""Weird Al"" Yankovic",59,337751.0,'antiviral pop','comedy rock','comic',,,,...,,,,,,,,,,
2,0REMf7H0VP6DwfZ9MbuWph,10 Years,60,243895.0,'alternative metal','nu metal','post-grunge','rap metal','rap rock',,...,,,,,,,,,,
3,0MBIKH9DjtBkv8O3nS6szj,"10,000 Maniacs",52,108829.0,'alternative rock','folk','folk-pop','lilith','mellow gold','new wave pop',...,,,,,,,,,,
4,7urq0VfqxEYEEiZUkebXT4,112,68,455231.0,'boy band','dance pop','gangster rap','hip hop','hip pop','new jack swing',...,'southern hip hop','urban contemporary',,,,,,,,


### Removing Non-Music

When I initially pulled this information, I included artists who weren't actually musicians. I can now either remove them from my listing manually or filter on their genres to drop non-music acts. I'll look into the genres I have indexed and see which ones are not musically related; the idea is that the artists tied to those genres are not musicians.

Comedy is the biggest category of non-music, from my cursory searching, so I'll start with that.

#### What Genres Make Up Non-Music?

In [61]:
# Load the pickled object stored in ../data/all_genres.pkl
# (presumably the collection of genres indexed so far -- TODO confirm).
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this project.
# NOTE(review): `all_genres` is not referenced by any other cell visible
# here -- confirm it is still needed.
with open('../data/all_genres.pkl', 'rb') as f:
    all_genres = pickle.load(f)

#### Removing Comedians

In [11]:
# Candidate comedians: artists whose first genre tag (genre_0) contains
# "comedy" or "comic".
# NOTE(review): only genre_0 is searched, so artists tagged with a comedy
# genre in genre_1..genre_18 are not picked up here.
# The doubled %% is presumably the escape for a literal % required by the DB
# driver's parameter formatting -- TODO confirm.
comics_sql = """
            SELECT * FROM spotify_artists
            WHERE genre_0 LIKE '%%comedy%%' 
            OR genre_0 LIKE '%%comic%%'
            """
pot_comics = pd.read_sql(comics_sql, con=engine)

##### Examining Each Comedian Individually

In [42]:
def songs_by_artist(artist_id):
    """Return every row of `spotify_song_list` for one Spotify artist ID.

    The ID is passed as a bound query parameter rather than being pasted into
    the SQL text, so the same helper can be reused for each candidate without
    editing the query string.

    Parameters
    ----------
    artist_id : str
        Value to match against the `artist_id` column.

    Returns
    -------
    pandas.DataFrame
        All matching rows, with every column of `spotify_song_list`.
    """
    return pd.read_sql("""
            SELECT * FROM spotify_song_list
            WHERE artist_id = %(artist_id)s
            """, engine, params={'artist_id': artist_id})


# Steve Martin -- swap in another ID from `pot_comics` to inspect that artist.
songs_by_artist('1Bd4UVlqlaKEXYRG3wgrCK')

Unnamed: 0,s_song_id,album_release_date,artist_id,artist_name,duration_ms,explicit,linked_album,song_title
0,4f80LjiI9fgwCahAhEOuYk,1998-01-01,1Bd4UVlqlaKEXYRG3wgrCK,Steve Martin,172493.0,False,The Prince of Egypt,Playing With The Big Boys - The Prince Of Egyp...
1,2Od3hkKwZl831g0TZR6v22,1986-01-01,1Bd4UVlqlaKEXYRG3wgrCK,Steve Martin,148960.0,False,Little Shop Of Horrors (Original Motion Pictur...,Dentist!
2,18yHxT4ZwGg6401pwwzAhJ,1979-09-14,1Bd4UVlqlaKEXYRG3wgrCK,Steve Martin,186226.0,False,Comedy Is Not Pretty,Googlephonics
3,3P06pRG3fiBzlFjOPV0WMU,2017-09-22,1Bd4UVlqlaKEXYRG3wgrCK,Steve Martin,198080.0,False,“The Long-Awaited Album”,Caroline
4,6sVBfdmhr8SrqUJDynPZKQ,2017-09-22,1Bd4UVlqlaKEXYRG3wgrCK,Steve Martin,153106.0,False,“The Long-Awaited Album”,On The Water
5,4ZACbQfI7CkaVVRHDkfR3K,2017-09-22,1Bd4UVlqlaKEXYRG3wgrCK,Steve Martin,191813.0,False,“The Long-Awaited Album”,Santa Fe
6,0ccllnXp7eTzhgPwotvDla,2013-01-01,1Bd4UVlqlaKEXYRG3wgrCK,Steve Martin,195106.0,False,Love Has Come For You,When You Get To Asheville
7,1LhC7Qvwpv5ze4YYsCAOjP,2017-09-22,1Bd4UVlqlaKEXYRG3wgrCK,Steve Martin,224253.0,False,“The Long-Awaited Album”,All Night Long
8,29vnfwpI3WnIUCGQk86dp3,2017-09-22,1Bd4UVlqlaKEXYRG3wgrCK,Steve Martin,181600.0,False,“The Long-Awaited Album”,So Familiar
9,4oNK1Chm0IN8d42MQ3KRbG,2017-09-22,1Bd4UVlqlaKEXYRG3wgrCK,Steve Martin,119973.0,False,“The Long-Awaited Album”,Office Supplies


##### Verdict

It looks like I can safely remove the songs of almost everyone on this listing from my overall dataset. The exceptions are:

|Artist | Artist ID|
|---|---|
| Adam Sandler | 2LB9H0px4qWbCHg9Axzhga |
|Rodney Carrington | 7xsMZdxw6eEJXGTLZulONO|
|Bowling for Soup | 5ND0mGcL9SKSjWIjPd0xIb|
|Cledus T. Judd | 1AhfLNt7teChymEclbsHlo|
|Insane Clown Posse | 4xtWjIlVuZwTCeqVAsgEXy|
|Reel Big Fish | 3bXhZFreBJF4QDUUiMmtZW|
|The Darkness | 5r1bdqzhgRoHC3YcCV6N5a|


In [12]:
# Every artist ID returned by the comedy/comic genre query.
comic_array = pot_comics['s_artist_id']

# IDs from that query that are kept because they were judged to be real
# music acts (see the verdict table).
non_comics = [
    '2LB9H0px4qWbCHg9Axzhga',  # Adam Sandler
    '7xsMZdxw6eEJXGTLZulONO',  # Rodney Carrington
    '5ND0mGcL9SKSjWIjPd0xIb',  # Bowling for Soup
    '1AhfLNt7teChymEclbsHlo',  # Cledus T. Judd
    '4xtWjIlVuZwTCeqVAsgEXy',  # Insane Clown Posse
    '3bXhZFreBJF4QDUUiMmtZW',  # Reel Big Fish
    '5r1bdqzhgRoHC3YcCV6N5a',  # The Darkness
]

# Whatever is left over is treated as a non-music act and removed.
comics_to_remove = [
    artist_id for artist_id in comic_array if artist_id not in non_comics
]

#### Moving Non-Artists and Their Works to Separate Tables

In [13]:
# Artist rows whose s_artist_id is in comics_to_remove; kept in their own
# frame so they can be dropped from artist_df and written to a separate CSV.
non_artists = artist_df.query("s_artist_id in @comics_to_remove")

In [15]:
# Songs whose artist_id belongs to one of the artists flagged for removal.
is_removed_artist = song_df['artist_id'].isin(comics_to_remove)
non_songs = song_df[is_removed_artist]

In [16]:
# Remove the flagged non-music acts from the working artist table.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# safe to re-run: if the rows are already gone the drop is a no-op rather
# than a KeyError.
artist_df = artist_df.drop(labels=non_artists.index, errors='ignore')

In [17]:
# Remove the songs of the flagged non-music acts from the working song table.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# safe to re-run: if the rows are already gone the drop is a no-op rather
# than a KeyError.
song_df = song_df.drop(labels=non_songs.index, errors='ignore')

#### Removing Extraneous genre columns from `artist_df`

In [18]:
# Drop every numbered genre column (genre_0, genre_1, ...).
# Selecting by pattern avoids hard-coding how many genre columns exist, and
# makes the cell safe to re-run: once the columns are gone the selection is
# empty and the drop is a no-op instead of a KeyError.
genre_cols = artist_df.filter(regex=r'^genre_\d+$').columns
artist_df = artist_df.drop(labels=genre_cols, axis=1)

In [19]:
# Index the artist table by Spotify artist ID.
# The guard keeps the cell safe to re-run: once s_artist_id has become the
# index it is no longer a column, and calling set_index again would raise
# KeyError.
if 's_artist_id' in artist_df.columns:
    artist_df = artist_df.set_index('s_artist_id')

##### Saving New Lists as `csv`

In [20]:
# Write the removed rows and the cleaned tables to ../data, one CSV each.
csv_outputs = [
    (non_artists, '../data/non_artists.csv'),
    (non_songs, '../data/non_songs.csv'),
    (artist_df, '../data/artist_list_v2.csv'),
    (song_df, '../data/song_list_v2.csv'),
]
for frame, csv_path in csv_outputs:
    frame.to_csv(csv_path)

In [21]:
# Preview the cleaned artist table: indexed by s_artist_id, genre columns removed.
artist_df.head()

Unnamed: 0_level_0,name,popularity,followers
s_artist_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6Ff53KvcvAj5U7Z1vojB5o,*NSYNC,71,747136.0
1bDWGdIC2hardyt55nlQgG,"""Weird Al"" Yankovic",59,337751.0
0REMf7H0VP6DwfZ9MbuWph,10 Years,60,243895.0
0MBIKH9DjtBkv8O3nS6szj,"10,000 Maniacs",52,108829.0
7urq0VfqxEYEEiZUkebXT4,112,68,455231.0
