# Project Pre-work #

In [265]:
import pandas as pd
pd.set_option('display.max_columns', 50)
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import seaborn as sns
from tqdm import tqdm

In [50]:
# Load the full track table with its audio features (per the original note, releases span 1921-2020).
tracks = pd.read_csv('data/data.csv')
# Print the (rows, columns) shape, then preview the first five rows.
print(tracks.shape)
tracks.head()

(169909, 19)


Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.995,['Carl Woitschach'],0.708,158648,0.195,0,6KbQ3uYMLKb5jDxLF7wYDD,0.563,10,0.151,-12.428,1,Singende Bataillone 1. Teil,0,1928,0.0506,118.469,0.779,1928
1,0.994,"['Robert Schumann', 'Vladimir Horowitz']",0.379,282133,0.0135,0,6KuQTIu1KoTTkLXKrwlLPV,0.901,8,0.0763,-28.454,1,"Fantasiestücke, Op. 111: Più tosto lento",0,1928,0.0462,83.972,0.0767,1928
2,0.604,['Seweryn Goszczyński'],0.749,104300,0.22,0,6L63VW0PibdM1HDSBoqnoM,0.0,5,0.119,-19.924,0,Chapter 1.18 - Zamek kaniowski,0,1928,0.929,107.177,0.88,1928
3,0.995,['Francisco Canaro'],0.781,180760,0.13,0,6M94FkXd15sOAOQYRnWPN8,0.887,1,0.111,-14.734,0,Bebamos Juntos - Instrumental (Remasterizado),0,1928-09-25,0.0926,108.003,0.72,1928
4,0.99,"['Frédéric Chopin', 'Vladimir Horowitz']",0.21,687733,0.204,0,6N6tiFZ9vLTSOIxkj8qKrd,0.908,11,0.098,-16.829,1,"Polonaise-Fantaisie in A-Flat Major, Op. 61",1,1928,0.0424,62.149,0.0693,1928


In [51]:
# Load the genre table: one row per unique Spotify genre with that genre's average feature values.
genres = pd.read_csv('data/genres.csv')
# Print the (rows, columns) shape, then preview the first five rows.
print(genres.shape)
genres.head()

(2664, 14)


Unnamed: 0,genres,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode
0,432hz,0.49478,0.299333,1048887.0,0.450678,0.477762,0.131,-16.854,0.076817,120.285667,0.22175,52.166667,5,1
1,[],0.679245,0.521473,229852.6,0.401522,0.196737,0.221586,-12.679076,0.112267,112.025168,0.51592,21.682005,7,1
2,a cappella,0.621532,0.577017,193652.2,0.345694,0.003799,0.127087,-12.770211,0.095324,111.81323,0.453186,43.351819,11,1
3,abstract,0.359395,0.4595,343018.5,0.487,0.7914,0.11948,-14.092,0.04342,124.7432,0.30499,41.5,1,1
4,abstract beats,0.353347,0.6944,233824.4,0.6134,0.349403,0.102453,-6.6998,0.143453,119.3984,0.634187,58.6,10,0


## Genres Exploration ##

The `genres` DataFrame serves as the source list of every genre name Spotify has data on. I want to start by extracting all the subgenres of *pop* from `genres`. This list will then be used to filter `tracks` down to only the tracks that fall under *pop* music. 

In [52]:
# Pull the genre names out of the `genres` frame as a plain Python list (2664 unique genres in total).
genres_list = genres['genres'].tolist()
print(len(genres_list))
genres_list

2664


['432hz',
 '[]',
 'a cappella',
 'abstract',
 'abstract beats',
 'abstract hip hop',
 'accordeon',
 'accordion',
 'acid house',
 'acid jazz',
 'acid rock',
 'acid trance',
 'acousmatic',
 'acoustic blues',
 'acoustic pop',
 'acoustic punk',
 'adoracion',
 'adult standards',
 'adventista',
 'afghan pop',
 'african gospel',
 'african percussion',
 'african reggae',
 'african rock',
 'afro dancehall',
 'afro house',
 'afro psych',
 'afro-cuban traditional',
 'afro-funk',
 'afrobeat',
 'afrobeat brasileiro',
 'afrofuturism',
 'afrofuturismo brasileiro',
 'afropop',
 'afroswing',
 'aggrotech',
 'alabama indie',
 'alabama metal',
 'alabama rap',
 'albanian pop',
 'alberta country',
 'alberta hip hop',
 'album rock',
 'albuquerque indie',
 'alt-idol',
 'alternative americana',
 'alternative country',
 'alternative dance',
 'alternative emo',
 'alternative hip hop',
 'alternative metal',
 'alternative pop',
 'alternative pop rock',
 'alternative r&b',
 'alternative rock',
 'alternative roots r

In [53]:
# Extract all subgenres of pop: every genre whose name contains the substring 'pop'.
# NOTE(review): this is a substring match, so names like 'canto popular uruguayo' are kept as well.
pop_genres = [genre_name for genre_name in genres_list if 'pop' in genre_name]

In [54]:
# Report how many pop subgenres were found (275 in the recorded run), then list them.
print(len(pop_genres))
pop_genres

275


['acoustic pop',
 'afghan pop',
 'afropop',
 'albanian pop',
 'alternative pop',
 'alternative pop rock',
 'ambient pop',
 'antiviral pop',
 'arab pop',
 'art pop',
 'australian alternative pop',
 'australian electropop',
 'australian pop',
 'austrian pop',
 'bahamian pop',
 'baile pop',
 'barbadian pop',
 'baroque pop',
 'bedroom pop',
 'belarusian pop',
 'belgian pop',
 'beninese pop',
 'bitpop',
 'bow pop',
 'boy pop',
 'brill building pop',
 'britpop',
 'bubblegum pop',
 'c-pop',
 'canadian electropop',
 'canadian pop',
 'canadian pop punk',
 'candy pop',
 'canto popular uruguayo',
 'cantopop',
 'chamber pop',
 'channel pop',
 'chicago pop punk',
 'chinese electropop',
 'chinese idol pop',
 'christian pop',
 'classic arab pop',
 'classic belgian pop',
 'classic cantopop',
 'classic colombian pop',
 'classic country pop',
 'classic czech pop',
 'classic danish pop',
 'classic finnish pop',
 'classic french pop',
 'classic greek pop',
 'classic icelandic pop',
 'classic israeli pop',

There are 275 subgenres of *pop* music. Interestingly, however, many of these subgenres are qualified by location, like 'german electropop': they take a subgenre and divide it further by region. I wonder if it is worth being this granular. Should I stop differentiating at a certain point? How can I decide when to do this?


## Track Data Exploration ##

In [223]:
# Narrow the data to tracks from the last 50 years (1970-2020) by keeping rows whose year is 1970 or later.
df = tracks.loc[tracks['year'] >= 1970]
print(df.shape)
df.head()

(101656, 19)


Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
3054,0.511,['Elton John'],0.436,380667,0.429,0,2AgMTn4RyRrI1uc3iXx4Xq,0.000103,3,0.104,-10.884,1,My Father's Gun,29,1970-10-30,0.0319,119.025,0.223,1970
3055,0.465,['Three Dog Night'],0.629,166707,0.376,0,2hKxnhi2Eb3443AQbOnqNl,0.0,7,0.235,-13.934,1,It Ain't Easy,26,1970-01-01,0.0337,76.848,0.347,1970
3056,0.166,['Martha Reeves & The Vandellas'],0.417,176573,0.753,0,2nxSAQBvF6gDIwZmG6B9nO,0.0,9,0.269,-10.204,0,I Should Be Proud,33,1970,0.126,178.408,0.794,1970
3057,0.99,['Vashti Bunyan'],0.6,141893,0.107,0,2vNuzzaFEPY2WGdYFhdeui,0.0114,0,0.101,-19.189,0,Where I Like To Stand,26,1970,0.0525,117.63,0.732,1970
3058,0.639,['Ry Cooder'],0.568,134067,0.434,0,439kdOcVhEp5gQ67eYNkKG,0.626,7,0.229,-14.17,1,Available Space,35,1970-01-01,0.0379,147.388,0.694,1970


`tracks` does not list the genre of each track. To get the genre for each track I have to call the Spotify API, since genres are only listed as 'Artist' data. Each artist has a list of genres associated with them. If I then merge those genres onto a given track, how do I decide the primary genre of that track?

## Grabbing Genre Data ##

### Practice ###

In [83]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [84]:
# API credentials: never hard-code the client id/secret in the notebook.
# Called without arguments, SpotifyClientCredentials reads them from the
# SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET environment variables.
# NOTE: the key pair that used to be written here has been exposed and should be
# revoked and regenerated in the Spotify developer dashboard.
spotify_creds = SpotifyClientCredentials()

# instantiate the client object used for every API call in this notebook
sp = spotipy.Spotify(client_credentials_manager=spotify_creds)

In [185]:
# A single tracks request accepts at most 50 IDs, so gather every track ID here
# and iterate over them in chunks of 50 later on.
track_ids = df['id'].tolist()

**Get Artists IDs from Track IDs**

In [184]:
# Trial run: request track data for the first 50 track IDs only.
first_50 = track_ids[:50]
test_call = sp.tracks(first_50, market='US')

In [110]:
# How to extract an artist ID from the track results: take the first-listed
# artist of a track (here the track at index 49 of the trial batch).
test_call['tracks'][49]['artists'][0]['id']

'22bE4uQ6baNwSHPVcDxLCe'

In [112]:
# First-listed artist ID for each track in the trial response, in response order.
test_artists = [track['artists'][0]['id'] for track in test_call['tracks']]

**Get Genres from Track IDs**

In [114]:
# Request artist data for the artist IDs collected from the first 50 tracks.
test_call2 = sp.artists(test_artists)

In [121]:
# How to extract genres from the artist results: each artist entry carries a
# 'genres' list (shown here for the first artist).
test_call2['artists'][0]['genres']

['glam rock', 'mellow gold', 'piano rock', 'soft rock']

In [125]:
# Genre list of each artist in the trial response, in response order.
test_genres = [artist['genres'] for artist in test_call2['artists']]

**Create DataFrame that contains TrackID, ArtistID, Genres**

In [169]:
# Line the three trial lists up as columns: one row per track with its
# artist ID and that artist's genres, then preview the result.
test_dict = dict(track_id=first_50, artist_id=test_artists, genres=test_genres)
pd.DataFrame(test_dict).head()

Unnamed: 0,track_id,artist_id,genres
0,2AgMTn4RyRrI1uc3iXx4Xq,3PhoLpVuITZKcymswpck5b,"[glam rock, mellow gold, piano rock, soft rock]"
1,2hKxnhi2Eb3443AQbOnqNl,4FAEZeJcsYYBkNq2D3KGTV,"[album rock, art rock, blues rock, brill build..."
2,2nxSAQBvF6gDIwZmG6B9nO,1Pe5hlKMCTULjosqZ6KanP,"[brill building pop, classic girl group, class..."
3,2vNuzzaFEPY2WGdYFhdeui,4chuPfKtATDZvbRLExsTp2,"[anti-folk, british folk, folk, freak folk, ne..."
4,439kdOcVhEp5gQ67eYNkKG,1CPwHx5lgVxv0rfcp7UXLx,"[blues, blues rock, country blues, country roc..."


### Building the Genre DataFrame ###

In [186]:
# Split track_ids into consecutive chunks of 50 (the final chunk may be shorter).
id_chunks = [track_ids[start:start + 50] for start in range(0, len(track_ids), 50)]

In [206]:
# Get the first-listed artist ID for every track, one API request per chunk.
# The IDs are appended in request order so they stay aligned with track_ids.
artist_ids = []
for chunk in tqdm(id_chunks):
    results = sp.tracks(chunk, market='US')
    artist_ids.extend(track['artists'][0]['id'] for track in results['tracks'])

100%|██████████| 2034/2034 [03:19<00:00, 10.19it/s]


In [209]:
# Split artist_ids into consecutive chunks of 50 (the final chunk may be shorter).
artist_chunks = [artist_ids[start:start + 50] for start in range(0, len(artist_ids), 50)]

In [221]:
# Get the genre list of every artist, one API request per chunk, in the same
# order as artist_ids.
# NOTE(review): this rebinds `genres`, which earlier in the notebook held the
# genres DataFrame; cells that need that frame must run before this one.
genres = []
for chunk in tqdm(artist_chunks):
    results = sp.artists(chunk)
    genres.extend(artist['genres'] for artist in results['artists'])

100%|██████████| 2034/2034 [03:17<00:00, 10.28it/s]


In [291]:
# Turn the fetched info into a DataFrame (one row per track) so it can be
# merged with the original track data.
genre_dict = dict(track_id=track_ids, artist_id=artist_ids, genre=genres)
genre_df = pd.DataFrame(genre_dict)

In [292]:
# Check the (rows, columns) shape of the genre table and preview its first rows.
print(genre_df.shape)
genre_df.head()

(101656, 3)


Unnamed: 0,track_id,artist_id,genre
0,2AgMTn4RyRrI1uc3iXx4Xq,3PhoLpVuITZKcymswpck5b,"[glam rock, mellow gold, piano rock, soft rock]"
1,2hKxnhi2Eb3443AQbOnqNl,4FAEZeJcsYYBkNq2D3KGTV,"[album rock, art rock, blues rock, brill build..."
2,2nxSAQBvF6gDIwZmG6B9nO,1Pe5hlKMCTULjosqZ6KanP,"[brill building pop, classic girl group, class..."
3,2vNuzzaFEPY2WGdYFhdeui,4chuPfKtATDZvbRLExsTp2,"[anti-folk, british folk, folk, freak folk, ne..."
4,439kdOcVhEp5gQ67eYNkKG,1CPwHx5lgVxv0rfcp7UXLx,"[blues, blues rock, country blues, country roc..."


In [293]:
# Left-join the genre table onto the filtered tracks by track ID. After the
# join, `id` duplicates `track_id`, so it is dropped.
df_final = (
    df.merge(genre_df, how='left', left_on='id', right_on='track_id')
      .drop(columns='id')
)
df_final.head()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year,track_id,artist_id,genre
0,0.511,['Elton John'],0.436,380667,0.429,0,0.000103,3,0.104,-10.884,1,My Father's Gun,29,1970-10-30,0.0319,119.025,0.223,1970,2AgMTn4RyRrI1uc3iXx4Xq,3PhoLpVuITZKcymswpck5b,"[glam rock, mellow gold, piano rock, soft rock]"
1,0.465,['Three Dog Night'],0.629,166707,0.376,0,0.0,7,0.235,-13.934,1,It Ain't Easy,26,1970-01-01,0.0337,76.848,0.347,1970,2hKxnhi2Eb3443AQbOnqNl,4FAEZeJcsYYBkNq2D3KGTV,"[album rock, art rock, blues rock, brill build..."
2,0.166,['Martha Reeves & The Vandellas'],0.417,176573,0.753,0,0.0,9,0.269,-10.204,0,I Should Be Proud,33,1970,0.126,178.408,0.794,1970,2nxSAQBvF6gDIwZmG6B9nO,1Pe5hlKMCTULjosqZ6KanP,"[brill building pop, classic girl group, class..."
3,0.99,['Vashti Bunyan'],0.6,141893,0.107,0,0.0114,0,0.101,-19.189,0,Where I Like To Stand,26,1970,0.0525,117.63,0.732,1970,2vNuzzaFEPY2WGdYFhdeui,4chuPfKtATDZvbRLExsTp2,"[anti-folk, british folk, folk, freak folk, ne..."
4,0.639,['Ry Cooder'],0.568,134067,0.434,0,0.626,7,0.229,-14.17,1,Available Space,35,1970-01-01,0.0379,147.388,0.694,1970,439kdOcVhEp5gQ67eYNkKG,1CPwHx5lgVxv0rfcp7UXLx,"[blues, blues rock, country blues, country roc..."


## Cleaning Up the DataFrame ##

Now that we have the genres associated with the artist of each track, we want to create a dataframe that only includes tracks that belong to a subgenre of pop.

In [298]:
# create a function that returns True if a pop sub_genre is contained in a track's genre list
def is_pop(g_list):
    """Return True when at least one entry of `pop_genres` is found in `g_list`.

    `g_list` is the genre collection of a single track; membership is tested
    with `in` for each pop subgenre in turn, stopping at the first hit.
    Returns False when nothing matches (including for an empty `g_list`).
    """
    for sub_genre in pop_genres:
        if sub_genre in g_list:
            return True
    return False

In [299]:
# Flag each track: True when its genre list contains at least one pop subgenre.
df_final['is_pop'] = df_final['genre'].map(is_pop)

In [300]:
# Display the merged frame to inspect the new is_pop column.
df_final

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year,track_id,artist_id,genre,is_pop
0,0.5110,['Elton John'],0.436,380667,0.429,0,0.000103,3,0.1040,-10.884,1,My Father's Gun,29,1970-10-30,0.0319,119.025,0.223,1970,2AgMTn4RyRrI1uc3iXx4Xq,3PhoLpVuITZKcymswpck5b,"[glam rock, mellow gold, piano rock, soft rock]",False
1,0.4650,['Three Dog Night'],0.629,166707,0.376,0,0.000000,7,0.2350,-13.934,1,It Ain't Easy,26,1970-01-01,0.0337,76.848,0.347,1970,2hKxnhi2Eb3443AQbOnqNl,4FAEZeJcsYYBkNq2D3KGTV,"[album rock, art rock, blues rock, brill build...",True
2,0.1660,['Martha Reeves & The Vandellas'],0.417,176573,0.753,0,0.000000,9,0.2690,-10.204,0,I Should Be Proud,33,1970,0.1260,178.408,0.794,1970,2nxSAQBvF6gDIwZmG6B9nO,1Pe5hlKMCTULjosqZ6KanP,"[brill building pop, classic girl group, class...",True
3,0.9900,['Vashti Bunyan'],0.600,141893,0.107,0,0.011400,0,0.1010,-19.189,0,Where I Like To Stand,26,1970,0.0525,117.630,0.732,1970,2vNuzzaFEPY2WGdYFhdeui,4chuPfKtATDZvbRLExsTp2,"[anti-folk, british folk, folk, freak folk, ne...",False
4,0.6390,['Ry Cooder'],0.568,134067,0.434,0,0.626000,7,0.2290,-14.170,1,Available Space,35,1970-01-01,0.0379,147.388,0.694,1970,439kdOcVhEp5gQ67eYNkKG,1CPwHx5lgVxv0rfcp7UXLx,"[blues, blues rock, country blues, country roc...",False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101651,0.1730,"['DripReport', 'Tyga']",0.875,163800,0.443,1,0.000032,1,0.0891,-7.461,1,Skechers (feat. Tyga) - Remix,75,2020-05-15,0.1430,100.012,0.306,2020,4KppkflX7I3vJQk7urOJaS,3O5HD95HTEPgoPFOjAb7yV,[viral rap],False
101652,0.0167,"['Leon Bridges', 'Terrace Martin']",0.719,167468,0.385,0,0.031300,8,0.1110,-10.907,1,Sweeter (feat. Terrace Martin),64,2020-06-08,0.0403,128.000,0.270,2020,1ehhGlTvjtHo2e4xJFB0SZ,3qnGvpP8Yth1AqSBMqON5x,"[modern blues, soul]",False
101653,0.5380,"['Kygo', 'Oh Wonder']",0.514,180700,0.539,0,0.002330,7,0.1080,-9.332,1,How Would I Know,70,2020-05-29,0.1050,123.700,0.153,2020,52eycxprLhK3lPcRLbQiVk,23fqKkggKUBHNkbKtXEls4,"[edm, pop, tropical house]",True
101654,0.0714,"['Cash Cash', 'Andy Grammer']",0.646,167308,0.761,0,0.000000,1,0.2220,-2.557,1,I Found You,70,2020-02-28,0.0385,129.916,0.472,2020,3wYOGJYD31sLRmBgCvWxa4,1LOB7jTeEV14pHai6EXSzF,"[big room, dance pop, edm, electro house, elec...",True


In [303]:
# Keep only the rows flagged as pop, then check the shape and the last few rows.
pop_df = df_final.loc[df_final['is_pop']]
print(pop_df.shape)
pop_df.tail()

(36451, 22)


Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year,track_id,artist_id,genre,is_pop
101648,0.264,"['Meek Mill', 'Roddy Ricch']",0.744,167845,0.702,1,0.0,7,0.12,-6.255,0,Letter To Nipsey (feat. Roddy Ricch),66,2020-01-27,0.288,91.885,0.338,2020,0j2CNrgtalXRGIvHMO2vzh,20sxb77xiYeusSH8cVdatc,"[hip hop, philly rap, pop rap, rap, southern h...",True
101649,0.0227,"['Trey Songz', 'Summer Walker']",0.619,194576,0.719,1,0.0,0,0.0839,-4.111,1,Back Home (feat. Summer Walker),69,2020-04-29,0.157,86.036,0.351,2020,5QZ11AHm7xiytOGXGlxQi5,2iojnBLj0qIMiKPvVhLnsH,"[dance pop, pop, r&b, urban contemporary]",True
101653,0.538,"['Kygo', 'Oh Wonder']",0.514,180700,0.539,0,0.00233,7,0.108,-9.332,1,How Would I Know,70,2020-05-29,0.105,123.7,0.153,2020,52eycxprLhK3lPcRLbQiVk,23fqKkggKUBHNkbKtXEls4,"[edm, pop, tropical house]",True
101654,0.0714,"['Cash Cash', 'Andy Grammer']",0.646,167308,0.761,0,0.0,1,0.222,-2.557,1,I Found You,70,2020-02-28,0.0385,129.916,0.472,2020,3wYOGJYD31sLRmBgCvWxa4,1LOB7jTeEV14pHai6EXSzF,"[big room, dance pop, edm, electro house, elec...",True
101655,0.109,['Ingrid Andress'],0.512,214787,0.428,0,0.0,0,0.105,-7.387,1,More Hearts Than Mine,65,2020-03-27,0.0271,80.588,0.366,2020,60RFlt48hm0l4Fu0JoccOl,0jPnVIasXzBYjrlpO5irii,"[contemporary country, country pop]",True


In [313]:
# Export the pop-only tracks to CSV.
# `index` is left at its default, so the DataFrame index is written as the
# first (unnamed) column of the file.
# NOTE(review): the list-valued `genre` column is written as text; expect
# strings rather than lists when this file is read back.
pop_df.to_csv('data/pop_music.csv')

In [315]:
# Export every 1970+ track, with its genre columns and is_pop flag, to CSV.
# `index` is left at its default, so the DataFrame index is written as the
# first (unnamed) column of the file.
# NOTE(review): the list-valued `genre` column is written as text; expect
# strings rather than lists when this file is read back.
df_final.to_csv('data/music_with_genres.csv')