In [1]:
import pandas as pd

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

import os
from dotenv import load_dotenv, find_dotenv

import time

#get login details from .env file
load_dotenv(find_dotenv())
cid = os.getenv('SPOTIFY_CLIENT_ID')
csecret = os.getenv('SPOTIFY_CLIENT_SECRET')


In [2]:
#For authenticating requests
auth_manager = SpotifyClientCredentials(client_id = cid, client_secret = csecret)
sp = spotipy.Spotify(auth_manager=auth_manager, requests_timeout=10, retries=10)

In [3]:
##################### Load URI files and extract the important values
artist = pd.read_csv('artist_values.csv')
artist_ser = artist[['artistname']]

uri = pd.read_csv('uri_values.csv')
uri_ser = uri['URI']

#Load the combined_df file for adding the genres and deleting specific tracks later
combined_df = pd.read_csv('combined_data.csv', index_col=0)
combined_df2 = combined_df.copy()

In [4]:
#Divide uri_ser into groups of 100 or less for batch requests
uri100 = [uri_ser[i:i+100] for i in range(0,len(uri_ser)+1,100)]

uri100

[0     spotify:track:4PMxOmJY2dEXFb9CcivqHT
 1     spotify:track:4iLqG9SeJSnt0cSPICSjxv
 2     spotify:track:3JvrhDOgAt6p7K8mDyZwRd
 3     spotify:track:41KPpw0EZCytxNkmEMJVgr
 4     spotify:track:1huvTbEYtgltjQRXzrNKGi
                       ...                 
 95    spotify:track:2aQigVWRPUU4NB9jmdyACD
 96    spotify:track:0b6wdul3A5sQNpIOv03OxP
 97    spotify:track:3cIZhFW59rOxlqPfQ6bFtP
 98    spotify:track:40mYZdW4XYsfVU4D1L3cXx
 99    spotify:track:6ooluO7DiEhI1zmK94nRCM
 Name: URI, Length: 100, dtype: object,
 100    spotify:track:5OUTFH5acycdnf8OVo21Gv
 101    spotify:track:02t0a1rmItzyIhNNThYRcG
 102    spotify:track:0mv9yj1mjcEJes46dHDib5
 103    spotify:track:7fKtTaMeWM4U8ibHVnlUkM
 104    spotify:track:2uoWHninMACac1KBFn8rXQ
                        ...                 
 195    spotify:track:6u7ipGHDJsBA3UOwuz0W2T
 196    spotify:track:54z9tiaPu15pfkGSUwrOLI
 197    spotify:track:1qXjolYMICsnISl8xZAbA2
 198    spotify:track:6SznMYPQJHzr3IwsAHLId2
 199    spotify:track:1eCl

In [5]:
#Extract features regarding the tracks
track_result_df = pd.DataFrame()
skipped_tracks = list()

#Loop through each group of 100 tracks and call the audio_features() function
for i,track_val in enumerate(uri100):
    print(i+1, 'out of', len(uri100))
    track_ids = track_val
    result = sp.audio_features([track_ids][0])
    
    #Try to concatenate the resulting df into the initialized track_result_df
    #If an error occurs, perform each function on the tracks in the group individually
    #If an error still occurs, store the track ID in skipped_tracks for later review
    try:
        result_df = pd.DataFrame(result)
        track_result_df = pd.concat([track_result_df, result_df], axis=0)
        print('done')
        
    except:
        print('INDIVIDUAL TRACK REQUESTS')
        
        for j,track in enumerate(track_val):
            ind = i*100 + j
            print('Group {}: {} out of {} - Track: {}'.format(i+1, j+1, len(track_val), track))
            try:
                result = pd.DataFrame(sp.audio_features([track][0]), index=[ind])
                track_result_df = pd.concat([track_result_df, result], axis=0)
                print('done')
            except:
                print('Error: Track details not saved')
                skipped_tracks.append(track)
                print('skipped')


1 out of 98
done
2 out of 98
done
3 out of 98
done
4 out of 98
done
5 out of 98
done
6 out of 98
done
7 out of 98
done
8 out of 98
done
9 out of 98
done
10 out of 98
done
11 out of 98
done
12 out of 98
done
13 out of 98
done
14 out of 98
done
15 out of 98
done
16 out of 98
done
17 out of 98
done
18 out of 98
done
19 out of 98
done
20 out of 98
done
21 out of 98
done
22 out of 98
done
23 out of 98
done
24 out of 98
done
25 out of 98
done
26 out of 98
done
27 out of 98
done
28 out of 98
done
29 out of 98
done
30 out of 98
done
31 out of 98
done
32 out of 98
done
33 out of 98
done
34 out of 98
done
35 out of 98
done
36 out of 98
done
37 out of 98
done
38 out of 98
done
39 out of 98
done
40 out of 98
done
41 out of 98
done
42 out of 98
done
43 out of 98
done
44 out of 98
done
45 out of 98
done
46 out of 98
done
47 out of 98
done
48 out of 98
done
49 out of 98
done
50 out of 98
done
51 out of 98
done
52 out of 98
done
53 out of 98
done
54 out of 98
done
55 out of 98
done
56 out of 98
done
5

In [6]:
#Make a copy of the df for debugging purposes
track_result_df1 = track_result_df.copy()
track_result_df1 = track_result_df1.reset_index().drop(columns = ['index',0])
track_result_df1 = pd.concat([track_result_df1, uri['artistname']], axis=1)

track_result_df1

Unnamed: 0,acousticness,analysis_url,danceability,duration_ms,energy,id,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,track_href,type,uri,valence,artistname
0,0.12700,https://api.spotify.com/v1/audio-analysis/4PMx...,0.678,196213.0,0.899,4PMxOmJY2dEXFb9CcivqHT,0.000000,1.0,0.0730,-3.436,0.0,0.2770,110.070,4.0,https://api.spotify.com/v1/tracks/4PMxOmJY2dEX...,audio_features,spotify:track:4PMxOmJY2dEXFb9CcivqHT,0.7650,VERIVERY
1,0.09690,https://api.spotify.com/v1/audio-analysis/4iLq...,0.774,211475.0,0.626,4iLqG9SeJSnt0cSPICSjxv,0.000031,3.0,0.0848,-4.432,0.0,0.0432,100.041,4.0,https://api.spotify.com/v1/tracks/4iLqG9SeJSnt...,audio_features,spotify:track:4iLqG9SeJSnt0cSPICSjxv,0.7770,Charlie Puth
2,0.43100,https://api.spotify.com/v1/audio-analysis/3Jvr...,0.484,204280.0,0.731,3JvrhDOgAt6p7K8mDyZwRd,0.000000,1.0,0.1510,-6.694,1.0,0.0379,101.654,4.0,https://api.spotify.com/v1/tracks/3JvrhDOgAt6p...,audio_features,spotify:track:3JvrhDOgAt6p7K8mDyZwRd,0.5100,Vance Joy
3,0.00750,https://api.spotify.com/v1/audio-analysis/41KP...,0.802,169920.0,0.781,41KPpw0EZCytxNkmEMJVgr,0.825000,0.0,0.1470,-6.564,1.0,0.0368,125.026,4.0,https://api.spotify.com/v1/tracks/41KPpw0EZCyt...,audio_features,spotify:track:41KPpw0EZCytxNkmEMJVgr,0.6220,Swedish House Mafia
4,0.01130,https://api.spotify.com/v1/audio-analysis/1huv...,0.806,219840.0,0.608,1huvTbEYtgltjQRXzrNKGi,0.000000,1.0,0.6350,-7.008,1.0,0.0659,95.049,4.0,https://api.spotify.com/v1/tracks/1huvTbEYtglt...,audio_features,spotify:track:1huvTbEYtgltjQRXzrNKGi,0.8490,Ed Sheeran
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9751,0.40400,https://api.spotify.com/v1/audio-analysis/6kty...,0.564,175100.0,0.671,6ktyYQc64vYADA2hoatSmp,0.000000,6.0,0.6690,-4.676,1.0,0.0998,176.016,4.0,https://api.spotify.com/v1/tracks/6ktyYQc64vYA...,audio_features,spotify:track:6ktyYQc64vYADA2hoatSmp,0.6260,Ingrid Michaelson
9752,0.03130,https://api.spotify.com/v1/audio-analysis/03wv...,0.753,199653.0,0.815,03wvlNaxFKB9sge4U6vyPE,0.000002,5.0,0.1910,-4.857,1.0,0.0613,106.998,4.0,https://api.spotify.com/v1/tracks/03wvlNaxFKB9...,audio_features,spotify:track:03wvlNaxFKB9sge4U6vyPE,0.5670,TWICE
9753,0.88200,https://api.spotify.com/v1/audio-analysis/4LWd...,0.539,129600.0,0.579,4LWdKeliRPxiFLoLFN4X1B,0.941000,2.0,0.1510,-11.480,0.0,0.0310,100.010,4.0,https://api.spotify.com/v1/tracks/4LWdKeliRPxi...,audio_features,spotify:track:4LWdKeliRPxiFLoLFN4X1B,0.4750,Darren Korb
9754,0.02420,https://api.spotify.com/v1/audio-analysis/0jIV...,0.508,188473.0,0.572,0jIVoLsSYmMkfGnQyZANdK,0.000000,9.0,0.0982,-5.898,1.0,0.0575,128.121,4.0,https://api.spotify.com/v1/tracks/0jIVoLsSYmMk...,audio_features,spotify:track:0jIVoLsSYmMkfGnQyZANdK,0.1340,Arensky


In [7]:
#View any skipped track ids
skipped_tracks

[]

In [8]:
##########DEBUG: Check for duplicated ids in track
print('Number of duplicate tracks:', track_result_df1.duplicated(subset='id').sum())


Number of duplicate tracks: 0


In [9]:
#Format the track dataframe accordingly

track_result_df1 = track_result_df1.drop(['analysis_url',
                                         'track_href',
                                         'type',
                                         'uri'], axis=1)

track_result_df1

Unnamed: 0,acousticness,danceability,duration_ms,energy,id,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,artistname
0,0.12700,0.678,196213.0,0.899,4PMxOmJY2dEXFb9CcivqHT,0.000000,1.0,0.0730,-3.436,0.0,0.2770,110.070,4.0,0.7650,VERIVERY
1,0.09690,0.774,211475.0,0.626,4iLqG9SeJSnt0cSPICSjxv,0.000031,3.0,0.0848,-4.432,0.0,0.0432,100.041,4.0,0.7770,Charlie Puth
2,0.43100,0.484,204280.0,0.731,3JvrhDOgAt6p7K8mDyZwRd,0.000000,1.0,0.1510,-6.694,1.0,0.0379,101.654,4.0,0.5100,Vance Joy
3,0.00750,0.802,169920.0,0.781,41KPpw0EZCytxNkmEMJVgr,0.825000,0.0,0.1470,-6.564,1.0,0.0368,125.026,4.0,0.6220,Swedish House Mafia
4,0.01130,0.806,219840.0,0.608,1huvTbEYtgltjQRXzrNKGi,0.000000,1.0,0.6350,-7.008,1.0,0.0659,95.049,4.0,0.8490,Ed Sheeran
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9751,0.40400,0.564,175100.0,0.671,6ktyYQc64vYADA2hoatSmp,0.000000,6.0,0.6690,-4.676,1.0,0.0998,176.016,4.0,0.6260,Ingrid Michaelson
9752,0.03130,0.753,199653.0,0.815,03wvlNaxFKB9sge4U6vyPE,0.000002,5.0,0.1910,-4.857,1.0,0.0613,106.998,4.0,0.5670,TWICE
9753,0.88200,0.539,129600.0,0.579,4LWdKeliRPxiFLoLFN4X1B,0.941000,2.0,0.1510,-11.480,0.0,0.0310,100.010,4.0,0.4750,Darren Korb
9754,0.02420,0.508,188473.0,0.572,0jIVoLsSYmMkfGnQyZANdK,0.000000,9.0,0.0982,-5.898,1.0,0.0575,128.121,4.0,0.1340,Arensky


In [10]:
#####################ARTIST INFORMATION

#Verify there are no duplicate artist names in the set
artist_ser.duplicated().sum()

0

In [11]:
#Initialize the final dataframe
artist_info_df = pd.DataFrame()
skipped_artists = list()

#Search Spotify database with the artist name and return all results
#Parse through the results in order to find the one with the matching input artist name
#Extract the items from the selected search results and concatenate with artist_info_df
#######If any artists result in errors, saved them to skipped_artists

for i,artist in enumerate(artist_ser['artistname']):
    #Pause for a few seconds to avoid rate limiting every 50 searches
    if i % 50 == 0:
        time.sleep(5)
        
    print(i,artist)
    temp = sp.search(artist, type='artist')

    try:
        #Ensure that the correct search result is chosen to match the input artist
        for i,option in enumerate(temp['artists']['items']):
            if artist.lower() == option['name'].lower():
                correct_ind = i
                break
                
        print('RESULT:', temp['artists']['items'][correct_ind]['name'])
        
        artist_dict = temp['artists']['items'][correct_ind]
        row_concat = pd.concat([pd.DataFrame({'artistname':artist}, index = [0]), 
                                pd.json_normalize(artist_dict)], axis=1)
        artist_info_df = pd.concat([artist_info_df, row_concat], axis=0)
        print('done')
    
    except:
        skipped_artists.append(artist)
        print('############################################################################################# {}: {} skipped'.format(i, artist))
    


        
artist_info_df

0 VERIVERY
RESULT: VERIVERY
done
1 Charlie Puth
RESULT: Charlie Puth
done
2 Vance Joy
RESULT: Vance Joy
done
3 Swedish House Mafia
RESULT: Swedish House Mafia
done
4 Ed Sheeran
RESULT: Ed Sheeran
done
5 Louis The Child
RESULT: Louis The Child
done
6 Glass Animals
RESULT: Glass Animals
done
7 ILLENIUM
RESULT: ILLENIUM
done
8 Tinie Tempah
RESULT: Tinie Tempah
done
9 Gallant
RESULT: Gallant
done
10 BTS
RESULT: BTS
done
11 Sara Bareilles
RESULT: Sara Bareilles
done
12 Unknown Brain
RESULT: Unknown Brain
done
13 STAYC
RESULT: STAYC
done
14 Eric Nam
RESULT: Eric Nam
done
15 Nature Recordings
RESULT: Nature Recordings
done
16 Gryffin
RESULT: Gryffin
done
17 Zedd
RESULT: Zedd
done
18 Shura
RESULT: Shura
done
19 Jay Park
RESULT: Jay Park
done
20 Omarion
RESULT: Omarion
done
21 Steve Angello
RESULT: Steve Angello
done
22 Dog Blood
RESULT: Dog Blood
done
23 Yoo Jae Seok
RESULT: Yoo Jae Seok
done
24 Nicky Romero
RESULT: Nicky Romero
done
25 Chance the Rapper
RESULT: Chance the Rapper
done
26 Franc

Unnamed: 0,artistname,genres,href,id,images,name,popularity,type,uri,external_urls.spotify,followers.href,followers.total
0,VERIVERY,"[k-pop, k-pop boy group]",https://api.spotify.com/v1/artists/1fWUcRSok57...,1fWUcRSok57yRm8gPKj1Fc,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",VERIVERY,49,artist,spotify:artist:1fWUcRSok57yRm8gPKj1Fc,https://open.spotify.com/artist/1fWUcRSok57yRm...,,291598
0,Charlie Puth,"[dance pop, pop, viral pop]",https://api.spotify.com/v1/artists/6VuMaDnrHyP...,6VuMaDnrHyPL1p4EHjYLi7,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Charlie Puth,82,artist,spotify:artist:6VuMaDnrHyPL1p4EHjYLi7,https://open.spotify.com/artist/6VuMaDnrHyPL1p...,,16074801
0,Vance Joy,"[folk-pop, modern rock, pop, pop rock]",https://api.spotify.com/v1/artists/10exVja0key...,10exVja0key0uqUkk6LJRT,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Vance Joy,76,artist,spotify:artist:10exVja0key0uqUkk6LJRT,https://open.spotify.com/artist/10exVja0key0uq...,,2608487
0,Swedish House Mafia,"[edm, electro house, pop, pop dance, progressi...",https://api.spotify.com/v1/artists/1h6Cn3P4NGz...,1h6Cn3P4NGzXbaXidqURXs,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Swedish House Mafia,77,artist,spotify:artist:1h6Cn3P4NGzXbaXidqURXs,https://open.spotify.com/artist/1h6Cn3P4NGzXba...,,3558766
0,Ed Sheeran,"[pop, uk pop]",https://api.spotify.com/v1/artists/6eUKZXaKkcv...,6eUKZXaKkcviH0Ku9w2n3V,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Ed Sheeran,93,artist,spotify:artist:6eUKZXaKkcviH0Ku9w2n3V,https://open.spotify.com/artist/6eUKZXaKkcviH0...,,96929739
...,...,...,...,...,...,...,...,...,...,...,...,...
0,The Quotes,[],https://api.spotify.com/v1/artists/4NCIn5OqFte...,4NCIn5OqFteCOafSpZQCHj,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",The Quotes,0,artist,spotify:artist:4NCIn5OqFteCOafSpZQCHj,https://open.spotify.com/artist/4NCIn5OqFteCOa...,,135
0,Stormzy,"[grime, uk hip hop]",https://api.spotify.com/v1/artists/2SrSdSvpmin...,2SrSdSvpminqmStGELCSNd,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Stormzy,73,artist,spotify:artist:2SrSdSvpminqmStGELCSNd,https://open.spotify.com/artist/2SrSdSvpminqmS...,,2485154
0,VARGENTA,[],https://api.spotify.com/v1/artists/4hg56haGFxx...,4hg56haGFxxnKEt3AkW3fo,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",VARGENTA,54,artist,spotify:artist:4hg56haGFxxnKEt3AkW3fo,https://open.spotify.com/artist/4hg56haGFxxnKE...,,6743
0,Arensky,[],https://api.spotify.com/v1/artists/4FOWaU8YpEU...,4FOWaU8YpEU4SsLMMxVeza,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Arensky,37,artist,spotify:artist:4FOWaU8YpEU4SsLMMxVeza,https://open.spotify.com/artist/4FOWaU8YpEU4Ss...,,5285


In [92]:
#Make a copy of the df for debugging purposes
artist_info_df1 = artist_info_df.copy()
track_result_df2 = track_result_df1.copy()


In [93]:
#Checking for skipped artists
print('Input artists:', len(artist_ser))
print('Output artists:', len(artist_info_df))

print('{} artists were added to the skip list\n'.format(len(skipped_artists)))
for i,artist in enumerate(skipped_artists):
    print('{}: {}'.format(i+1,artist))

print('\nInput({}) - Output({}) = {}'.format(len(artist_ser), len(artist_info_df), len(artist_ser)-len(artist_info_df)))

#Delete artists from the combined list - the artist list already does not have them so no action needed
numskip = list(combined_df[combined_df['artistname'].isin(skipped_artists) == True].index)
combined_df2 = combined_df.drop(numskip)
print('combined_df length:  {} ----> {} (-{})'.format(len(combined_df), len(combined_df2), len(combined_df)-len(combined_df2)))

#Delete artists from the track list
numskip2 = list(track_result_df1[track_result_df1['artistname'].isin(skipped_artists) == True].index)
track_result_df2 = track_result_df1.drop(numskip2)
print('track_result length:  {} ----> {} (-{})'.format(len(track_result_df1), len(track_result_df2), len(track_result_df1)-len(track_result_df2)))


Input artists: 3073
Output artists: 3064
9 artists were added to the skip list

1: MM0100
2: Soul Nightcore
3: Liam Cacatian Thomassen
4: Your Daily Drive
5: Justin Zb
6: Zedd Rush
7: niki & cillia feat.flower
8: Ross-Emanuel Steven Bayeto
9: Jay Park & Ugly Duck

Input(3073) - Output(3064) = 9
combined_df length:  229285 ----> 229275 (-10)
track_result length:  9756 ----> 9747 (-9)


In [94]:
#Format the artist dataframe accordingly

artist_info_df1 = artist_info_df1.reset_index().drop(['index',
                                                     'href',
                                                     'images',
                                                     'type',
                                                     'uri',
                                                     'external_urls.spotify',
                                                     'followers.href'], axis = 1)

artist_info_df1 = artist_info_df1.rename(columns = {'followers.total':'followers'})

artist_info_df1

Unnamed: 0,artistname,genres,id,name,popularity,followers
0,VERIVERY,"[k-pop, k-pop boy group]",1fWUcRSok57yRm8gPKj1Fc,VERIVERY,49,291598
1,Charlie Puth,"[dance pop, pop, viral pop]",6VuMaDnrHyPL1p4EHjYLi7,Charlie Puth,82,16074801
2,Vance Joy,"[folk-pop, modern rock, pop, pop rock]",10exVja0key0uqUkk6LJRT,Vance Joy,76,2608487
3,Swedish House Mafia,"[edm, electro house, pop, pop dance, progressi...",1h6Cn3P4NGzXbaXidqURXs,Swedish House Mafia,77,3558766
4,Ed Sheeran,"[pop, uk pop]",6eUKZXaKkcviH0Ku9w2n3V,Ed Sheeran,93,96929739
...,...,...,...,...,...,...
3059,The Quotes,[],4NCIn5OqFteCOafSpZQCHj,The Quotes,0,135
3060,Stormzy,"[grime, uk hip hop]",2SrSdSvpminqmStGELCSNd,Stormzy,73,2485154
3061,VARGENTA,[],4hg56haGFxxnKEt3AkW3fo,VARGENTA,54,6743
3062,Arensky,[],4FOWaU8YpEU4SsLMMxVeza,Arensky,37,5285


In [95]:
###########DEBUG: Check for the artistname mismatching with the name
wrong_artist_df = artist_info_df1[artist_info_df1['artistname'].str.lower() != artist_info_df1['name'].str.lower()] #EXCEPT if the only difference is capitalization
wrong_artist_ind = list(wrong_artist_df.index)
wrong_artist_list = list(wrong_artist_df['artistname'])

print('Number of mismatched artists in results:', len(wrong_artist_ind))
print('Indices:', wrong_artist_ind)

for i,val in enumerate(zip(artist_info_df1.loc[wrong_artist_ind]['artistname'], artist_info_df1.loc[wrong_artist_ind]['name'])):
    print('{}: INPUT[{}], OUTPUT[{}]'.format(i+1,val[0], val[1]))

#Delete the duplicate
artist_info_df2 = artist_info_df1.drop(wrong_artist_ind)
combined_df3 = combined_df2.drop(list(combined_df2[combined_df2['artistname'].isin(wrong_artist_list)].index))

#Delete duplicates by id (if capitalization was the only difference, then if the ids match, there definitely is a duplicate)
dup_artistinfo_index = artist_info_df2[ artist_info_df2['id'].duplicated() == True].index
artist_info_df2 = artist_info_df2.drop(dup_artistinfo_index)

print('\n{} rows deleted from combined_df'.format(len(combined_df2)-len(combined_df3)))

Number of mismatched artists in results: 18
Indices: [211, 1010, 1300, 1457, 1613, 1632, 1726, 1756, 1852, 2329, 2355, 2462, 2633, 2728, 2797, 2803, 2872, 2990]
1: INPUT[Blak], OUTPUT[Blake Shelton]
2: INPUT[Camero], OUTPUT[Cameron Boyce]
3: INPUT[Jaz], OUTPUT[Jazmine Sullivan]
4: INPUT[Triad], OUTPUT[Triada]
5: INPUT[John Jorge], OUTPUT[John Jorgenson]
6: INPUT[CHAN 찬], OUTPUT[Chance the Rapper]
7: INPUT[Charlie], OUTPUT[Charlie Puth]
8: INPUT[M Milli], OUTPUT[Flo Milli]
9: INPUT[HA], OUTPUT[Halsey]
10: INPUT[Gemini's], OUTPUT[Geminis]
11: INPUT[Perk], OUTPUT[Parker McCollum]
12: INPUT[Myk], OUTPUT[Myke Towers]
13: INPUT[Kast], OUTPUT[Kastra]
14: INPUT[STOR], OUTPUT[Stormzy]
15: INPUT[Johnson], OUTPUT[Cody Johnson]
16: INPUT[リアムMAZE1981], OUTPUT[Yung Bae]
17: INPUT[A.Keys], OUTPUT[Akeos]
18: INPUT[Warren], OUTPUT[Warren Zeiders]

334 rows deleted from combined_df


In [96]:
#########DEBUG: Check for empty genres - Set empty genres to "U"
artist_info_df3 = artist_info_df2.copy()
artist_info_nogenre = list(artist_info_df2[artist_info_df2['genres'].str.len() == 0].index)
print('Number of artists without any genres saved on Spotify:', len(artist_info_nogenre))

#Add count of genres
artist_info_df3.loc[:,'genre_num'] = artist_info_df3.genres.str.len()

#Fill any empty artist genres with "U"
artist_info_df3.loc[artist_info_nogenre, 'genres'] = 'U'

#Add a column to record the primary genre (the first one listed in the genres column)
artist_info_df3['maingenre'] = artist_info_df3.genres.str[0]


artist_info_df3

Number of artists without any genres saved on Spotify: 537


Unnamed: 0,artistname,genres,id,name,popularity,followers,genre_num,maingenre
0,VERIVERY,"[k-pop, k-pop boy group]",1fWUcRSok57yRm8gPKj1Fc,VERIVERY,49,291598,2,k-pop
1,Charlie Puth,"[dance pop, pop, viral pop]",6VuMaDnrHyPL1p4EHjYLi7,Charlie Puth,82,16074801,3,dance pop
2,Vance Joy,"[folk-pop, modern rock, pop, pop rock]",10exVja0key0uqUkk6LJRT,Vance Joy,76,2608487,4,folk-pop
3,Swedish House Mafia,"[edm, electro house, pop, pop dance, progressi...",1h6Cn3P4NGzXbaXidqURXs,Swedish House Mafia,77,3558766,5,edm
4,Ed Sheeran,"[pop, uk pop]",6eUKZXaKkcviH0Ku9w2n3V,Ed Sheeran,93,96929739,2,pop
...,...,...,...,...,...,...,...,...
3059,The Quotes,U,4NCIn5OqFteCOafSpZQCHj,The Quotes,0,135,0,U
3060,Stormzy,"[grime, uk hip hop]",2SrSdSvpminqmStGELCSNd,Stormzy,73,2485154,2,grime
3061,VARGENTA,U,4hg56haGFxxnKEt3AkW3fo,VARGENTA,54,6743,0,U
3062,Arensky,U,4FOWaU8YpEU4SsLMMxVeza,Arensky,37,5285,0,U


In [97]:
################# NEW SECTION - Count the number of each individual genre instance per artist
#Create a new dataframe that encodes the genre column (like one hot encoder)
artist_info_df4 = artist_info_df3.copy()

artist_genre = artist_info_df3[['artistname','genres']]
artist_genre = artist_genre.join(artist_genre['genres'].str.join('|').str.get_dummies()).drop(columns = 'genres')
artist_genre = artist_genre.rename(columns = {'U':'none'})

#Then replace "U" with none in the artist dataframe as well
artist_info_df4.loc[artist_info_df4['maingenre'] == 'U', 'maingenre'] = 'none'

artist_info_df4.isna().sum()

artistname    0
genres        0
id            0
name          0
popularity    0
followers     0
genre_num     0
maingenre     0
dtype: int64

In [98]:
#Save a df containing all genres and their counts
genre_count = pd.DataFrame(artist_genre.drop(columns='artistname').apply(sum, axis=0)).rename(columns={0:'count'})

genre_count

Unnamed: 0,count
8-bit,1
none,537
a cappella,2
abstract beats,2
abstract hip hop,1
...,...
world worship,4
worship,6
yacht rock,4
zen,1


In [99]:
###Add the main genre to the track df and combined df

#Extract a series where the artist in the index and the genres are the values, then convert this to a dictionary
maingenre = artist_info_df4.copy().set_index('artistname')['maingenre'].reset_index()
artist_genre_dict = {}

for (artist,genre) in zip(maingenre['artistname'],maingenre['maingenre']):
    artist_genre_dict.update({artist:genre})
    
#Map the dictionary to the df
track_result_df2['genre'] = track_result_df2['artistname'].map(artist_genre_dict)

#Similarly, map the dictionary to the combined df
combined_df3['genre'] = combined_df3['artistname'].map(artist_genre_dict)


In [107]:
####### This section deletes sleep tracks and artists from the dataset which is a fairly personal issue - if these don't exist, then this will be ignored
combined_df4 = combined_df3.copy()
track_result_df3 = track_result_df2.copy()
artist_info_df5 = artist_info_df4.copy()
genre_count2 = genre_count.copy()


try:
    #Delete rows from all DFs pertaining to sleep music (No interest in this information)
    sleep_mus = ['environmental','asmr','rain','sleep','sound','water','meditation','singing bowl','shush',
                   'guided meditation', 'hypnosis','tone','sound effects','zen']

    #Specific sleep artists pulled from the dataframe that are not classified to a genre (and so sleep_mus misses them)
    sleep_artists_specific = ['Lluvia PQ','Winds and Oceans','Sky Fall SF', 'Thunderbound Productions','Natural Sound Selections',
                             'The Universal Monk']

    #Delete from genre list
    print('Genre Count')
    genre_count2 = genre_count.drop(sleep_mus, axis=0)#.reset_index(drop=True)
    print('{} ---> {}  ({} rows deleted)'.format(len(genre_count), len(genre_count2), len(genre_count)-len(genre_count2)))

    #Delete from artist df
    print('\nArtist DF')
    sleep_artists = artist_info_df4.loc[ artist_info_df4['maingenre'].isin(sleep_mus) ,]['artistname'] #Delete from sleep_mus
    artist_info_df5 = artist_info_df4.drop(sleep_artists.index).reset_index(drop=True)
    sleep_artists2 = artist_info_df5.loc[ artist_info_df5['artistname'].str.lower().isin(sleep_artists_specific) ,]['artistname'] #Delete from sleep_artists_specific
    artist_info_df6 = artist_info_df5.drop(sleep_artists2.index).reset_index(drop=True)
    print('Step 1: {} ---> {}  ({} rows deleted)'.format(len(artist_info_df4), len(artist_info_df5), len(artist_info_df4)-len(artist_info_df5)))
    print('Step 2: {} ---> {}  ({} rows deleted)'.format(len(artist_info_df5), len(artist_info_df6), len(artist_info_df5)-len(artist_info_df6)))

    #Delete from track df
    print('\nTrack DF')
    sleep_ids = track_result_df2.loc[ track_result_df2['artistname'].isin(list(sleep_artists)) ,]['id']
    track_result_df3 = track_result_df2.drop(sleep_ids.index).reset_index(drop=True)
    sleep_ids2 = track_result_df3.loc[ track_result_df3['artistname'].isin(list(sleep_artists_specific)) ,]['id']
    track_result_df4 = track_result_df3.drop(sleep_ids2.index).reset_index(drop=True)
    print('Step 1: {} ---> {}  ({} rows deleted)'.format(len(track_result_df2), len(track_result_df3), len(track_result_df2)-len(track_result_df3)))
    print('Step 2: {} ---> {}  ({} rows deleted)'.format(len(track_result_df3), len(track_result_df4), len(track_result_df3)-len(track_result_df4)))

    #Load combined_df and delete the ids from there
    #combined_df = pd.read_csv('combined_data.csv', index_col=0)
    print('\nCombined DF')
    sleep_artists_all = combined_df3.loc[ combined_df3['artistname'].isin(sleep_artists), ]['artistname']
    combined_df4 = combined_df3.drop(sleep_artists_all.index).reset_index(drop=True)
    sleep_artists_all = combined_df4.loc[ combined_df4['artistname'].isin(sleep_artists_specific), ]['artistname']
    combined_df5 = combined_df4.drop(sleep_artists_all.index).reset_index(drop=True)
    print('Step 1: {} ---> {}  ({} rows deleted)'.format(len(combined_df3), len(combined_df4), len(combined_df3)-len(combined_df4)))
    print('Step 2: {} ---> {}  ({} rows deleted)'.format(len(combined_df4), len(combined_df5), len(combined_df4)-len(combined_df5)))

    ###OUTPUT statistics
    print('\nNumber of sleep genres: {}'.format(len(sleep_mus)))
    print(sleep_mus)

    #print('\n')
    #print('Number of artists: {}'.format(len(sleep_artists)))
    #print('Number of songs: {}'.format(len(sleep_ids)))
    #print('\n')
    #print('Number of streams deleted: {}'.format(len(sleep_artists_all)))

    genre_count2.to_csv('genre_count.csv')
    artist_info_df6.to_csv('artist_info.csv')
    track_result_df4.to_csv('track_info.csv')
    combined_df5.to_csv('combined_df.csv')
    
except:
    
    genre_count.to_csv('genre_count.csv')
    artist_info_df4.to_csv('artist_info.csv')
    track_result_df2.to_csv('track_info.csv')
    combined_df2.to_csv('combined_df.csv')
    
    pass

#combined_df[ combined_df['artistname'].isin(sleep_artists_specific)]

Genre Count
1013 ---> 999  (14 rows deleted)

Artist DF
Step 1: 3044 ---> 2926  (118 rows deleted)
Step 2: 2926 ---> 2926  (0 rows deleted)

Track DF
Step 1: 9747 ---> 9483  (264 rows deleted)
Step 2: 9483 ---> 9472  (11 rows deleted)

Combined DF
Step 1: 228941 ---> 197432  (31509 rows deleted)
Step 2: 197432 ---> 194965  (2467 rows deleted)

Number of sleep genres: 14
['environmental', 'asmr', 'rain', 'sleep', 'sound', 'water', 'meditation', 'singing bowl', 'shush', 'guided meditation', 'hypnosis', 'tone', 'sound effects', 'zen']
