[Spotify API Search](https://developer.spotify.com/documentation/web-api/reference/search/search/)

In [2]:
from __future__ import print_function    # (at top of module)
import json
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy.util as util
import time
import sys
import pandas as pd
import numpy as np
import re

In [3]:
%load_ext dotenv
%dotenv

In [4]:
import os
cid = os.environ.get('SPOTIPY_CLIENT_ID')
secret = os.environ.get('SPOTIPY_CLIENT_SECRET')
username = os.environ.get('SPOTIFY_USER_ID')
client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)     
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [5]:
def users_to_playlists(users):
    playlists = []
    for user in users:
        try:
            for item in sp.user_playlists(user)['items']:
                try:
                    playlists.append({'user_id':user, 'playlist_id':item['id'], 'added_by_id':item['owner']['id']})
                except:
                    print('except playlist')
        except:
            print('except user')
        print('user_complete')
    return playlists


In [6]:
def parse_playlist_data(userID, playlistID, limit = 100, fields = 'items,total,next'):
    '''
    Used to get information about the tracks in spotify playlist
    
    Arguments:
    userID is a spotify user ID
    playlistID is a spotify playlist ID
    limit is the number of tracks to retrieve in each call. "Next" is used to get all songs. 100 is the max to
        retrieve at once
    fields is the fields to retrieve for each playlist call
    
    Returns:
    A 2D list where each row is a dictionary corresponding to a song
    the keys of this dictionary are: user_id, playlist_id, added_by_id, album_name,
                album_id, artist_names, artist_ids, track_name, track_id, date_added
    '''
    playlist = sp.user_playlist_tracks(userID,
                            playlist_id=playlistID,
                            fields = fields,
                            limit = limit)
    test_data_list = []
    for i in range(len(playlist['items'])):
        artist_name_list = []
        artist_id_list = []
        added_by_id = playlist['items'][i]['added_by']['id']
        added_at = playlist['items'][i]['added_at']
        curr = playlist['items'][i]['track']
        for j in range(len(curr['artists'])):
            artist_name_list.append(curr['artists'][j]['name'])
            artist_id_list.append(curr['artists'][j]['id'])
        track_name = curr['name']
        track_id = curr['id']
        album_name = curr['album']['name']
        album_id = curr['album']['id']
        test_data_list.append({'user_id': userID,
                               'playlist_id': playlistID,
                               'added_by_id': added_by_id,
                               'album_name': album_name,
                               'album_id': album_id,
                               'artist_names':artist_name_list,
                               'artist_ids':artist_id_list,
                               'track_name': track_name,
                               'track_id': track_id,
                               'date_added': added_at
                              })
    while playlist['next']:
        playlist = sp.next(playlist)
        for i in range(len(playlist['items'])):
            artist_name_list = []
            artist_id_list = []
            added_by_id = playlist['items'][i]['added_by']['id']
            added_at = playlist['items'][i]['added_at']
            curr = playlist['items'][i]['track']
            for j in range(len(curr['artists'])):
                artist_name_list.append(curr['artists'][j]['name'])
                artist_id_list.append(curr['artists'][j]['id'])
            track_name = curr['name']
            track_id = curr['id']
            album_name = curr['album']['name']
            album_id = curr['album']['id']
            test_data_list.append({'user_id': userID,
                                   'playlist_id': playlistID,
                                   'added_by_id': added_by_id,
                                   'album_name': album_name,
                                   'album_id': album_id,
                                   'artist_names':artist_name_list,
                                   'artist_ids':artist_id_list,
                                   'track_name': track_name,
                                   'track_id': track_id,
                                   'date_added': added_at
                                  })
    return test_data_list

In [22]:
df = pd.read_json('users_9_15_19.json')

In [23]:
df.shape

(428, 1)

In [24]:
users = list(df[0])

In [25]:
playlists = users_to_playlists(users)

user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_complete
user_c

In [34]:
df_playlists = pd.DataFrame(playlists)

In [36]:
df_playlists.shape

(8562, 3)

In [37]:
df_playlists.to_json('playlists_9_18_19.json')

In [38]:
df_playlists.head()

Unnamed: 0,added_by_id,playlist_id,user_id
0,technozem,5tW6vGqn4Z2oPxpgQSncKD,technozem
1,technozem,2zDMz1GFzSpjHT7kU4Sjqr,technozem
2,technozem,4085JGoRIJkqXboXz9Qnuw,technozem
3,technozem,0K3Z661zDkEstO4sVclqiY,technozem
4,technozem,4AehzXKZqJ5VrqPHpsmN6E,technozem


In [40]:
df_playlists[df_playlists.added_by_id != df_playlists.user_id].shape

(1066, 3)

In [44]:
playlist_curr = list(df_playlists.playlist_id)
user_curr = list(df_playlists.user_id)
feed = list(zip(user_curr, playlist_curr))

In [46]:
track_data = []
for user, playlist in feed:
    try:
        track_data.append(parse_playlist_data(user, playlist))
    except Exception as e:
        print(e)

'NoneType' object is not subscriptable
'artists'
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
http status: 404, code:-1 - https://api.spotify.com/v1/users/1243463050/playlists/37i9dQZF1E4kfGdEIPByvv/tracks?limit=100&offset=0&fields=items%2Ctotal%2Cnext:
 Not found.
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'artists'
'artists'
'NoneType' object is not subscriptable
'artists'
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
retrying ...1secs


In [49]:
len(track_data)

8538

In [51]:
pd.DataFrame(track_data).to_json('track_data_9_18_19.json')

In [53]:
pd.DataFrame(track_data).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,"{'user_id': 'technozem', 'playlist_id': '5tW6v...","{'user_id': 'technozem', 'playlist_id': '5tW6v...","{'user_id': 'technozem', 'playlist_id': '5tW6v...","{'user_id': 'technozem', 'playlist_id': '5tW6v...","{'user_id': 'technozem', 'playlist_id': '5tW6v...","{'user_id': 'technozem', 'playlist_id': '5tW6v...","{'user_id': 'technozem', 'playlist_id': '5tW6v...","{'user_id': 'technozem', 'playlist_id': '5tW6v...","{'user_id': 'technozem', 'playlist_id': '5tW6v...","{'user_id': 'technozem', 'playlist_id': '5tW6v...",...,,,,,,,,,,
1,"{'user_id': 'technozem', 'playlist_id': '2zDMz...","{'user_id': 'technozem', 'playlist_id': '2zDMz...","{'user_id': 'technozem', 'playlist_id': '2zDMz...","{'user_id': 'technozem', 'playlist_id': '2zDMz...","{'user_id': 'technozem', 'playlist_id': '2zDMz...","{'user_id': 'technozem', 'playlist_id': '2zDMz...","{'user_id': 'technozem', 'playlist_id': '2zDMz...","{'user_id': 'technozem', 'playlist_id': '2zDMz...","{'user_id': 'technozem', 'playlist_id': '2zDMz...","{'user_id': 'technozem', 'playlist_id': '2zDMz...",...,,,,,,,,,,
2,"{'user_id': 'technozem', 'playlist_id': '4085J...","{'user_id': 'technozem', 'playlist_id': '4085J...","{'user_id': 'technozem', 'playlist_id': '4085J...","{'user_id': 'technozem', 'playlist_id': '4085J...","{'user_id': 'technozem', 'playlist_id': '4085J...","{'user_id': 'technozem', 'playlist_id': '4085J...","{'user_id': 'technozem', 'playlist_id': '4085J...","{'user_id': 'technozem', 'playlist_id': '4085J...","{'user_id': 'technozem', 'playlist_id': '4085J...","{'user_id': 'technozem', 'playlist_id': '4085J...",...,,,,,,,,,,
3,"{'user_id': 'technozem', 'playlist_id': '0K3Z6...","{'user_id': 'technozem', 'playlist_id': '0K3Z6...","{'user_id': 'technozem', 'playlist_id': '0K3Z6...","{'user_id': 'technozem', 'playlist_id': '0K3Z6...","{'user_id': 'technozem', 'playlist_id': '0K3Z6...","{'user_id': 'technozem', 'playlist_id': '0K3Z6...","{'user_id': 'technozem', 'playlist_id': '0K3Z6...","{'user_id': 'technozem', 'playlist_id': '0K3Z6...","{'user_id': 'technozem', 'playlist_id': '0K3Z6...","{'user_id': 'technozem', 'playlist_id': '0K3Z6...",...,,,,,,,,,,
4,"{'user_id': 'technozem', 'playlist_id': '4Aehz...","{'user_id': 'technozem', 'playlist_id': '4Aehz...","{'user_id': 'technozem', 'playlist_id': '4Aehz...","{'user_id': 'technozem', 'playlist_id': '4Aehz...","{'user_id': 'technozem', 'playlist_id': '4Aehz...","{'user_id': 'technozem', 'playlist_id': '4Aehz...","{'user_id': 'technozem', 'playlist_id': '4Aehz...","{'user_id': 'technozem', 'playlist_id': '4Aehz...","{'user_id': 'technozem', 'playlist_id': '4Aehz...","{'user_id': 'technozem', 'playlist_id': '4Aehz...",...,,,,,,,,,,


In [54]:
new_list = sum(track_data,[])

In [55]:
df_all_test = pd.DataFrame(new_list)

In [56]:
df_all_test.head()

Unnamed: 0,added_by_id,album_id,album_name,artist_ids,artist_names,date_added,playlist_id,track_id,track_name,user_id
0,technozem,0d0ONE5rak6Q91XjwKcJvN,Vurstep,"[7ugvHO0W3IoAWzOgKrHxqr, 0K1lHu1BP65Z1DErnljxUw]","[Appleblim, Forest Drive West]",2018-12-04T21:27:55Z,5tW6vGqn4Z2oPxpgQSncKD,5huzNc7H2kv2qNq7kqscHY,Vurstep - Forest Drive West Remix,technozem
1,technozem,2qz8u01gOb8Lb7KaTR90DQ,Marble,"[7asRTH6SKIMKZZ59Iw2eA5, 2iWOFT9U8InefnarwZUmv0]","[Gnork, Douala]",2018-12-12T21:33:25Z,5tW6vGqn4Z2oPxpgQSncKD,7Hhfn4AkePvX0jW4jOJPQ3,Space Jam (feat. Douala),technozem
2,technozem,0MhqzH66vEpY9fXdEz2zV4,Named,[7d51SgJwoUOPISsskT0lD4],[EOD],2018-12-05T10:18:36Z,5tW6vGqn4Z2oPxpgQSncKD,34QLCZF2VtlHLPNkTH58bZ,Exham Priory - Original Mix,technozem
3,technozem,0C8vASB72lAeufmEymy0G4,Dull Clunk,[4n9YVkzW4gl64xbOSLqeEl],[Garies],2018-12-27T20:30:53Z,5tW6vGqn4Z2oPxpgQSncKD,5dE3OUB3RhccA3dNhxc3OW,Modified By This,technozem
4,technozem,79c2iZmw26LF2jhafHQPIl,Sick Parrots,[6YlLUbef5d4ptmxkMKsrlK],[Voiski],2018-12-12T07:38:36Z,5tW6vGqn4Z2oPxpgQSncKD,6aMvQmtQKOPTc022sDM0Tv,Blue Flag,technozem


In [7]:
df_all_test = pd.read_json('track_data_9_18_19.json')

In [8]:
df_all_test.head()

Unnamed: 0,added_by_id,album_id,album_name,artist_ids,artist_names,date_added,playlist_id,track_id,track_name,user_id,artists_join,artist_and_track,rating
0,technozem,0d0ONE5rak6Q91XjwKcJvN,Vurstep,"[7ugvHO0W3IoAWzOgKrHxqr, 0K1lHu1BP65Z1DErnljxUw]","[Appleblim, Forest Drive West]",2018-12-04T21:27:55Z,5tW6vGqn4Z2oPxpgQSncKD,5huzNc7H2kv2qNq7kqscHY,Vurstep - Forest Drive West Remix,technozem,Appleblim___Forest Drive West,Appleblim___Forest Drive West|||||Vurstep - Fo...,1
1,technozem,2qz8u01gOb8Lb7KaTR90DQ,Marble,"[7asRTH6SKIMKZZ59Iw2eA5, 2iWOFT9U8InefnarwZUmv0]","[Gnork, Douala]",2018-12-12T21:33:25Z,5tW6vGqn4Z2oPxpgQSncKD,7Hhfn4AkePvX0jW4jOJPQ3,Space Jam (feat. Douala),technozem,Gnork___Douala,Gnork___Douala|||||Space Jam (feat. Douala),1
10,technozem,4JooAi5hHhUuJNrbiseE8X,At the Controls,[68Wb5Pcy71lLaKdIB6cBA5],[Breakage],2018-12-11T20:02:07Z,5tW6vGqn4Z2oPxpgQSncKD,5NLRRESuSXLQTPgdqAECCq,Rudeboy Stuff,technozem,Breakage,Breakage|||||Rudeboy Stuff,1
100,technozem,6fXNAGf1ihG0B0Sck2Mo1l,Metropolis,[0Ij7th9uWcDVYNAIOn5W22],[Kornél Kovács],2018-12-29T17:28:10Z,5tW6vGqn4Z2oPxpgQSncKD,0ajH7MyTiRhjUbZVYVgRPc,Panda,technozem,Kornél Kovács,Kornél Kovács|||||Panda,1
1000,technozem,3YQASaeJPm3OxUSCP6Qfo9,GROEF - Des avonds in klein maneschijn,[7DksXfhuJLdqtyHnoKCJLI],[GROEF],2011-04-15T16:06:11Z,4AehzXKZqJ5VrqPHpsmN6E,6sihMoVzSFhHtr2p556f5Z,Jan mijne man/Andro/Andro GROEF,technozem,GROEF,GROEF|||||Jan mijne man/Andro/Andro GROEF,1


In [68]:
df_all_test['rating'] = 1

In [63]:
df_all_test['artists_join'] = df_all_test.artist_names.apply(lambda x: '___'.join(x))

In [65]:
df_all_test['artist_and_track'] = df_all_test.artists_join + '|||||' + df_all_test.track_name

In [None]:
df_all_test = df_all_test[df_all_test.duplicated(subset='artist_and_track', keep=False)] # run to get rid of songs that only occur once

In [14]:
df_all_test.head()

Unnamed: 0,added_by_id,album_id,album_name,artist_ids,artist_names,date_added,playlist_id,track_id,track_name,user_id,rating,artists_join,artist_and_track
0,technozem,0d0ONE5rak6Q91XjwKcJvN,Vurstep,"[7ugvHO0W3IoAWzOgKrHxqr, 0K1lHu1BP65Z1DErnljxUw]","[Appleblim, Forest Drive West]",2018-12-04T21:27:55Z,5tW6vGqn4Z2oPxpgQSncKD,5huzNc7H2kv2qNq7kqscHY,Vurstep - Forest Drive West Remix,technozem,1,Appleblim___Forest Drive West,Appleblim___Forest Drive West|||||Vurstep - Fo...
1,technozem,2qz8u01gOb8Lb7KaTR90DQ,Marble,"[7asRTH6SKIMKZZ59Iw2eA5, 2iWOFT9U8InefnarwZUmv0]","[Gnork, Douala]",2018-12-12T21:33:25Z,5tW6vGqn4Z2oPxpgQSncKD,7Hhfn4AkePvX0jW4jOJPQ3,Space Jam (feat. Douala),technozem,1,Gnork___Douala,Gnork___Douala|||||Space Jam (feat. Douala)
10,technozem,4JooAi5hHhUuJNrbiseE8X,At the Controls,[68Wb5Pcy71lLaKdIB6cBA5],[Breakage],2018-12-11T20:02:07Z,5tW6vGqn4Z2oPxpgQSncKD,5NLRRESuSXLQTPgdqAECCq,Rudeboy Stuff,technozem,1,Breakage,Breakage|||||Rudeboy Stuff
100,technozem,6fXNAGf1ihG0B0Sck2Mo1l,Metropolis,[0Ij7th9uWcDVYNAIOn5W22],[Kornél Kovács],2018-12-29T17:28:10Z,5tW6vGqn4Z2oPxpgQSncKD,0ajH7MyTiRhjUbZVYVgRPc,Panda,technozem,1,Kornél Kovács,Kornél Kovács|||||Panda
1000,technozem,3YQASaeJPm3OxUSCP6Qfo9,GROEF - Des avonds in klein maneschijn,[7DksXfhuJLdqtyHnoKCJLI],[GROEF],2011-04-15T16:06:11Z,4AehzXKZqJ5VrqPHpsmN6E,6sihMoVzSFhHtr2p556f5Z,Jan mijne man/Andro/Andro GROEF,technozem,1,GROEF,GROEF|||||Jan mijne man/Andro/Andro GROEF


In [15]:
# df_all_test.to_json('track_data_9_18_19.json')

In [3]:
df_all_test = pd.read_json('track_data_9_18_19.json')

In [58]:
df_all_test.shape

(1022972, 10)

In [59]:
df_all_test.track_id.nunique()

497590

In [60]:
track_counts = df_all_test.track_id.value_counts()

In [61]:
track_counts

00HMEiBMDkbNCHLJL6zqVk    422
2374M0fQpWi3dLnB54qaLX    409
24CXuh2WNpgeSYUOvz14jk    168
2xLMifQCjDGFmkHkpNLD9h    155
0bYg9bo50gSsH3LtXe2SQn    143
2Fxmhks0bxGSBdJ92vM42m    138
7w87IxuO7BDcJ3YUqCyMTT    114
7ycWLEP1GsNjVvcjawXz3z    112
7dt6x5M1jzdTEt8oCbisTK    111
7sO5G9EABYOXQKNPNiE9NR    110
6K4t31amVTZDgR3sKmwUJJ    110
2JvzF1RMd7lE3KmFlsyZD8    110
40riOy7x9W7GXjyGp4pjAv    109
3FtYbEfBqAlGO46NUDQSAt    106
4qKcDkK6siZ7Jp1Jb4m0aL    106
1jJci4qxiYcOHhQR247rEU    106
6MWtB6iiXyIwun0YzU6DFP    103
7m9OqQk4RVRkw9JJdeAw96    103
3KkXRkHbMCARz0aVfEt68P    101
2IRZnDFmlqMuOrYOLnZZyc    101
08mG3Y1vljYA6bvDt4Wqkj    101
0I3q5fE6wg7LIfHGngUTnV    101
6gBFPUFcJLzWGx4lenP6h2    101
2dpaYNEQHiRxtZbfNsse99    100
0d28khcov6AiegSCpG5TuT     99
7DcvwMAiqKJQD1rrdfxSDx     99
5hVghJ4KaYES3BFUATCYn0     95
2aoo2jlRnM3A0NyLQqMN2f     94
7BKLCZ1jbUBVqRi2FVlTVw     92
2WfaOiMkCvy7F5fcp2zZ8L     91
                         ... 
4GVyG4iljpmVOt1CR367o7      1
1X2npDWQpg9lvM8MOw2mqm      1
7x40qXAXv4

In [19]:
frequent_tracks.index

Index(['0bYg9bo50gSsH3LtXe2SQn', '2xLMifQCjDGFmkHkpNLD9h',
       '2JvzF1RMd7lE3KmFlsyZD8', '2374M0fQpWi3dLnB54qaLX',
       '40riOy7x9W7GXjyGp4pjAv', '7sO5G9EABYOXQKNPNiE9NR',
       '7w87IxuO7BDcJ3YUqCyMTT', '2dpaYNEQHiRxtZbfNsse99',
       '7ycWLEP1GsNjVvcjawXz3z', '2IRZnDFmlqMuOrYOLnZZyc',
       '7m9OqQk4RVRkw9JJdeAw96', '2Fxmhks0bxGSBdJ92vM42m',
       '7dt6x5M1jzdTEt8oCbisTK', '4qKcDkK6siZ7Jp1Jb4m0aL',
       '6gBFPUFcJLzWGx4lenP6h2', '1LzNfuep1bnAUR9skqdHCK',
       '08mG3Y1vljYA6bvDt4Wqkj', '1jJci4qxiYcOHhQR247rEU',
       '0rTV5WefWd1J3OwIheTzxM', '2aoo2jlRnM3A0NyLQqMN2f',
       '6MWtB6iiXyIwun0YzU6DFP', '0I3q5fE6wg7LIfHGngUTnV'],
      dtype='object')

In [10]:
frequent_tracks = track_counts[track_counts > 70]

In [25]:
df_frequent = pd.DataFrame(frequent_tracks.index)
df_frequent.columns = ['track_id']

In [34]:
df_frequent = df_frequent.set_index(df_frequent.track_id)

In [38]:
df_frequent.rename(columns={'track_id': 'test_track_id'}, inplace=True)

In [73]:
all_tops = df_frequent.join(df_all_test[['track_name', 'track_id', 'artist_names']].set_index('track_id'), on = 'track_id')

In [84]:
df_all_working = pd.DataFrame(all_tops.apply(lambda x: pd.Series(x['artist_names']),axis=1).stack().reset_index(level=1, drop=True)).join(df_all_test[['track_name', 'track_id']].set_index('track_id'), on = 'track_id')


In [85]:
df_all_working.drop_duplicates()

Unnamed: 0_level_0,0,track_name
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0bYg9bo50gSsH3LtXe2SQn,Mariah Carey,All I Want for Christmas Is You
2xLMifQCjDGFmkHkpNLD9h,Travis Scott,SICKO MODE
2JvzF1RMd7lE3KmFlsyZD8,J. Cole,MIDDLE CHILD
2374M0fQpWi3dLnB54qaLX,Toto,Africa
40riOy7x9W7GXjyGp4pjAv,Eagles,Hotel California - 2013 Remaster
7sO5G9EABYOXQKNPNiE9NR,Offset,Ric Flair Drip (& Metro Boomin)
7sO5G9EABYOXQKNPNiE9NR,Metro Boomin,Ric Flair Drip (& Metro Boomin)
7w87IxuO7BDcJ3YUqCyMTT,Foster The People,Pumped Up Kicks
2dpaYNEQHiRxtZbfNsse99,Marshmello,Happier
2dpaYNEQHiRxtZbfNsse99,Bastille,Happier
