# Data Collection

Spotify Million Playlist Dataset

In [1]:
import requests
import pandas as pd
import os
import json
import copy
import datetime

In [2]:
dataset_dir = os.path.join('..','..','datasets','spotify_million_playlist_dataset','data')

## Load a single JSON as a dataframe


In [3]:
slice_start = 0
slice_end = 999

json_file_name = f'mpd.slice.{slice_start}-{slice_end}.json'

json_file_path = os.path.join(dataset_dir,json_file_name)

with open(json_file_path, 'r') as f:
    json_file = json.load(f)

playlists = json_file['playlists']

In [4]:
len(playlists)

1000

In [5]:
p_ix = 0

playlists[p_ix]['name']

'Throwbacks'

In [6]:
t_ix = 0
playlists[p_ix]['tracks'][t_ix]

{'pos': 0,
 'artist_name': 'Missy Elliott',
 'track_uri': 'spotify:track:0UaMYEvWZi0ZqiDOoHU3YI',
 'artist_uri': 'spotify:artist:2wIVse2owClT7go1WT98tk',
 'track_name': 'Lose Control (feat. Ciara & Fat Man Scoop)',
 'album_uri': 'spotify:album:6vV5UrXcfyQD1wu4Qo2I9K',
 'duration_ms': 226863,
 'album_name': 'The Cookbook'}

In [7]:
data = []
row = {
    "track_name":"",
    "artist_name":"",
    "album_name":"",
    "playlist_name":"",
    "track_uri":"",
    "artist_uri":"",
}

for p_ix,playlist in enumerate(playlists):
    row['playlist_name'] = playlist['name']
    for track in playlist['tracks']:
        row['track_name'] = track['track_name']
        row['artist_name'] = track['artist_name']
        row['album_name'] = track['album_name']
        row['track_uri'] = track['track_uri']
        row['artist_uri'] = track['artist_uri']
        data.append(copy.deepcopy(row))

mp_df = pd.DataFrame(data = data)
mp_df.head()

Unnamed: 0,track_name,artist_name,album_name,playlist_name,track_uri,artist_uri
0,Lose Control (feat. Ciara & Fat Man Scoop),Missy Elliott,The Cookbook,Throwbacks,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk
1,Toxic,Britney Spears,In The Zone,Throwbacks,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,spotify:artist:26dSoYclwsYLMAKD3tpOr4
2,Crazy In Love,Beyoncé,Dangerously In Love (Alben für die Ewigkeit),Throwbacks,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,spotify:artist:6vWDO969PvNqNYHIOW5v0m
3,Rock Your Body,Justin Timberlake,Justified,Throwbacks,spotify:track:1AWQoqb9bSvzTjaLralEkT,spotify:artist:31TPClRtHm23RisEBtV3X7
4,It Wasn't Me,Shaggy,Hot Shot,Throwbacks,spotify:track:1lzr43nnXAijIGYnCT8M8H,spotify:artist:5EvFsr3kj42KNv97ZEnqij


In [8]:
len(mp_df['track_uri'].unique())

34443

In [9]:
len(mp_df['track_name'].unique())

30049

In [10]:
mp_df['playlist_name'].unique()[:10]

array(['Throwbacks', 'Awesome Playlist', 'korean ', 'mat', '90s',
       'Wedding', 'I Put A Spell On You', '2017', 'BOP', 'old country '],
      dtype=object)

## Diagnose Track Name vs Track URI

Why are there more unique track_uris than track_names?

In [11]:
n_names = len(mp_df['track_name'].unique())

multi_id_tracks = []
no_id_tracks = []
unique_tracks = 0

for i,track_name in enumerate(mp_df['track_name'].unique()):
    if (i*100/n_names) % 10 == 0:
        print((i*100/n_names))
    filter = mp_df['track_name'] == track_name
    if len(mp_df[filter]['track_uri'].unique()) > 1:
        multi_id_tracks.append({
            "track_name":track_name,
            "track_ids":list(mp_df[filter]['track_uri'].unique()),
            "albums":list(mp_df[filter]['album_name'].unique())
            })
        break
    elif len(mp_df[filter]['track_uri'].unique()) == 0:
        no_id_tracks.append(track_name)
        break
    else:
        unique_tracks += 1

0.0


In [12]:
len(multi_id_tracks)

1

In [13]:
multi_id_tracks

[{'track_name': 'Toxic',
  'track_ids': ['spotify:track:6I9VzXrHxO9rA9A5euc8Ak',
   'spotify:track:34ccBqL3xNaCzPxr0UqoEw'],
  'albums': ['In The Zone', 'Rattlesnake!']}]

### Conclusion

Tracks with the same name and artist can still belong to different albums.

Moving forward (track_name, artist) should be considered a unique identifier for a song.

## Compile dataframe from all JSON files

In [14]:
def load_json_playlists(slice_start):
    slice_end = slice_start+999

    json_file_name = f'mpd.slice.{slice_start}-{slice_end}.json'

    json_file_path = os.path.join(dataset_dir,json_file_name)

    with open(json_file_path, 'r') as f:
        json_file = json.load(f)

    playlists = json_file['playlists']
    return playlists

In [15]:
slice_start = 0
output_dir = os.path.join('..','..','datasets','spotify_million_playlist_dataset','pkl')

start_time = datetime.datetime.now()
while slice_start < 1000000:
    data = []
    row = {
        "track_name":"",
        "artist_name":"",
        "album_name":"",
        "playlist_name":"",
        "track_uri":"",
        "artist_uri":"",
    }
    if((slice_start*100/1000000) % 10 == 0):
        print(f"{slice_start}, {(slice_start*100/1000000)}%, {datetime.datetime.now()-start_time}")
    playlists = load_json_playlists(slice_start)

    for p_ix,playlist in enumerate(playlists):
        row['playlist_name'] = playlist['name']
        for track in playlist['tracks']:
            row['track_name'] = track['track_name']
            row['artist_name'] = track['artist_name']
            row['album_name'] = track['album_name']
            row['track_uri'] = track['track_uri']
            row['artist_uri'] = track['artist_uri']
            data.append(copy.deepcopy(row))

    mp_df = pd.DataFrame(data = data)
    output_file_path = os.path.join(output_dir,f'table-{slice_start}-{slice_start+999}.pkl')
    mp_df.to_pickle(output_file_path)
    slice_start += 1000

0, 0.0%, 0:00:00.026247
100000, 10.0%, 0:01:32.410928
200000, 20.0%, 0:03:05.799018
300000, 30.0%, 0:04:38.968345
400000, 40.0%, 0:06:12.867880
500000, 50.0%, 0:07:53.683155
600000, 60.0%, 0:09:29.330184
700000, 70.0%, 0:11:22.206935
800000, 80.0%, 0:13:19.086070
900000, 90.0%, 0:15:04.608754


In [2]:
# big_df = []
# file_count = 0
# start_time = datetime.datetime.now()
# for filename in os.listdir(output_dir):
#     if((file_count*100/1000) % 10 == 0):
#         print(f"{file_count}, {(file_count*100/1000)}%, {datetime.datetime.now()-start_time}")
#     file_path = os.path.join(output_dir, filename)
#     if os.path.isfile(file_path):  # skip subdirectories
#         if len(big_df) == 0:
#             big_df = pd.read_pickle(file_path)
#         else:
#             add_df = pd.read_pickle(file_path)
#             big_df = pd.concat((big_df,add_df))
#         file_count += 1
# output_file_path = os.path.join(output_dir,f'table.pkl')
# big_df.to_pickle(output_file_path)