In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import sys
import json
import re
import collections
import os
from tqdm import tqdm

# Million Playlist Challenge!
This project uses the dataset from Spotify's published Million Playlist Dataset and challenge, where the goal is to make a track suggestion system from a large dataset of pre-existing user made Spotify playlists. Challenges in the competition include suggesting tracks for playlists that:
   1. Predict tracks for a playlist given its title only
   2. Predict tracks for a playlist given its title and the first track
   3. Predict tracks for a playlist given its title and the first 5 tracks
   4. Predict tracks for a playlist given its first 5 tracks (no title)
   5. Predict tracks for a playlist given its title and the first 10 tracks
   6. Predict tracks for a playlist given its first ten tracks (no title)
   7. Predict tracks for a playlist given its title and the first 25 tracks
   8. Predict tracks for a playlist given its title and 25 random tracks
   9. Predict tracks for a playlist given its title and the first 100 tracks
   10. Predict tracks for a playlist given its title and 100 random tracks
   
So within the scope of the project, the recommendation system needs to be able to generate tracks based on a title string (natural language problem) or on a set of seed tracks. 

My personal goal is to make an interface for using and connecting the systems I make for this project, and possibly incorporate OpenAI assistants/chatbots and Spotify account authentication. 

# Dataset Summary

In [2]:
'''
this code was taken from the provided 'deeper_statistics.py' script. 
I added a progress bar and changed the output slightly to run the script in this notebook.
'''

total_playlists = 0
total_tracks = 0
tracks = set()
playlistlist = []
artists = set()
albums = set()
titles = set()
ntitles = set()
full_title_histogram = collections.Counter()
title_histogram = collections.Counter()
artist_histogram = collections.Counter()
track_histogram = collections.Counter()

quick = False
max_files_for_quick_processing = 50


def process_mpd(path):
    count = 0
    filenames = os.listdir(path)
    for filename in tqdm(sorted(filenames)):
        if filename.startswith("mpd.slice.") and filename.endswith(".json"):
            fullpath = os.sep.join((path, filename))
            f = open(fullpath)
            js = f.read()
            f.close()
            mpd_slice = json.loads(js)
            process_info(mpd_slice["info"])
            for playlist in mpd_slice["playlists"]:
                process_playlist(playlist)
            count += 1

            if quick and count > max_files_for_quick_processing:
                break

    show_summary()


def show_summary():
    print()
    print("number of playlists", total_playlists)
    print("number of tracks", total_tracks)
    print("number of unique tracks", len(tracks))
    print("number of unique albums", len(albums))
    print("number of unique artists", len(artists))
    print("number of unique titles", len(titles))
    print("number of unique normalized titles", len(ntitles))
    print("avg playlist length", float(total_tracks) / total_playlists)
    print()
    print("full playlist titles")
    for title, count in full_title_histogram.most_common():
        print("%7d %s" % (count, title))
    print()

    print("top playlist titles")
    for title, count in title_histogram.most_common():
        print("%7d %s" % (count, title))
    print()

    print("top tracks")
    for track, count in track_histogram.most_common(10000):
        print("%7d %s" % (count, track))

    print()
    print("top artists")
    for artist, count in artist_histogram.most_common(10000):
        print("%7d %s" % (count, artist))


def normalize_name(name):
    name = name.lower()
    name = re.sub(r"[.,\/#!$%\^\*;:{}=\_`~()@]", " ", name)
    name = re.sub(r"\s+", " ", name).strip()
    return name


def process_playlist(playlist):
    global total_playlists, total_tracks, playlistlist

    total_playlists += 1
    #playlistlist.append(playlist)
    # print playlist['playlist_id'], playlist['name']

    titles.add(playlist["name"])
    nname = normalize_name(playlist["name"])
    ntitles.add(nname)
    title_histogram[nname] += 1
    full_title_histogram[playlist["name"].lower()] += 1

    for track in playlist["tracks"]:
        total_tracks += 1
        albums.add(track["album_uri"])
        tracks.add(track["track_uri"])
        artists.add(track["artist_uri"])

        full_name = track["track_name"] + " by " + track["artist_name"]
        artist_histogram[track["artist_name"]] += 1
        track_histogram[full_name] += 1


def process_info(info):
    for k, v in list(info.items()):
        #print("%-20s %s" % (k + ":", v))
        pass
    #print()

In [3]:
path = 'data/data/'
process_mpd(path)

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [08:35<00:00,  1.94it/s]



number of playlists 1000000
number of tracks 66346428
number of unique tracks 2262292
number of unique albums 734684
number of unique artists 295860
number of unique titles 92944
number of unique normalized titles 17381
avg playlist length 66.346428

full playlist titles
   9256 chill
   8724 country
   8045 rap
   7644 workout
   7413 oldies
   6594 christmas
   6351 rock
   5487 party
   5141 throwback
   4603 worship
   4584 jams
   4181 summer
   4154 new
   4100 feels
   3756 lit
   3630 disney
   3484 throwbacks
   3433 music
   3335 sleep
   3180 edm
   3104 vibes
   3074 running
   3048 classic rock
   2943 classics
   2900 gym
   2758 dance
   2736 pop
   2614 car
   2566 road trip
   2527 hype
   2526 run
   2468 work
   2446 old school
   2388 2017
   2332 random
   2273 hip hop
   2245 wedding
   2092 love
   2068 shower
   2067 reggae
   2039 relax
   1944 summer 2017
   1898 alternative
   1893 christian
   1876 classical
   1868 feel good
   1846 2016
   1820 r&b
   181

so lets make a NLP-esque dataset with:
* vocab of tracks, list of each unique track in our dataset
* corpus of playlists, list of playlists as lists of track tokens

In [62]:
'''
these functions are used for parsing the dataset and organizing it into more usable pandas dataframes. 
the important part is the function make_dataset, which outputs two important dataframes from a number of data slices:
* trackdf has all included data for each unique track included within the playlists in the processed slices
* playlistdf has all included data, including uri tracklists, for each playlist in the processed slices
'''

def process_file(filename, dirpath):
    # given a path to a file and a path to the data folder, return a json-like dict of the specified file
    if filename.startswith("mpd.slice.") and filename.endswith(".json"):
        fullpath = os.sep.join((dirpath, filename))
        f = open(fullpath)
        js = f.read()
        f.close()
        mpd_slice = json.loads(js)
        return mpd_slice
    else:
        print("file isn't right")

def track_dataframe(track_list):
    # given a list of track dictionaries, like from slice['playlists'][n]['tracks'], make a dataframe
    df = pd.DataFrame(track_list)
    if 'track_uri' in df.columns:
        cols = ['track_uri'] + [col for col in df.columns if col not in  ['track_uri', 'pos']]
        df = df[cols]
    return df
        
def unpack_slice(slicedict):
    # given a json-like dict from a slice of the dataset, make a dataframe of unique tracks and of playlist data
    master_df = pd.DataFrame()
    playlists_df = []
    for playlist in (slicedict['playlists']):
        track_df = create_track_dataframe(playlist['tracks'])
        playlist['tracklist'] = track_df['track_uri'].to_list()
        playlists_df.append(playlist)
        master_df = pd.concat([master_df, track_df], ignore_index=True).drop_duplicates(subset='track_uri')
    playlists_df = pd.DataFrame(playlists_df).drop('tracks', axis = 1)
    return master_df, playlists_df

def make_dataset(data_path, slice_limit = None):
    # given a path to the data folder, read each file and make a dataframe of unique tracks and of playlists
    master_track_df = pd.DataFrame()
    master_playlist_df = pd.DataFrame()
    count = 0
    for filename in tqdm(os.listdir(data_path)):
        if slice_limit and count >= slice_limit:
            break
        mpd_slice = process_file(filename, data_path)
        track_df, playlist_df = unpack_slice(mpd_slice)
        master_track_df = pd.concat([master_track_df, track_df], ignore_index=True).drop_duplicates(subset='track_uri')
        master_playlist_df = pd.concat([master_playlist_df, playlist_df], ignore_index=True)
        count += 1
    return master_track_df, master_playlist_df

def get_df_size(df):
    # given a df, return how big it is in mb
    column_sizes = df.memory_usage(deep=True)
    total_size = column_sizes.sum()
    size_mb = total_size / (1024 * 1024)  
    return size_mb

In [60]:
trackdf, playlistdf = make_dataset('data/data/', slice_limit = 10)

  1%|▊                                                                             | 10/1000 [03:00<4:57:10, 18.01s/it]


In [63]:
print(get_df_size(trackdf))
trackdf.head(5)

83.84786891937256


Unnamed: 0,track_uri,artist_name,artist_uri,track_name,album_uri,duration_ms,album_name
0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,Missy Elliott,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook
1,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Britney Spears,spotify:artist:26dSoYclwsYLMAKD3tpOr4,Toxic,spotify:album:0z7pVBGOD7HCIB7S8eLkLI,198800,In The Zone
2,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,Beyoncé,spotify:artist:6vWDO969PvNqNYHIOW5v0m,Crazy In Love,spotify:album:25hVFAxTlDvXbx2X2QkUkE,235933,Dangerously In Love (Alben für die Ewigkeit)
3,spotify:track:1AWQoqb9bSvzTjaLralEkT,Justin Timberlake,spotify:artist:31TPClRtHm23RisEBtV3X7,Rock Your Body,spotify:album:6QPkyl04rXwTGlGlcYaRoW,267266,Justified
4,spotify:track:1lzr43nnXAijIGYnCT8M8H,Shaggy,spotify:artist:5EvFsr3kj42KNv97ZEnqij,It Wasn't Me,spotify:album:6NmFmPX56pcLBOFMhIiKvF,227600,Hot Shot


In [64]:
print(get_df_size(playlistdf))
playlistdf.head(5)

7.717738151550293


Unnamed: 0,name,collaborative,pid,modified_at,num_tracks,num_albums,num_followers,num_edits,duration_ms,num_artists,tracklist,description
0,Throwbacks,False,0,1493424000,52,47,1,6,11532414,37,"[spotify:track:0UaMYEvWZi0ZqiDOoHU3YI, spotify...",
1,Awesome Playlist,False,1,1506556800,39,23,1,5,11656470,21,"[spotify:track:2HHtWyy5CgaQbC7XSoOb0e, spotify...",
2,korean,False,2,1505692800,64,51,1,18,14039958,31,"[spotify:track:74tqql9zP6JjF5hjkHHUXp, spotify...",
3,mat,False,3,1501027200,126,107,1,4,28926058,86,"[spotify:track:4WJ7UMD4i6DOPzyXU5pZSz, spotify...",
4,90s,False,4,1401667200,17,16,2,7,4335282,16,"[spotify:track:4iCGSi1RonREsPtfEKYj5b, spotify...",
