In [66]:
import os
import sys
import glob
import time
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import random
import sklearn
from sklearn.metrics.pairwise import cosine_similarity

In [112]:
# Playlist membership table: one row per (playlist, position) entry, as shown in the preview below.
playlists_df = pd.read_csv("playlist_data.csv")

In [235]:
# Song-level table of audio features keyed by track `id`; this is the pool the model ranks.
total_songs = pd.read_csv("total_songs_clean.csv")

In [115]:
# NOTE(review): presumably a popularity-ranked subset of songs; it is not referenced
# again in the cells visible here -- confirm it is still needed.
most_popular_songs = pd.read_csv("top70ksongs.csv")

In [114]:
# Preview the song feature table to check the columns used later as similarity features.
total_songs.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,type,id,uri,track_href,analysis_url,duration_ms,time_signature,artist_name,track_name,album_name
0,0.369,0.223,9,-15.993,1,0.0564,0.957,0.86,0.124,0.212,...,audio_features,3R8bbBLfCrl3wXELLDuZiA,spotify:track:3R8bbBLfCrl3wXELLDuZiA,https://api.spotify.com/v1/tracks/3R8bbBLfCrl3...,https://api.spotify.com/v1/audio-analysis/3R8b...,196733,4,Kagan Publishing,The Road to Lucia,Brain Boosters: Inspired Thinking
1,0.664,0.605,4,-5.477,0,0.258,0.242,0.0,0.161,0.397,...,audio_features,5Zk7ELDEXqcHZk2euHPVb9,spotify:track:5Zk7ELDEXqcHZk2euHPVb9,https://api.spotify.com/v1/tracks/5Zk7ELDEXqcH...,https://api.spotify.com/v1/audio-analysis/5Zk7...,192320,4,Cherine Anderson,Coming Over Tonight (feat. Chuck Fender),The Introduction - EP
2,0.74,0.712,9,-7.26,0,0.0814,0.0718,0.0,0.33,0.742,...,audio_features,5tg8mN8KqRfwIvkXF0tRr4,spotify:track:5tg8mN8KqRfwIvkXF0tRr4,https://api.spotify.com/v1/tracks/5tg8mN8KqRfw...,https://api.spotify.com/v1/audio-analysis/5tg8...,276093,4,Kirk Franklin,Keep Your Head,Hero
3,0.623,0.863,1,-8.161,0,0.0776,0.000811,2e-06,0.128,0.734,...,audio_features,10qeFHxbayfrkH2kX9kTnv,spotify:track:10qeFHxbayfrkH2kX9kTnv,https://api.spotify.com/v1/tracks/10qeFHxbayfr...,https://api.spotify.com/v1/audio-analysis/10qe...,136067,4,Limp,Bug Dance,Pop & Disorderly
4,0.413,0.0302,10,-26.692,1,0.035,0.957,0.443,0.113,0.254,...,audio_features,18LVcYNiF05oDfEQ7HEhNR,spotify:track:18LVcYNiF05oDfEQ7HEhNR,https://api.spotify.com/v1/tracks/18LVcYNiF05o...,https://api.spotify.com/v1/audio-analysis/18LV...,233627,4,Ludwig van Beethoven,Beethoven : Symphony No.8 in F major Op.93 : I...,Beethoven : Symphonies Nos 1 - 9


In [116]:
# Preview the playlist table (pid/pos identify a track's slot within a playlist).
playlists_df.head()

Unnamed: 0,pid,pos,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms,album_name,fid
0,0,0,Sleeping At Last,spotify:track:2d7LPtieXdIYzf7yHPooWd,spotify:artist:0MeLMJJcouYXCymQSHPn8g,Chasing Cars,spotify:album:0UIIvTTWNB3gRQWFoxoEDh,242564,"Covers, Vol. 2",284
1,0,1,Rachael Yamagata,spotify:track:0y4TKcc7p2H6P0GJlt01EI,spotify:artist:7w0qj2HiAPIeUcoPogvOZ6,Elephants,spotify:album:6KzK9fDNmj7GHFbcE4gVJD,253701,Elephants...Teeth Sinking Into Heart,284
2,0,2,The Cinematic Orchestra,spotify:track:6q4c1vPRZREh7nw3wG7Ixz,spotify:artist:32ogthv0BdaSMPml02X9YB,That Home,spotify:album:5cPHT4yMCfETLRYAoBFcOZ,103920,Ma Fleur,284
3,0,3,The Cinematic Orchestra,spotify:track:54KFQB6N4pn926IUUYZGzK,spotify:artist:32ogthv0BdaSMPml02X9YB,To Build A Home,spotify:album:5cPHT4yMCfETLRYAoBFcOZ,371320,Ma Fleur,284
4,0,4,Leon Bridges,spotify:track:0NeJjNlprGfZpeX2LQuN6c,spotify:artist:3qnGvpP8Yth1AqSBMqON5x,River,spotify:album:4svLfrPPk2npPVuI4kXPYg,238560,Coming Home,284


## Create a baseline model class

For our baseline model, we decided to recommend songs from a set of candidate songs according to which of these candidates have the highest average cosine similarity to the set of seed songs. To elaborate, cosine similarity is computed pairwise between each candidate and each seed song, and the result is then averaged across the seed songs, giving an average similarity for each candidate. In other words, playlists are generated using an unsupervised algorithm that attempts to maximize a score function and can generate playlists of any length up to the number of provided candidate songs. For our purposes, we used a collection of around 220,000 candidate songs, which is notably 1/10th of all unique songs used to create playlists in our dataset.

This approach is motivated by the assumption that different playlists have different distributions of features, perhaps according to genre for example, and that songs that are tonally similar will be a good recommendation for that playlist.

This leaves us with a couple of ways to improve performance. First, our baseline model used `"danceability", "energy", "loudness", "speechiness", "acousticness", "instrumentalness", "liveness", "valence", "mode"` to measure similarity; however, this selection of features could certainly be tuned for model performance. Second, the similarity function could be changed completely, perhaps using a supervised ML algorithm such as a neural network that predicts similarity between songs and is trained on top Spotify playlists, while preserving the structure of our baseline model.

In [271]:
class Baseline:
    """Unsupervised playlist-continuation baseline.

    Candidate songs are ranked by their mean similarity to a set of seed songs
    and the top-ranked candidates are returned as the recommendation.

    NOTE(review): similarity is computed on the raw feature values passed to
    `predict`; features on a larger numeric scale will weigh more in the
    cosine score -- consider standardising the features upstream.
    """

    def __init__(self, X_train, X_test, songs_df, candidate_song_ids = None, 
                 candidate_percentage = 0.3, similarity = "cosine"):
        """Store the song table and choose the candidate pool.

        Parameters
        ----------
        X_train, X_test
            Stored on the instance but not used by any method of this class.
        songs_df : pandas.DataFrame
            Song table; must contain an "id" column plus whatever feature
            columns are later passed to `predict`.
        candidate_song_ids : iterable, optional
            Explicit ids to use as the candidate pool. Every id must occur in
            `songs_df["id"]`. When omitted (or empty) a random pool is drawn
            using `candidate_percentage`.
        candidate_percentage : float
            Fraction of the unique song ids to sample as candidates when no
            explicit ids are given.
        similarity : str
            Name of the similarity function; only "cosine" is supported.

        Raises
        ------
        ValueError
            If an explicit candidate id is not present in `songs_df`.
        NotImplementedError
            If `similarity` is not "cosine".
        """
        self.X_train = X_train
        self.X_test = X_test
        self.songs_df = songs_df
        sids = set(songs_df["id"])
        self.sids = sids

        # Materialise once so arrays, Series and generators are all accepted
        # (a bare truth test on an array-like raises).
        explicit_ids = None if candidate_song_ids is None else list(candidate_song_ids)
        if explicit_ids:
            # make sure the ids are in the songs_df
            missing = [i for i in explicit_ids if i not in sids]
            if missing:
                raise ValueError(
                    "candidate_song_ids contains ids that are not in songs_df: "
                    + repr(missing[:5])
                )
            self.candidate_songs = songs_df.loc[songs_df["id"].isin(explicit_ids)]
        else:
            self.setCandidateSongs(candidate_percentage)

        if similarity == "cosine":
            self.sim_func = cosine_similarity
        else:
            raise NotImplementedError("This similarity function is not currently supported.")

        # Placeholder for tracking songs that were already recommended;
        # nothing in this class reads it yet.
        self.used_song_ids = set()

    def setCandidateSongs(self, percentage):
        """Draw a random candidate pool covering `percentage` of the unique song ids.

        The ids are sampled with the global `random` module, so call
        `random.seed(...)` beforehand for a reproducible pool. `random.sample`
        raises ValueError when `percentage` is greater than 1 or negative.
        """
        # Sample from the ids in first-appearance order rather than from
        # list(set): set iteration order of strings changes between interpreter
        # sessions, which would make a seeded sample irreproducible.
        id_pool = self.songs_df["id"].drop_duplicates().tolist()
        ids = random.sample(id_pool, int(percentage * len(id_pool)))
        self.candidate_songs = self.songs_df.loc[self.songs_df["id"].isin(ids)]

    def predict(self, seed_songs_df, num_songs, feats_to_use):
        """Return the `num_songs` candidates most similar to the seed songs.

        Parameters
        ----------
        seed_songs_df : pandas.DataFrame
            Seed songs; needs an "id" column and the columns in `feats_to_use`.
        num_songs : int
            Maximum number of rows to return.
        feats_to_use : list of str
            Feature columns that form the vectors passed to the similarity function.

        Returns
        -------
        pandas.DataFrame
            Rows of the candidate table (seed songs excluded), ordered from
            highest to lowest mean similarity, keeping their original index.

        Raises
        ------
        ValueError
            If `seed_songs_df` has no rows.
        """
        if len(seed_songs_df) == 0:
            raise ValueError("seed_songs_df must contain at least one song.")

        # Never recommend a song that is already one of the seeds.
        is_seed = self.candidate_songs["id"].isin(seed_songs_df["id"])
        songs_to_consider = self.candidate_songs.loc[~is_seed]

        # similarities[i, j] = similarity of seed i to candidate j.
        similarities = self.sim_func(seed_songs_df[feats_to_use].values, 
                                     songs_to_consider[feats_to_use].values)
        avg_sim = np.mean(similarities, axis = 0)
        desc_order = np.argsort(-avg_sim)
        # Slice the ordering first so only the requested rows are materialised.
        return songs_to_consider.iloc[desc_order[:num_songs]]
        

In [272]:
# Smoke test: model over the first 1000 songs with all of them kept as candidates
# (candidate_percentage=1). "a"/"b" are placeholders for the unused X_train/X_test.
test = Baseline("a", "b", total_songs.iloc[0:1000], candidate_percentage=1)

In [273]:
# Audio-feature columns that make up each song's vector for the similarity computation.
available_features = ["danceability", "energy", "loudness", 
                      "speechiness", "acousticness", 
                      "instrumentalness", "liveness", "valence", "mode"]


In [274]:
# Rank the smoke-test candidates against the first 99 songs as seeds. The seeds are
# excluded from the candidates, so fewer than the requested 1000 rows come back.
predicted = test.predict(total_songs.iloc[0:99], 1000, available_features)

## Evaluating our model on our complete playlists

To evaluate our baseline model, we randomly selected 50 playlists and retrieved all of their songs to ensure that we had musical features on all of the playlist entries. Next, we split these playlists into a set of "seed" songs and evaluation songs of sizes 0.75 and 0.25, respectively. Finally, we used our baseline model to predict 500 songs that our model deemed as best matches. We then compared these predictions to the left-out test set and recorded the percentage of the playlist songs that were predicted correctly. Using this metric, our model predicted $2.5\%$ of the test set correctly on average.

Furthermore, we record the positions of the correctly predicted test-set songs within the ranked list of predictions, where a lower position is a better match (position 1 is better than position 500). Looking at this data, we see that there are a few playlists where the model gets many of the test-set songs correct and other playlists where the baseline model predicted none of the original songs. This might suggest that some playlists are tonally similar while others are less so; this may motivate another angle of approach that attempts to use related artists to those found in the seed songs to avoid only suggesting tonally similar songs.

In [276]:
# The sampled evaluation playlists, with audio-feature columns joined onto each track row
# (see the column preview further down).
complete_playlists = pd.read_csv("complete_playlists_50.csv")

In [285]:
# One row per distinct (fid, pid) pair, sorted by those keys. "pos" is only used as
# a column to aggregate on and is dropped again afterwards.
playlist_keys = ["fid", "pid"]
first_positions = complete_playlists.groupby(playlist_keys)["pos"].first()
unique_playlists = first_positions.reset_index().drop(columns="pos")

In [356]:
# Hold-out evaluation: for every playlist, keep `percent` of its tracks as seeds,
# predict 500 songs, and count how many of the held-out tracks were recovered.
percent = 3/4
test_size = 0        # total number of held-out tracks over all playlists
num_correct = 0      # held-out tracks that appear among the predictions
position_list = {}   # (fid, pid) -> 0-based ranks of the recovered tracks
random.seed(109)
for i in range(len(unique_playlists)):
    fid = unique_playlists.loc[i, "fid"]
    pid = unique_playlists.loc[i, "pid"]
    in_playlist = (complete_playlists["fid"] == fid) & (complete_playlists["pid"] == pid)
    playlist = complete_playlists.loc[in_playlist].reset_index(drop=True)

    # Random seed/test split over the row positions 0..n_tracks-1.
    n_tracks = playlist.shape[0]
    seed_ids = random.sample(list(range(n_tracks)), int(n_tracks * percent))
    seed_split = playlist.index.isin(seed_ids)
    seed = playlist.loc[seed_split]
    test = playlist.loc[~seed_split]
    test_song_ids = set(test["id"])
    test_size += test.shape[0]

    # The model is rebuilt on every iteration; its constructor also draws from
    # `random`, so moving it out of the loop would change the later splits.
    evaluate = Baseline("not", "implemented", total_songs, candidate_percentage=1)
    # reset_index() makes the row label equal to the rank of each prediction.
    predictions = evaluate.predict(seed, 500, available_features).reset_index()
    hits = predictions.loc[predictions["id"].isin(test_song_ids)]
    num_correct += hits.shape[0]
    position_list[(fid, pid)] = hits.index.tolist()

In [357]:
# (fid, pid) -> ranks (0-based, lower is better) at which held-out songs appeared
# in the 500 predictions; an empty list means none were recovered.
position_list

{(16, 705): [],
 (29, 736): [],
 (52, 746): [],
 (112, 480): [167, 318, 358, 451],
 (113, 190): [],
 (124, 951): [],
 (149, 299): [204],
 (164, 985): [96],
 (167, 82): [],
 (172, 599): [],
 (176, 116): [],
 (226, 831): [],
 (246, 757): [361],
 (320, 851): [134, 142],
 (334, 229): [],
 (341, 453): [266, 486],
 (342, 824): [],
 (346, 794): [],
 (354, 557): [],
 (356, 420): [],
 (361, 820): [],
 (386, 719): [],
 (392, 266): [],
 (420, 39): [],
 (429, 375): [],
 (436, 869): [],
 (447, 778): [107, 157, 348, 451, 479, 485],
 (451, 640): [387],
 (508, 314): [],
 (526, 566): [],
 (560, 929): [],
 (680, 108): [196, 215],
 (718, 872): [],
 (735, 66): [],
 (759, 574): [],
 (770, 309): [],
 (812, 749): [],
 (819, 419): [],
 (823, 120): [],
 (828, 766): [336],
 (858, 844): [350, 386],
 (859, 769): [],
 (863, 519): [],
 (865, 866): [],
 (892, 979): [],
 (897, 643): [],
 (921, 300): [],
 (964, 992): [],
 (965, 905): [],
 (998, 452): []}

In [358]:
# Share of held-out songs recovered, pooled over all playlists (total hits divided by
# total held-out songs), not a mean of per-playlist rates.
num_correct / test_size

0.02505446623093682

In [359]:
# Inspect the playlist with the most recovered songs in position_list (fid=447, pid=778).
complete_playlists.loc[(complete_playlists["fid"] == 447) & 
                                      (complete_playlists["pid"] == 778)]

Unnamed: 0,pid,pos,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms,album_name,fid,...,acousticness,instrumentalness,liveness,valence,tempo,type,id,track_href,analysis_url,time_signature
2773,778,0,Calibre 50,spotify:track:49ItgqFP5shdUdK9ggonW0,spotify:artist:4jogXSSvlyMkODGSZ2wc2P,Aunque Ahora Estés Con Él,spotify:album:7qS6xf08vzZcHRgwnQsGRP,214293,Aunque Ahora Estés Con Él,447,...,0.5450,0.000000,0.3750,0.748,172.507,audio_features,49ItgqFP5shdUdK9ggonW0,https://api.spotify.com/v1/tracks/49ItgqFP5shd...,https://api.spotify.com/v1/audio-analysis/49It...,3
2774,778,1,Calibre 50,spotify:track:3imJHohXBxDGovFRQ75dUt,spotify:artist:4jogXSSvlyMkODGSZ2wc2P,Contigo,spotify:album:67FKNGl2Yujo7UFeDNajHt,210440,Contigo,447,...,0.7130,0.000236,0.3060,0.596,136.947,audio_features,3imJHohXBxDGovFRQ75dUt,https://api.spotify.com/v1/tracks/3imJHohXBxDG...,https://api.spotify.com/v1/audio-analysis/3imJ...,4
2775,778,2,Banda Los Recoditos,spotify:track:0N4d44DDwQxBxlzS79uN88,spotify:artist:4bPiOPI4V99cepEftvBYak,Me Sobrabas Tú,spotify:album:1qSHp3vmj7Wm1LmRm8JbNQ,181986,Sueño XXX,447,...,0.6980,0.000000,0.2600,0.961,154.926,audio_features,0N4d44DDwQxBxlzS79uN88,https://api.spotify.com/v1/tracks/0N4d44DDwQxB...,https://api.spotify.com/v1/audio-analysis/0N4d...,3
2776,778,3,Banda Los Recoditos,spotify:track:0LCMFLgIISg4Mab1YL5shU,spotify:artist:4bPiOPI4V99cepEftvBYak,Pistearé,spotify:album:3LLGf8yNKPKud3ErCdOkAu,144613,Me Está Gustando,447,...,0.4810,0.000000,0.3320,0.960,120.021,audio_features,0LCMFLgIISg4Mab1YL5shU,https://api.spotify.com/v1/tracks/0LCMFLgIISg4...,https://api.spotify.com/v1/audio-analysis/0LCM...,3
2777,778,4,Banda El Recodo,spotify:track:2KpFQ5ETwtC3FUGeXhw55I,spotify:artist:6AcOTCYBMvjKYy4zms0kaC,La Miel De Su Saliva,spotify:album:4Tl5QOkNgX6INDWg6mMfZB,180119,Mi Vicio Más Grande,447,...,0.4720,0.000002,0.0793,0.963,178.018,audio_features,2KpFQ5ETwtC3FUGeXhw55I,https://api.spotify.com/v1/tracks/2KpFQ5ETwtC3...,https://api.spotify.com/v1/audio-analysis/2KpF...,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2889,778,115,Banda Los Recoditos,spotify:track:2uMQxOPITUivAkAYrR02Br,spotify:artist:4bPiOPI4V99cepEftvBYak,Cuando Fuiste Mía,spotify:album:37KAZDlCcCpLZuVltkaQfQ,216013,Los Gustos Que Me Doy,447,...,0.5020,0.000000,0.0695,0.971,142.039,audio_features,2uMQxOPITUivAkAYrR02Br,https://api.spotify.com/v1/tracks/2uMQxOPITUiv...,https://api.spotify.com/v1/audio-analysis/2uMQ...,3
2890,778,116,Banda Los Recoditos,spotify:track:2K0HJOlFioRNNnkCABaYZy,spotify:artist:4bPiOPI4V99cepEftvBYak,Fuego Cruzado,spotify:album:37KAZDlCcCpLZuVltkaQfQ,158253,Los Gustos Que Me Doy,447,...,0.3830,0.000000,0.1140,0.886,210.038,audio_features,2K0HJOlFioRNNnkCABaYZy,https://api.spotify.com/v1/tracks/2K0HJOlFioRN...,https://api.spotify.com/v1/audio-analysis/2K0H...,4
2891,778,117,Frankie J,spotify:track:7iQY2alDQ2MVya5up2hDgq,spotify:artist:3sMYEBy0CZFxedcnm9i9hf,Obsesion (No Es Amor) - Featuring Baby Bash,spotify:album:27ybknbObpx7ZWiaWvSF36,225426,Playlist: The Very Best Of Frankie J,447,...,0.0851,0.000000,0.3250,0.559,75.038,audio_features,7iQY2alDQ2MVya5up2hDgq,https://api.spotify.com/v1/tracks/7iQY2alDQ2MV...,https://api.spotify.com/v1/audio-analysis/7iQY...,4
2892,778,118,Banda Los Recoditos,spotify:track:1PPswVdqYYvfBIk5MdB4Lo,spotify:artist:4bPiOPI4V99cepEftvBYak,Me Está Gustando,spotify:album:3LLGf8yNKPKud3ErCdOkAu,180146,Me Está Gustando,447,...,0.4990,0.000000,0.1940,0.966,144.903,audio_features,1PPswVdqYYvfBIk5MdB4Lo,https://api.spotify.com/v1/tracks/1PPswVdqYYvf...,https://api.spotify.com/v1/audio-analysis/1PPs...,3
