In [32]:
import os
import sys
import glob
import time
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import math
import random
import sklearn
from sklearn.metrics.pairwise import cosine_similarity

In [27]:
class Baseline:
    """Content-based baseline recommender.

    Ranks a pool of candidate songs by their mean similarity (computed over a
    chosen set of feature columns) to a set of seed songs.

    Parameters
    ----------
    X_train, X_test :
        Stored on the instance unchanged; nothing in this class reads them.
    songs_df : pandas.DataFrame
        Song table. Needs an "id" column; ``predict`` additionally needs a
        "uri" column and the requested feature columns.
    candidate_song_ids : collection of ids, optional
        Explicit candidate pool. Every id must occur in ``songs_df["id"]``.
        When None or empty, a random pool is sampled instead.
    candidate_percentage : float
        Fraction of the distinct song ids to sample as candidates when no
        explicit ids are given.
    similarity : str or callable
        "cosine" selects ``cosine_similarity``. A callable
        ``f(seed_matrix, candidate_matrix) -> (n_seed, n_candidate)`` matrix
        is used as-is.

    Raises
    ------
    ValueError
        If ``candidate_song_ids`` contains ids missing from ``songs_df``.
    NotImplementedError
        If ``similarity`` is neither "cosine" nor a callable.
    """

    def __init__(self, X_train, X_test, songs_df, candidate_song_ids = None, 
                 candidate_percentage = 0.3, similarity = "cosine"):
        self.X_train = X_train
        self.X_test = X_test
        self.songs_df = songs_df
        sids = set(songs_df["id"])
        self.sids = sids
        # Explicit length test so array-likes work too (their truth value is ambiguous).
        if candidate_song_ids is not None and len(candidate_song_ids) > 0:
            # make sure the ids are in the songs_df
            unknown_ids = [i for i in candidate_song_ids if i not in sids]
            if unknown_ids:
                raise ValueError(
                    "candidate_song_ids contains ids that are not in songs_df: "
                    + repr(unknown_ids[:5])
                )
            self.candidate_songs = songs_df.loc[songs_df["id"].isin(list(candidate_song_ids))]
        else:
            self.setCandidateSongs(candidate_percentage)

        if similarity == "cosine":
            self.sim_func = cosine_similarity
        elif callable(similarity):
            self.sim_func = similarity
        else:
            raise NotImplementedError("This similarity function is not currently supported.")

        # used song set? (initialised here; not read anywhere in this class)
        self.used_song_ids = set()

    def setCandidateSongs(self, percentage):
        """Sample ``percentage`` of the distinct song ids as the candidate pool.

        Sampling uses the ``random`` module, so ``random.seed`` controls it.
        """
        # Sample from ids in frame order rather than from the set: set
        # iteration order for strings varies between interpreter runs, which
        # would defeat seeding.
        ordered_ids = pd.unique(self.songs_df["id"]).tolist()
        ids = random.sample(ordered_ids, int(percentage * len(ordered_ids)))
        self.candidate_songs = self.songs_df.loc[self.songs_df["id"].isin(ids)]

    def predict(self, seed_songs_df, num_songs, feats_to_use):
        """Return up to ``num_songs`` candidates most similar to the seed songs.

        Candidates whose "uri" appears among the seeds are excluded. Each
        remaining candidate is scored by its similarity averaged over all seed
        songs, and rows are returned in descending score order.

        Returns an empty frame (same columns as the candidates) when there are
        no seed songs or no candidates left to rank.
        """
        not_a_seed = ~self.candidate_songs["uri"].isin(list(seed_songs_df["uri"]))
        songs_to_consider = self.candidate_songs.loc[not_a_seed]
        # Nothing to compare: similarity functions generally reject empty matrices.
        if len(seed_songs_df) == 0 or len(songs_to_consider) == 0:
            return songs_to_consider.iloc[0:0]
        similarities = self.sim_func(seed_songs_df[feats_to_use].values,
                                     songs_to_consider[feats_to_use].values)
        # One row per seed, one column per candidate -> average over seeds.
        avg_sim = np.mean(similarities, axis = 0)
        desc_order = np.argsort(-avg_sim)
        songs_desc = songs_to_consider.iloc[desc_order]
        return songs_desc.iloc[0:num_songs]

# Perform Content-Based Filtering

In [6]:
# Fix the stdlib RNG so the same ten playlist files are drawn on every run.
random.seed(109)
file_ids = random.sample(range(1000), 10)
file_list = ["songs" + str(fid) + ".csv" for fid in file_ids]
file_ids

[286, 234, 470, 460, 560, 59, 103, 233, 514, 509]

In [7]:
# Read every sampled playlist file, tag its rows with the id of the file they
# came from, and stack the pieces into one frame.
files_list = []
for fid, file_name in zip(file_ids, file_list):
    file = pd.read_csv("Songs/" + file_name)
    file["fid"] = fid
    files_list.append(file)
# pd.concat keeps each column's dtype; going through np.concatenate would
# coerce every column (pid, pos, duration_ms, ...) to object.
complete_playlists = pd.concat(files_list, ignore_index=True)
complete_playlists.head()

Unnamed: 0,pid,pos,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms,album_name,fid
0,0,0,Sam Smith,spotify:track:7Hz6LLOVxrojLPIHJJ1S0E,spotify:artist:2wY79sveU1sp5g7SokKOiI,Have Yourself A Merry Little Christmas,spotify:album:6aT8FGBTfyAhIDQ0IDaebz,170964,Have Yourself A Merry Little Christmas,286
1,0,1,Bing Crosby,spotify:track:6Z8bwTp3CPk8fAomqEJZHc,spotify:artist:6ZjFtWeHP9XN7FeKSUe80S,Have Yourself A Merry Little Christmas,spotify:album:0RybBCL0oKCwyKur97W2gH,169506,Winter Wonderland,286
2,0,2,Dean Martin,spotify:track:7H0ABcbcZLjVD83SgeU19l,spotify:artist:49e4v89VmlDcFCMyDv9wQ9,White Christmas,spotify:album:5I48ENiZiaZZSOpec6PdS5,149920,A Winter Romance,286
3,0,3,Vince Guaraldi Trio,spotify:track:34rzJNfTd5e9s3TICFMPD0,spotify:artist:4ytkhMSAnrDP8XzRNlw9FS,O Tannenbaum,spotify:album:7DuJYWu66RPdcekF5TuZ7w,309386,A Charlie Brown Christmas,286
4,0,4,Michael Bublé,spotify:track:0lLdorYw7lVrJydTINhWdI,spotify:artist:1GxkXlMwML1oSg5eLPiAz3,It's Beginning To Look A Lot Like Christmas,spotify:album:3CKVXhODttZebJAzjUs2un,206346,Christmas,286


In [4]:
# Per-track acoustic-feature table; later passed to Baseline as the song pool.
tracks_to_consider = pd.read_csv("acoustic_features_10k_playlists.csv")

In [5]:
# Preview the first rows of the feature table.
tracks_to_consider.head()

Unnamed: 0,acousticness,album_name,analysis_url,artist_name,danceability,duration_ms,energy,id,instrumentalness,key,...,loudness,mode,speechiness,tempo,time_signature,track_href,track_name,type,uri,valence
0,0.131,Still Got Time,https://api.spotify.com/v1/audio-analysis/000x...,ZAYN,0.748,188491,0.627,000xQL6tZNLJzIrtIgxqSl,0.0,7,...,-6.029,1,0.0644,120.963,4,https://api.spotify.com/v1/tracks/000xQL6tZNLJ...,Still Got Time,audio_features,spotify:track:000xQL6tZNLJzIrtIgxqSl,0.524
1,0.108,"So Long, See You Tomorrow",https://api.spotify.com/v1/audio-analysis/0010...,Bombay Bicycle Club,0.527,249947,0.793,0010mZpCCwlPwoBiBsjoac,3e-06,3,...,-4.823,1,0.0352,124.994,3,https://api.spotify.com/v1/tracks/0010mZpCCwlP...,It's Alright Now,audio_features,spotify:track:0010mZpCCwlPwoBiBsjoac,0.597
2,0.061,Wet Jeans,https://api.spotify.com/v1/audio-analysis/001B...,Amindi K. Fro$t,0.775,205256,0.355,001BVhvaZTf2icV88rU3DA,0.0137,6,...,-11.755,0,0.158,87.997,4,https://api.spotify.com/v1/tracks/001BVhvaZTf2...,Wet Jeans,audio_features,spotify:track:001BVhvaZTf2icV88rU3DA,0.356
3,2.2e-05,Back On Top,https://api.spotify.com/v1/audio-analysis/001m...,The Front Bottoms,0.393,194320,0.878,001m5KK2fu67yZ5ZW46LDZ,0.000106,1,...,-3.705,1,0.04,152.283,4,https://api.spotify.com/v1/tracks/001m5KK2fu67...,Ginger,audio_features,spotify:track:001m5KK2fu67yZ5ZW46LDZ,0.5
4,0.102,Yee,https://api.spotify.com/v1/audio-analysis/002K...,Deorro,0.732,210000,0.989,002KbOVOX22zbNDwwnA0Wi,0.748,5,...,-4.172,0,0.0483,127.984,4,https://api.spotify.com/v1/tracks/002KbOVOX22z...,Yee - Original Mix,audio_features,spotify:track:002KbOVOX22zbNDwwnA0Wi,0.684


In [10]:
# Index rows by (file id, playlist id). The guard keeps this cell re-runnable:
# a second set_index on the same columns raises KeyError once they already
# live in the index.
if list(complete_playlists.index.names) != ["fid", "pid"]:
    complete_playlists = complete_playlists.set_index(["fid", "pid"])
complete_playlists = complete_playlists.sort_index()

unique_playlist_ids = np.unique(complete_playlists.index.values)
# random.seed() does not touch NumPy's generator, so seed it explicitly;
# otherwise the split changes on every kernel restart.
np.random.seed(109)
np.random.shuffle(unique_playlist_ids)

# 0.8 Train and 0.2 Test Split
train_idx_cutoff = int(0.8 * unique_playlist_ids.shape[0])
train_playlist_ids = unique_playlist_ids[:train_idx_cutoff]
test_playlist_ids = unique_playlist_ids[train_idx_cutoff:]

# Use All Songs as Candidates

In [11]:
# Show the held-out (fid, pid) playlist keys.
test_playlist_ids

array([(59, 413), (233, 381), (514, 441), ..., (509, 118), (286, 897),
       (233, 964)], dtype=object)

In [24]:
# For each held-out playlist, draw 80% of its tracks at random as the seed
# (looked up in the feature table by URI) and keep the rest for scoring.
seed_list, val_list = [], []
for ids in test_playlist_ids:
    test_playlist = complete_playlists.loc[pd.IndexSlice[ids]]
    n_tracks = len(test_playlist)
    positions = np.arange(n_tracks)
    idx = np.random.choice(positions, int(0.8 * n_tracks), replace=False)
    # Positions that were not drawn form the validation part.
    held_out = ~np.isin(positions, idx)
    test_idx = positions[held_out]
    predictor_playlist = test_playlist.iloc[idx]
    evaluation_playlist = test_playlist.iloc[test_idx]
    in_seed = tracks_to_consider["uri"].isin(predictor_playlist["track_uri"])
    seed_list.append(tracks_to_consider.loc[in_seed])
    val_list.append(evaluation_playlist)

In [36]:
def clickEvaluation(validation_playlist_df, predictions_df, test_indices,
                    page_size = 10, miss_penalty = 50):
    """Average "clicks" needed to reach the first relevant recommendation.

    For each playlist, the 0-based rank of the first predicted track whose
    ``track_name`` also appears in the validation playlist is divided by
    ``page_size`` and floored. Playlists with no matching prediction score
    ``miss_penalty``.

    Parameters
    ----------
    validation_playlist_df : sequence of DataFrames
        Held-out tracks per playlist; each needs a "track_name" column.
    predictions_df : sequence of DataFrames
        Ranked predictions per playlist; each needs a "track_name" column.
    test_indices : sized collection
        Only its length is used: it sets how many playlists are scored.
    page_size : int
        Number of recommendations revealed per click.
    miss_penalty : number
        Score assigned when none of the predictions is relevant.

    Returns
    -------
    float
        Mean click count over the scored playlists; NaN when there are none.
    """
    clicks = []
    # len() instead of .shape[0] so plain lists work as well as arrays.
    for i in range(len(test_indices)):
        predicted_names = predictions_df[i]["track_name"].values
        held_out_names = validation_playlist_df[i]["track_name"]
        positions_matching = np.where(np.isin(predicted_names, held_out_names))[0]
        # CLICKS: min pos / page_size
        if positions_matching.size == 0:
            clicks.append(miss_penalty)
        else:
            clicks.append(math.floor(positions_matching[0] / page_size))
    if not clicks:
        # Same value np.mean([]) would give, without the RuntimeWarning.
        return float("nan")
    return np.mean(clicks)

In [None]:
# Feature columns handed to the similarity function when ranking candidates.
available_features = [
    "danceability",
    "energy",
    "loudness",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "mode",
]

In [28]:
# Build the recommender once: with candidate_percentage=1 every song is a
# candidate, so rebuilding it per playlist only repeats the same sampling and
# filtering of the whole song table.
evaluate = Baseline("not", "implemented", tracks_to_consider, candidate_percentage=1)

# Rank the 500 most similar candidates for each seed playlist.
predictions_list = []
for seed_songs in seed_list:
    predictions = evaluate.predict(seed_songs, 500, available_features).reset_index()
    predictions_list.append(predictions)

In [33]:
# Mean click score of the baseline over the held-out playlists.
clickEvaluation(val_list, predictions_list, test_playlist_ids)

43.5085

In [43]:
def AdjustedRPrecision(predictions_list, val_list):
    """Score each playlist by the share of predictions that match the validation set.

    A prediction counts as valid when its track name, artist name or album
    name occurs in the corresponding validation playlist.

    Parameters
    ----------
    predictions_list : list of DataFrames
        Predicted songs per playlist.
    val_list : list of DataFrames
        Validation playlist per entry of ``predictions_list``.
        Both need "track_name", "artist_name" and "album_name" columns.

    Returns
    -------
    (list, list)
        Per-playlist fraction of valid predictions (0 for an empty prediction
        frame), and the per-playlist 0/1 arrays marking which predictions
        were valid.
    """
    scores = []
    flags_per_playlist = []
    for position, predicted in enumerate(predictions_list):
        held_out = val_list[position]
        # Start from track-name matches, then widen to artist and album matches.
        hits = np.isin(predicted["track_name"], held_out["track_name"])
        for column in ("artist_name", "album_name"):
            hits = hits | np.isin(predicted[column], held_out[column])
        flags = hits.astype(int)
        flags_per_playlist.append(flags)
        if predicted.size == 0:
            scores.append(0)
        else:
            scores.append(sum(flags) / predicted.shape[0])
    return scores, flags_per_playlist

In [45]:
# Per-playlist precision scores plus the 0/1 validity flags of every prediction.
adjusted_r_precision_list, valid_predictions_list = AdjustedRPrecision(predictions_list, val_list)

In [47]:
# Average the per-playlist scores into a single number for the baseline.
np.mean(adjusted_r_precision_list)

0.007404000000000001