In [1]:
import os
import sys
import glob
import time
import datetime
import math
import requests

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import random
import scipy as sp

from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import cosine_distances
from sklearn.preprocessing import StandardScaler

from functools import partial
from multiprocessing.pool import Pool
from time import time

np.random.seed(109)

# Read in CSVs | Add FID | Concatenate

In [28]:
random.seed(109)
file_ids = random.sample(range(1000), 2)
file_list = [f"songs{i}.csv" for i in file_ids]

In [29]:
file_ids

[286, 234]

In [30]:
files_list = []
for i in range(len(file_list)):
    file_path = "Songs/" + file_list[i]
    file = pd.read_csv(file_path)
    file["fid"] = file_ids[i]
    files_list.append(file)
complete_playlists = pd.DataFrame(np.concatenate(files_list), columns = file.columns)
complete_playlists.head()

Unnamed: 0,pid,pos,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms,album_name,fid
0,0,0,Sam Smith,spotify:track:7Hz6LLOVxrojLPIHJJ1S0E,spotify:artist:2wY79sveU1sp5g7SokKOiI,Have Yourself A Merry Little Christmas,spotify:album:6aT8FGBTfyAhIDQ0IDaebz,170964,Have Yourself A Merry Little Christmas,286
1,0,1,Bing Crosby,spotify:track:6Z8bwTp3CPk8fAomqEJZHc,spotify:artist:6ZjFtWeHP9XN7FeKSUe80S,Have Yourself A Merry Little Christmas,spotify:album:0RybBCL0oKCwyKur97W2gH,169506,Winter Wonderland,286
2,0,2,Dean Martin,spotify:track:7H0ABcbcZLjVD83SgeU19l,spotify:artist:49e4v89VmlDcFCMyDv9wQ9,White Christmas,spotify:album:5I48ENiZiaZZSOpec6PdS5,149920,A Winter Romance,286
3,0,3,Vince Guaraldi Trio,spotify:track:34rzJNfTd5e9s3TICFMPD0,spotify:artist:4ytkhMSAnrDP8XzRNlw9FS,O Tannenbaum,spotify:album:7DuJYWu66RPdcekF5TuZ7w,309386,A Charlie Brown Christmas,286
4,0,4,Michael Bublé,spotify:track:0lLdorYw7lVrJydTINhWdI,spotify:artist:1GxkXlMwML1oSg5eLPiAz3,It's Beginning To Look A Lot Like Christmas,spotify:album:3CKVXhODttZebJAzjUs2un,206346,Christmas,286


Playlist (286, 0) is a Christmas playlist!! Maybe use this to test to stay with the holiday season?

# Set index to ["fid", "pid"]

In [31]:
complete_playlists.set_index(["fid", "pid"], inplace=True)
complete_playlists.sort_index(inplace = True) # Sort index to make indexing faster later

# Get all unique songs, artists, and albums for Collaborative Filtering

In [32]:
all_unique_songs = complete_playlists['track_name'].unique()
all_unique_albums = complete_playlists['album_name'].unique()
all_unique_artists = complete_playlists['artist_name'].unique()

# Make unique playlist ID's and split into train and test

In [33]:
unique_playlist_ids = np.unique(complete_playlists.index.values)
np.random.shuffle(unique_playlist_ids)

# 0.8 Test and 0.2 Train Split
train_idx_cutoff = int(0.8 * unique_playlist_ids.shape[0])
train_playlist_ids = unique_playlist_ids[0:train_idx_cutoff]
test_playlist_ids = unique_playlist_ids[train_idx_cutoff:unique_playlist_ids.shape[0]]

# Create Sparse Matrix

In [34]:
# Create sparse matrix based on passed in filter methods
def createSparseMatrix(train_playlist_ids, all_candidates_filter, filter_method = "track_name"):    
    rows_list = []
    for ids in train_playlist_ids:
        playlist_filter = complete_playlists.loc[pd.IndexSlice[ids]][filter_method]
        ones_idx = np.where(np.isin(all_candidates_filter, playlist_filter))[0]
        row = np.zeros(all_candidates_filter.shape)
        row[ones_idx] = 1 / playlist_filter.shape[0]
        rows_list.append(row)
    train_sparse = sp.sparse.csr_matrix(np.vstack(rows_list))
    return train_sparse

# Create Sparse Matricies

In [35]:
song_sm = createSparseMatrix(train_playlist_ids, all_unique_songs, "track_name")
artist_sm = createSparseMatrix(train_playlist_ids, all_unique_artists, "artist_name")
album_sm = createSparseMatrix(train_playlist_ids, all_unique_albums, "album_name")

# kNN with Collaborative Filtering on Songs

In [36]:
model_knn_songs = NearestNeighbors(metric='cosine', 
                                   algorithm='brute', 
                                   n_neighbors=50, n_jobs=-1).fit(song_sm)
model_knn_albums = NearestNeighbors(metric='cosine', 
                                    algorithm='brute', 
                                    n_neighbors=50, n_jobs=-1).fit(album_sm)
model_knn_artists = NearestNeighbors(metric='cosine', 
                                     algorithm='brute', 
                                     n_neighbors=50, n_jobs=-1).fit(artist_sm)

# Save Sparse Matricies

In [37]:
# sp.sparse.save_npz("sparse_train_tracks_final.npz", song_sm)
# sp.sparse.save_npz("sparse_train_artists_final.npz", artist_sm)
# sp.sparse.save_npz("sparse_train_albums_final.npz", album_sm)

# Create Sparse Vector for Test Playlists

In [38]:
def createSparseVector(test_playlist, all_candidates_filter, filter_method = "track_name"):
    # Takes a playlist and creates a sparse vector
    test_playlist_filter = test_playlist[filter_method]
    ones_idx = np.where(np.isin(all_candidates_filter, test_playlist_filter))
    row = np.zeros(all_candidates_filter.shape)
    row[ones_idx] = 1
    return row

# Split Test Set into Train and Validation

test_playlist_ids: These are the ids for the playlists that we want to make predictions on
- We split each test playlist into a "train" and "validation" set. 
- We create sparse matrices from the train part, to create a similarity value for the test playlist to existing playlists in our database. 3 are created for our 3 collaborative filtering techniques
- Then, we use the similarity value to choose playlists that are most similar and recommend the songs from that playlist

In [39]:
random.seed(109)
# Sparse matrices for the three collaborative filtering techniques
song_sm_list = []
album_sm_list = []
artist_sm_list = []
# Creating a list of validation songs that we later test against
val_list = []
test_list = []

# For each playlist in test playlists
for ids in test_playlist_ids:
    test_playlist = complete_playlists.loc[ids]
    
    # Split into test songs and validation songs
    idx = np.random.choice(np.arange(len(test_playlist)), int(0.8 * len(test_playlist)), replace=False)
    test_idx = np.where(np.isin(np.arange(len(test_playlist)), idx, invert=True))[0]
    predictor_playlist = test_playlist.iloc[idx]
    evaluation_playlist = test_playlist.iloc[test_idx]
    val_list.append(evaluation_playlist)
    test_list.append(predictor_playlist)
    
    # Make the sparse vector for each test playlist - cf on songs
    sparse_song_row = createSparseVector(predictor_playlist, all_unique_songs, "track_name")
    song_sm_list.append(sparse_song_row)
    
    # Make the sparse vector for each test playlist - cf on album
    sparse_album_row = createSparseVector(predictor_playlist, all_unique_albums, "album_name")
    album_sm_list.append(sparse_album_row)
    
    # Make the sparse vector for each test playlist - cf on artist
    sparse_artist_row = createSparseVector(predictor_playlist, all_unique_artists, "artist_name")
    artist_sm_list.append(sparse_artist_row)

# Compile all vectors for test playlists into a sparse matrix
test_song_sm = sp.sparse.csr_matrix(np.vstack(song_sm_list))
test_album_sm = sp.sparse.csr_matrix(np.vstack(album_sm_list))
test_artist_sm = sp.sparse.csr_matrix(np.vstack(artist_sm_list))

# Store the validation sets of each playlist
val_df = pd.concat(val_list)
test_indices = test_playlist_ids

In [230]:
# sp.sparse.save_npz("sparse_test_tracks_final.npz", test_song_sm)
# sp.sparse.save_npz("sparse_test_artists_final.npz", test_artist_sm)
# sp.sparse.save_npz("sparse_test_albums_final.npz", test_album_sm)

# Prediction Function

In [14]:
def getSongPredictions(X, model, train_indices, candidate_playlists, top_n, test_list, filter_method):
    distances, neighbors = model.kneighbors(X)
    predictions_list = []
    # For each playlist in the sparse matrix passed in
    for i in range(len(test_list)):
        best_candidates = neighbors[i][distances[i] != 1]
        best_distances = distances[i][distances[i] != 1]
        if best_candidates.size != 0:
            # get the IDs in candidate_playlists [our database of playlists] with the lowest distance
            candidate_idx = train_indices[best_candidates]
        else:
            # TODO: Change
            predictions_list.append(pd.DataFrame(columns = candidate_playlists.columns))
            continue
            
        candidates_df = candidate_playlists.loc[candidate_idx]
        
        # transform the best_distances into a form that can be concatenated
        lengths = candidates_df.groupby(level=[0,1])["pos"].count().values
        candidates_df[f"{filter_method}_distance"] = np.repeat(best_distances, lengths)
        
        # want only the songs that are not in the test playlist
        idx_songs_in_playlist = np.where(np.isin(candidates_df["track_name"].values,
                                                 test_list[i]["track_name"].values, invert=True))[0]
        not_in_train_df = candidates_df.loc[idx_songs_in_playlist]
        
        # drop duplicates from final recommendation list
        no_duplicates_df = not_in_train_df.drop_duplicates("track_name")
        if no_duplicates_df.shape[0] < top_n:
            predictions_list.append(no_duplicates_df)
        else:
            predictions_list.append(no_duplicates_df.iloc[0:top_n])
    return predictions_list

# Prediction with CF on Songs

In [43]:
predictions_cf_song = getSongPredictions(test_song_sm, model_knn_songs, 
                                         train_playlist_ids, complete_playlists, 
                                         500, test_list, "song")

In [44]:
np.mean([i.shape[0] for i in predictions_cf_song])

477.9425

# Prediction with CF on Artists

In [45]:
predictions_cf_artist = getSongPredictions(test_artist_sm, model_knn_artists, 
                                           train_playlist_ids, complete_playlists, 
                                           500, test_list, "artist")

In [46]:
np.mean([i.shape[0] for i in predictions_cf_artist])

485.4325

# Prediction with CF on Album

In [47]:
predictions_cf_album = getSongPredictions(test_album_sm, model_knn_albums, 
                                          train_playlist_ids, complete_playlists, 
                                          500, test_list, "album")

In [48]:
np.mean([i.shape[0] for i in predictions_cf_album])

477.6325

# Evaluate CF on Songs

In [15]:
def clickEvaluation(val_list, predictions_list):
    clicks = []
    for i in range(len(val_list)):
        predictions = predictions_list[i]
        val_playlist = val_list[i]
        positions_matching = np.where(np.isin(predictions["track_name"].values, val_playlist['track_name'].values))[0]
        if positions_matching.size == 0:
            clicks.append(50)
        else:
            clicks.append(math.floor(positions_matching[0] / 10))
    return np.mean(clicks)

In [82]:
clickEvaluation(val_list, predictions_cf_song)

16.805

# Evaluate CF on Artists

In [83]:
clickEvaluation(val_list, predictions_cf_artist)

16.0775

# Evaluate CF on Albums

In [84]:
clickEvaluation(val_list, predictions_cf_album)

15.39

# Load Song Info

In [16]:
tracks_with_info = pd.read_csv("total_songs_clean.csv")

In [17]:
tracks_with_info.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,type,id,uri,track_href,analysis_url,duration_ms,time_signature,artist_name,track_name,album_name
0,0.369,0.223,9,-15.993,1,0.0564,0.957,0.86,0.124,0.212,...,audio_features,3R8bbBLfCrl3wXELLDuZiA,spotify:track:3R8bbBLfCrl3wXELLDuZiA,https://api.spotify.com/v1/tracks/3R8bbBLfCrl3...,https://api.spotify.com/v1/audio-analysis/3R8b...,196733,4,Kagan Publishing,The Road to Lucia,Brain Boosters: Inspired Thinking
1,0.664,0.605,4,-5.477,0,0.258,0.242,0.0,0.161,0.397,...,audio_features,5Zk7ELDEXqcHZk2euHPVb9,spotify:track:5Zk7ELDEXqcHZk2euHPVb9,https://api.spotify.com/v1/tracks/5Zk7ELDEXqcH...,https://api.spotify.com/v1/audio-analysis/5Zk7...,192320,4,Cherine Anderson,Coming Over Tonight (feat. Chuck Fender),The Introduction - EP
2,0.74,0.712,9,-7.26,0,0.0814,0.0718,0.0,0.33,0.742,...,audio_features,5tg8mN8KqRfwIvkXF0tRr4,spotify:track:5tg8mN8KqRfwIvkXF0tRr4,https://api.spotify.com/v1/tracks/5tg8mN8KqRfw...,https://api.spotify.com/v1/audio-analysis/5tg8...,276093,4,Kirk Franklin,Keep Your Head,Hero
3,0.623,0.863,1,-8.161,0,0.0776,0.000811,2e-06,0.128,0.734,...,audio_features,10qeFHxbayfrkH2kX9kTnv,spotify:track:10qeFHxbayfrkH2kX9kTnv,https://api.spotify.com/v1/tracks/10qeFHxbayfr...,https://api.spotify.com/v1/audio-analysis/10qe...,136067,4,Limp,Bug Dance,Pop & Disorderly
4,0.413,0.0302,10,-26.692,1,0.035,0.957,0.443,0.113,0.254,...,audio_features,18LVcYNiF05oDfEQ7HEhNR,spotify:track:18LVcYNiF05oDfEQ7HEhNR,https://api.spotify.com/v1/tracks/18LVcYNiF05o...,https://api.spotify.com/v1/audio-analysis/18LV...,233627,4,Ludwig van Beethoven,Beethoven : Symphony No.8 in F major Op.93 : I...,Beethoven : Symphonies Nos 1 - 9


In [18]:
matching_songs = complete_playlists.loc[complete_playlists["track_uri"].isin(tracks_with_info["uri"]), "track_uri"].unique()

In [19]:
matching_songs.shape

(34227,)

In [20]:
missing_songs = complete_playlists.loc[~complete_playlists["track_uri"].isin(tracks_with_info["uri"]), "track_uri"].unique()

In [21]:
missing_songs.shape

(25615,)

In [22]:
all_unique_songs.shape

(50757,)

# Need to Scrape the Remaining Songs

In [23]:
to_fetch = []
for track in missing_songs:
    to_fetch.append(track.split(":")[-1])

In [73]:
track_ids_filtered = to_fetch
api_key = "BQBzq0fYBAPzwjLdM2G9NmhFkLrN-oI8TwkTlHqvATb-zCYxYqOwRP99Ik7VlJY4grVO0JT_9f04TMs8Yfx42Imnya4hOXCYvlc6w9qsuVRXwXPGGQAl9rTOQuWpLY-BO1Xepx5EN0mUpPJQ1NkawmTtm4muqXauvOs"

In [75]:
BASE_URL = "https://api.spotify.com/v1/audio-features?ids="
features_dfs = []
for i in range(0, len(track_ids_filtered), 100):
    # print(i)
    end = i+100 if i+100 < len(track_ids_filtered) else len(track_ids_filtered)
    response = requests.get(BASE_URL + ",".join(track_ids_filtered[i:end]), 
                            headers = {"Authorization": "Bearer " + api_key})
    while response.status_code != 200:
        if response.status_code == 429:
            retry_after = int(response.headers['Retry-After'])
            print(f"Too many requests. Sleeping for {retry_after} seconds.")
            time.sleep(retry_after)
        else:
            print("Request failed! Trying again in 2 seconds.")
            time.sleep(2)
        response = requests.get(BASE_URL + ",".join(track_ids_filtered[i:end]), 
                                headers = {"Authorization": "Bearer " + api_key})
    if "audio_features" in response.json():
        features_dfs.append(response.json()["audio_features"])

In [76]:
# filter out jsons that are None!
df_list = []
for request in features_dfs:
    for i in request:
        if i:
            df_list.append(i)
fetched_songs = pd.DataFrame(df_list)

In [96]:
name_uri = complete_playlists.loc[complete_playlists["track_uri"].isin(fetched_songs["uri"]), ["artist_name", "track_name", "album_name", "track_uri"]].groupby("track_uri").first().reset_index()

In [99]:
name_uri = name_uri.sort_values("track_uri")

In [105]:
fetched_songs = fetched_songs.sort_values("uri").reset_index(drop=True)

In [109]:
fetched_songs["artist_name"] = name_uri.loc[name_uri["track_uri"] == fetched_songs["uri"], "artist_name"]
fetched_songs["track_name"] = name_uri.loc[name_uri["track_uri"] == fetched_songs["uri"], "track_name"]
fetched_songs["album_name"] = name_uri.loc[name_uri["track_uri"] == fetched_songs["uri"], "album_name"]

In [111]:
fetched_songs.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,type,id,uri,track_href,analysis_url,duration_ms,time_signature,artist_name,track_name,album_name
0,0.631,0.513,2,-6.376,1,0.0293,0.366,4e-06,0.109,0.307,...,audio_features,000DfZJww8KiixTKuk9usJ,spotify:track:000DfZJww8KiixTKuk9usJ,https://api.spotify.com/v1/tracks/000DfZJww8Ki...,https://api.spotify.com/v1/audio-analysis/000D...,357573,4,Mike Love,Earthlings,The Change I'm Seeking
1,0.496,0.0481,7,-26.668,0,0.052,0.979,0.767,0.11,0.125,...,audio_features,000JBgYWfJQqdFaRqu2n3f,spotify:track:000JBgYWfJQqdFaRqu2n3f,https://api.spotify.com/v1/tracks/000JBgYWfJQq...,https://api.spotify.com/v1/audio-analysis/000J...,227627,4,Joe Pass,Li'l Darlin',Portrait
2,0.48,0.484,7,-8.627,0,0.0265,0.525,0.28,0.0897,0.418,...,audio_features,000WiulMHlYGJ3fQKnRkfz,spotify:track:000WiulMHlYGJ3fQKnRkfz,https://api.spotify.com/v1/tracks/000WiulMHlYG...,https://api.spotify.com/v1/audio-analysis/000W...,257347,4,Smooth Jazz All Stars,No One Will Do,Smooth Jazz Tribute To Mary J. Blige Volume 2
3,0.509,0.803,0,-6.743,1,0.04,0.684,0.000539,0.463,0.651,...,audio_features,000xYdQfIZ4pDmBGzQalKU,spotify:track:000xYdQfIZ4pDmBGzQalKU,https://api.spotify.com/v1/tracks/000xYdQfIZ4p...,https://api.spotify.com/v1/audio-analysis/000x...,187119,4,Luan Santana,"Eu, Você, O Mar e Ela",1977
4,0.531,0.903,7,-3.006,1,0.045,0.0135,0.0,0.113,0.316,...,audio_features,0012kFta5wCFogveUeJN2z,spotify:track:0012kFta5wCFogveUeJN2z,https://api.spotify.com/v1/tracks/0012kFta5wCF...,https://api.spotify.com/v1/audio-analysis/0012...,203733,4,Deaf Havana,Subterranean Bullshit Blues,Old Souls


In [122]:
tracks_with_info = tracks_with_info.groupby("uri").first().reset_index()

In [125]:
matching_track_df = tracks_with_info.loc[tracks_with_info["uri"].isin(matching_songs)]

In [136]:
all_acoustic_features = pd.concat([matching_track_df, fetched_songs], axis=0, sort=True)

In [137]:
all_acoustic_features.to_csv("acoustic_features_10k_playlists.csv", index=False)

In [139]:
all_tracks = pd.concat([tracks_with_info, fetched_songs], axis=0, sort=True)

In [140]:
all_tracks.to_csv("total_songs_clean_v2.csv", index=False)

# Explore

In [153]:
all_acoustic_features["track_name"]

0              Still Got Time
1            It's Alright Now
2                   Wet Jeans
3                      Ginger
5          Yee - Original Mix
                 ...         
110476              Piano Boy
110477         Fear & Delight
110478                Iceberg
110479    Let's Go Fly a Kite
110480            Snow Fields
Name: track_name, Length: 174190, dtype: object

In [155]:
all_unique_songs

array(['Vestiges', 'River - Live at SXSW 2015', 'Man On Fire', ...,
       "Rock 'N' Roll All Nite - Live", 'All Apologies - Demo',
       'Polly - Live In London / 1989'], dtype=object)

In [247]:
np.save("test_playlist_ids", train_playlist_ids)

# Make a training matrix for NN

In [88]:
def AdjustedRPrecision(predictions_list, val_list):
    # Takes a list of df's of predicted songs and a list of df's of validation playlist
    adjusted_r_precision_list = list()
    precision_list = list()
    valid_predictions_list = list()
    for i in range(len(predictions_list)):
        if len(predictions_list[i]) == 0:
            adjusted_r_precision_list.append(0)
            continue
        valid_song = np.isin(predictions_list[i]["track_name"], val_list[i]["track_name"]) 
        valid_artist = np.isin(predictions_list[i]["artist_name"], val_list[i]["artist_name"])
        valid_album = np.isin(predictions_list[i]["album_name"], val_list[i]["album_name"])
        valid_predictions = (valid_song | valid_artist | valid_album).astype(int)
        valid_predictions_list.append(valid_predictions)
        precision_list.append(valid_song.astype('int'))
        adjusted_r_precision_list.append(sum(valid_predictions) / predictions_list[i].shape[0])
        
    return adjusted_r_precision_list, valid_predictions_list, precision_list

In [40]:
tracks_to_consider = pd.read_csv("acoustic_features_10k_playlists.csv").set_index("uri")

In [41]:
desired_features = ["danceability", "energy", "loudness", 
                      "speechiness", "acousticness", 
                      "instrumentalness", "liveness", "valence", "mode"]

In [103]:
rows_list = []
for i in range(len(predictions_cf_song)):
    to_concat = []
    if predictions_cf_song[i].shape[0] != 0:
        to_concat.append(predictions_cf_song[i])
    if predictions_cf_album[i].shape[0] != 0:
        to_concat.append(predictions_cf_album[i])
    if predictions_cf_artist[i].shape[0] != 0:
        to_concat.append(predictions_cf_artist[i])
    if not to_concat:
        rows_list.append([])
        continue
        
    total = pd.concat(to_concat, axis=0, sort=False)
    if "song_distance" not in total.columns:
        total["song_distance"] = np.nan
    if "artist_distance" not in total.columns:
        total["artist_distance"] = np.nan
    if "album_distance" not in total.columns:
        total["album_distance"] = np.nan
        
    total_grouped = total.groupby("track_name", sort=False)
    unique_preds = total.drop_duplicates("track_name").copy().reset_index()
    unique_preds['song_distance'] = total_grouped['song_distance'].max().reset_index(drop=True)
    unique_preds['artist_distance'] = total_grouped['artist_distance'].max().reset_index(drop=True)
    unique_preds['album_distance'] = total_grouped['album_distance'].max().reset_index(drop=True)
    unique_preds.set_index(["fid", "pid"], inplace=True)
    unique_preds.sort_index(inplace=True)
    
    # get where we are missing values
    missing_vals_idx = np.unique(unique_preds.loc[unique_preds['song_distance'].isna() 
                                         | unique_preds['artist_distance'].isna()
                                         | unique_preds['album_distance'].isna()].index)
    
    for idx in missing_vals_idx:
        train_row = np.where([train_id == idx for train_id in train_playlist_ids])[0][0]
        song_val = cosine_distances(song_sm[train_row], test_song_sm[i])[0][0]
        artist_val = cosine_distances(artist_sm[train_row], test_artist_sm[i])[0][0]
        album_val = cosine_distances(album_sm[train_row], test_album_sm[i])[0][0]
        unique_preds.loc[idx, ('song_distance', 'artist_distance', 'album_distance')] = (song_val, artist_val, album_val)
        
    # add cosine similarity with musical features
    pred_feats = tracks_to_consider.loc[unique_preds['track_uri'], desired_features]
    test_feats = tracks_to_consider.loc[test_list[i]['track_uri'], desired_features]
    avg_sim = np.mean(cosine_similarity(test_feats, pred_feats), axis=0)
    unique_preds['musical_similarity'] = avg_sim
    unique_preds['num_seed_songs'] = test_list[i].shape[0]
    
    # Add validation fid, pid
    unique_preds['val_fid'] = val_list[i].index[0][0]
    unique_preds['val_pid'] = val_list[i].index[0][1]
    
    rows_list.append(unique_preds)

In [104]:
len(rows_list)

400

In [105]:
adjusted_r_precision_list, valid_predictions_list, precision_list = AdjustedRPrecision(rows_list, val_list)

In [106]:
np.mean(adjusted_r_precision_list)

0.06766370530758968

In [115]:
response = np.concatenate(valid_predictions_list)

In [116]:
filtered_rows = []
for row in rows_list:
    if len(row) > 0:
        filtered_rows.append(row)

In [117]:
nn_train_df = pd.concat(filtered_rows, sort=False)

In [118]:
nn_train_df['response'] = response

In [119]:
scaled_duration = StandardScaler().fit_transform(nn_train_df['duration_ms'].values.reshape(-1,1))

In [120]:
nn_train_df['scaled_duration'] = scaled_duration.reshape(-1,)

In [121]:
final_features_train = nn_train_df[['scaled_duration', 'num_seed_songs', 'song_distance', 'artist_distance', 'album_distance', 'musical_similarity', 'response', 'val_fid', 'val_pid']]

In [122]:
final_features_train.to_csv("nn_test_rpres.csv")