In [1]:
import os
import sys
import glob
import time
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import random
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
import scipy as sp

Outline:
* Want to create a training/test set to try CF on
* Create SciPy sparse matrix from unique songs
    - Rows: Playlists
    - Columns: Songs in a given playlist
* Filter + predict songs that are in the other playlists that have lowest distance

POSSIBLE CHANGE: If a song appears multiple times in a single playlist, use number of occurrences rather than just presence

In [2]:
total_songs = pd.read_csv("total_songs_clean.csv")

In [3]:
complete_playlists = pd.read_csv("complete_playlists_50.csv")

## Get train/test playlists

In [4]:
unique_playlist_ids = complete_playlists[["pid", "fid"]].groupby(['pid', "fid"]).first().reset_index()

In [5]:
train_playlist_ids = unique_playlist_ids.sample(45)

In [6]:
test_playlist_ids = unique_playlist_ids.loc[-(unique_playlist_ids["pid"].isin(train_playlist_ids["pid"]) & 
                                              unique_playlist_ids["fid"].isin(train_playlist_ids["fid"]))]

In [7]:
test_playlist_ids.head()

Unnamed: 0,pid,fid
6,190,113
37,824,342
39,844,858
42,869,436
44,905,965


## Create SciPy Spare Matrix

In [8]:
col_names = complete_playlists['track_name'].unique()
train_indices = []
complete_playlists.set_index(["fid", "pid"], inplace=True)

In [9]:
rows_list = []
for fid, pid in zip(train_playlist_ids["fid"], train_playlist_ids["pid"]):
    train_indices.append((fid, pid))
    playlist_songs = complete_playlists.loc[pd.Index([fid, pid]), "track_name"].values
    ones_idx = np.where(np.in1d(col_names, playlist_songs))[0]
    row = np.zeros(col_names.shape)
    row[ones_idx] = 1
    rows_list.append(row)
train_df = sp.sparse.csr_matrix(np.vstack(rows_list))
train_indices = np.array(train_indices)

## Use kNN with cosine distance to find nearest neighbors

In [10]:
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1).fit(train_df)

## Actually make predictions!

In [11]:
test_indices = []
rows_list = []
val_list = []
for fid, pid in zip(test_playlist_ids["fid"], test_playlist_ids["pid"]):
    test_indices.append((fid, pid))
    playlist_songs = complete_playlists.loc[pd.Index([fid, pid])]
    idx = np.random.choice(np.arange(len(playlist_songs)), int(0.8 * len(playlist_songs)), replace=False)
    test_idx = np.where(np.in1d(np.arange(len(playlist_songs)), idx, invert=True))[0]
    songs_to_use = playlist_songs["track_name"].values[idx]
    songs_to_evaluate = playlist_songs.iloc[test_idx]
    val_list.append(songs_to_evaluate)
    ones_idx = np.where(np.in1d(col_names, songs_to_use))[0]
    row = np.zeros(col_names.shape)
    row[ones_idx] = 1
    rows_list.append(row)
test_df = sp.sparse.csr_matrix(np.vstack(rows_list))
val_df = pd.concat(val_list)
test_indices = np.array(test_indices)

In [32]:
def getSongPredictions(X, model, indices, candidate_playlists, top_n, col_names):
    distances, neighbors = model.kneighbors(X)
    predictions_list = []
    for i in range(X.shape[0]):
        best_candidates = neighbors[i][distances[i] != 1]
        best_distances = distances[i][distances[i] != 1]
        if best_candidates.size != 0:
            # get the place in candidate_playlists with the lowest distance
            candidate_idx = indices[best_candidates]
        else:
            # TODO: Change
            predictions_list.append(np.array([]))
            continue
        candidates_df = candidate_playlists.loc[pd.Index(candidate_idx)].copy()
        
        # transform the best_distances into a form that can be concatenated
        lengths = candidates_df.groupby(level=[0,1])["pos"].count().values
        candidates_df["distance"] = np.repeat(best_distances, lengths)
        
        # want only the songs that are not in the playlist
        not_in_train_df = candidates_df.iloc[np.where(np.in1d(candidates_df["track_name"].values, 
                                                      col_names[X[i].toarray()[0] == 1], invert=True))[0]]
        
        # drop duplicates
        no_duplicates_df = not_in_train_df.drop_duplicates("track_name")
        if no_duplicates_df.shape[0] < top_n:
            predictions_list.append(no_duplicates_df)
        else:
            predictions_list.append(no_duplicates_df.iloc[0:top_n])
    return predictions_list

In [29]:
X = test_df
top_n = 500
distances, neighbors = model_knn.kneighbors(X)
predictions_list = []
for i in range(X.shape[0]):
    best_candidates = neighbors[i][distances[i] != 1]
    best_distances = distances[i][distances[i] != 1]
    if best_candidates.size != 0:
        candidate_idx = train_indices[best_candidates]
    else:
        predictions_list.append(np.array([]))
        continue   
    
    df = complete_playlists.loc[pd.Index(candidate_idx)].copy()
    lengths = df.groupby(level=[0,1])["pos"].count().values
    df["distance"] = np.repeat(best_distances, lengths)
    not_in_train_df = df.iloc[np.where(np.in1d(df["track_name"].values, 
                                               col_names[X[i].toarray()[0] == 1], invert=True))[0]]
    no_duplicates_df = not_in_train_df.drop_duplicates("track_name")
    if no_duplicates_df.shape[0] < top_n:
        predictions_list.append(no_duplicates_df)
    else:
        predictions_list.append(no_duplicates_df.iloc[0:top_n])

In [14]:
# X, model, test_indicies, candidate_playlists, top_n, col_names

In [33]:
getSongPredictions(test_df, model_knn, train_indices, complete_playlists, 500, col_names)

[         pos            artist_name                             track_uri  \
 fid pid                                                                     
 341 453    0               Hot Date  spotify:track:08R44s3bTSpm3LSnVG2nF2   
     453    1             Atlas Plug  spotify:track:5K9ka2I5o2T0tmFSpysNqW   
     453    2  The Chemical Brothers  spotify:track:6zIkiZqUnSMsuRJ6acyJuD   
     453    3        Yeah Yeah Yeahs  spotify:track:64zjna5dIuO7s9FCDXQMmI   
     453    4               Kid Cudi  spotify:track:7m47Go71qTMBs4kTH7U8F8   
 ...      ...                    ...                                   ...   
 451 640   60                 Eminem  spotify:track:1mavQ4WCzXSeL2Dm5DS4GQ   
     640   61                 Eminem  spotify:track:2bOkfsxbuhFDM1zmUiUxcH   
     640   62                 Eminem  spotify:track:06aw4JI5nYa8XBeBKVrltZ   
     640   63                 Eminem  spotify:track:6sDQ4uiWw9OdVrCXFLSlZt   
     640   64                 Eminem  spotify:track:1o2G8C2gInZn