In [1]:
import os
import sys
import glob
import time
import datetime
import math


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import random
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
import scipy as sp

Outline:
* Want to create a training/test set to try CF on
* Create SciPy sparse matrix from unique songs
    - Rows: Playlists
    - Columns: Songs in a given playlist
* Filter + predict songs that are in the other playlists that have lowest distance

POSSIBLE CHANGE: If a song appears multiple times in a single playlist, use number of occurrences rather than just presence

TODO: TIME THE RUNTIME

In [2]:
total_songs = pd.read_csv("total_songs_clean.csv")

In [21]:
complete_playlists = pd.read_csv("playlist_data.csv")

## Get train/test playlists

In [27]:
unique_playlist_ids = np.unique(np.array(complete_playlists[["fid", "pid"]]), axis=0)

In [33]:
np.random.shuffle(unique_playlist_ids)

In [34]:
train_idx_cutoff = int(0.8 * unique_playlist_ids.shape[0])
train_playlist_ids = unique_playlist_ids[0:train_idx_cutoff]
test_playlist_ids = unique_playlist_ids[train_idx_cutoff:unique_playlist_ids.shape[0]]

## Create SciPy Spare Matrix

In [None]:
TODO: Change to numpy

In [35]:
col_names = complete_playlists['track_name'].unique()
train_indices = []
complete_playlists.set_index(["fid", "pid"], inplace=True)

KeyboardInterrupt: 

In [9]:
rows_list = []
for fid, pid in zip(train_playlist_ids["fid"], train_playlist_ids["pid"]):
    train_indices.append((fid, pid))
    playlist_songs = complete_playlists.loc[pd.Index([fid, pid]), "track_name"].values
    ones_idx = np.where(np.in1d(col_names, playlist_songs))[0]
    row = np.zeros(col_names.shape)
    row[ones_idx] = 1 / playlist_songs.shape[0]
    rows_list.append(row)
train_df = sp.sparse.csr_matrix(np.vstack(rows_list))
train_indices = np.array(train_indices)

## Use kNN with cosine distance to find nearest neighbors

In [10]:
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1).fit(train_df)

## Actually make predictions!

In [11]:
test_indices = []
rows_list = []
val_list = []
for fid, pid in zip(test_playlist_ids["fid"], test_playlist_ids["pid"]):
    test_indices.append((fid, pid))
    playlist_songs = complete_playlists.loc[pd.Index([fid, pid])]
    idx = np.random.choice(np.arange(len(playlist_songs)), int(0.8 * len(playlist_songs)), replace=False)
    test_idx = np.where(np.in1d(np.arange(len(playlist_songs)), idx, invert=True))[0]
    songs_to_use = playlist_songs["track_name"].values[idx]
    songs_to_evaluate = playlist_songs.iloc[test_idx]
    val_list.append(songs_to_evaluate)
    ones_idx = np.where(np.in1d(col_names, songs_to_use))[0]
    row = np.zeros(col_names.shape)
    row[ones_idx] = 1 # / songs_to_use.shape[0]
    rows_list.append(row)
test_df = sp.sparse.csr_matrix(np.vstack(rows_list))
val_df = pd.concat(val_list)
test_indices = np.array(test_indices)

In [12]:
def getSongPredictions(X, model, indices, candidate_playlists, top_n, col_names):
    distances, neighbors = model.kneighbors(X)
    predictions_list = []
    for i in range(X.shape[0]):
        best_candidates = neighbors[i][distances[i] != 1]
        best_distances = distances[i][distances[i] != 1]
        if best_candidates.size != 0:
            # get the place in candidate_playlists with the lowest distance
            candidate_idx = indices[best_candidates]
        else:
            # TODO: Change
            predictions_list.append(pd.DataFrame(columns = candidate_playlists.columns))
            continue
            
        candidates_df = candidate_playlists.loc[list(map(tuple, candidate_idx))]
        
        # transform the best_distances into a form that can be concatenated
        lengths = candidates_df.groupby(level=[0,1])["pos"].count().values
        

        candidates_df["distance"] = np.repeat(best_distances, lengths)
        
        # want only the songs that are not in the playlist
        not_in_train_df = candidates_df.iloc[np.where(np.in1d(candidates_df["track_name"].values, 
                                                      col_names[X[i].toarray()[0] != 0], invert=True))[0]]
        
        # drop duplicates
        no_duplicates_df = not_in_train_df.drop_duplicates("track_name")
        if no_duplicates_df.shape[0] < top_n:
            predictions_list.append(no_duplicates_df)
        else:
            predictions_list.append(no_duplicates_df.iloc[0:top_n])
    return predictions_list

In [13]:
predictions_list = getSongPredictions(test_df, model_knn, train_indices, complete_playlists, 500, col_names)

## Evaluate Predictions

In [14]:
test_indices

array([[167,  82],
       [172, 599],
       [897, 643],
       [447, 778],
       [865, 866]])

In [15]:
clicks = []
for i in range(test_df.shape[0]):
    predictions = predictions_list[i]
    idx = test_indices[i]
    val_songs = val_df.loc[tuple(idx)]
    positions_matching = np.where(np.in1d(predictions["track_name"].values, val_songs['track_name']))[0]
    # CLICKS: min pos / 10
    if positions_matching.size == 0:
        clicks.append(50)
    else:
        clicks.append(math.ceil(positions_matching[0] / 10))

  """


In [16]:
clicks

[10, 5, 50, 1, 50]

In [17]:
distances, neighbors = model_knn.kneighbors(test_df)

In [18]:
np.mean(clicks)

23.2

In [19]:
val_songs["track_name"]

fid  pid
865  866                 BTSTU (Edit)
     866                 Breezeblocks
     866                 Paper Planes
     866                    I Met You
     866                   Lofticries
     866    Attak (feat. Danny Brown)
     866                  Message Man
     866           The Devil Is A Lie
     866         X Gon' Give It To Ya
     866                   Devil Eyes
     866               No Role Modelz
     866                     Sabotage
     866                    One Night
     866              Congratulations
     866                           17
     866                      No Type
     866     All Along the Watchtower
     866                  Purple Haze
     866                    Rivertown
Name: track_name, dtype: object

In [20]:
predictions["track_name"][positions_matching]

Series([], Name: track_name, dtype: object)