In [67]:
import os
import sys
import glob
import time
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import random
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
import scipy as sp

Outline:
* Want to create a training/test set to try CF on
* Create SciPy sparse matrix from unique songs
    - Rows: Playlists
    - Columns: Songs in a given playlist
* Filter + predict songs that are in the other playlists that have lowest distance

POSSIBLE CHANGE: If a song appears multiple times in a single playlist, use number of occurrences rather than just presence

In [2]:
total_songs = pd.read_csv("total_songs_clean.csv")

In [341]:
complete_playlists = pd.read_csv("complete_playlists_50.csv")

## Get train/test playlists

In [10]:
unique_playlist_ids = complete_playlists[["pid", "fid"]].groupby(['pid', "fid"]).first().reset_index()

In [114]:
train_playlist_ids = unique_playlist_ids.sample(45)

In [115]:
test_playlist_ids = unique_playlist_ids.loc[-(unique_playlist_ids["pid"].isin(train_playlist_ids["pid"]) & 
                                              unique_playlist_ids["fid"].isin(train_playlist_ids["fid"]))]

In [116]:
test_playlist_ids.head()

Unnamed: 0,pid,fid
6,190,113
12,314,508
17,453,341
40,851,320
47,979,892


## Create SciPy Spare Matrix

In [342]:
col_names = complete_playlists['track_name'].unique()
train_indicies = []
complete_playlists.set_index(["fid", "pid"], inplace=True)

In [343]:
rows_list = []
for fid, pid in zip(train_playlist_ids["fid"], train_playlist_ids["pid"]):
    train_indicies.append((fid, pid))
    playlist_songs = complete_playlists.loc[pd.Index([fid, pid]), "track_name"].values
    ones_idx = np.where(np.in1d(col_names, playlist_songs))[0]
    row = np.zeros(col_names.shape)
    row[ones_idx] = 1
    rows_list.append(row)
train_df = sp.sparse.csr_matrix(np.vstack(rows_list))
train_indicies = np.array(train_indicies)

## Use kNN with cosine distance to find nearest neighbors

In [344]:
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1).fit(train_df)

## Actually make predictions!

In [345]:
test_indicies = []
rows_list = []
val_list = []
for fid, pid in zip(test_playlist_ids["fid"], test_playlist_ids["pid"]):
    test_indicies.append((fid, pid))
    playlist_songs = complete_playlists.loc[pd.Index([fid, pid])]
    idx = np.random.choice(np.arange(len(playlist_songs)), int(0.8 * len(playlist_songs)), replace=False)
    test_idx = np.where(np.in1d(np.arange(len(playlist_songs)), idx, invert=True))[0]
    songs_to_use = playlist_songs["track_name"].values[idx]
    songs_to_evaluate = playlist_songs.iloc[test_idx]
    val_list.append(songs_to_evaluate)
    ones_idx = np.where(np.in1d(col_names, songs_to_use))[0]
    row = np.zeros(col_names.shape)
    row[ones_idx] = 1
    rows_list.append(row)
test_df = sp.sparse.csr_matrix(np.vstack(rows_list))
val_df = pd.concat(val_list)
test_indicies = np.array(test_indicies)

In [347]:
def getSongPredictions(X, model, test_indicies, candidate_playlists, top_n, col_names):
    distances, neighbors = model.kneighbors(X)
    predictions_list = []
    for i in range(X.shape[0]):
        best_candidates = neighbors[i][distances[i] != 1]
        best_distances = distances[i][distances[i] != 1]
        if best_candidates.size != 0:
            candidate_idx = test_indicies[best_candidates]
        else:
            predictions_list.append(np.array([]))
            continue
        candidates_df = candidate_playlists.loc[pd.Index(candidate_idx)].copy()
        
        # transform the best_distances into a form that can be concatenated
        lengths = candidates_df.groupby(level=[0,1])["pos"].count().values
        candidates_df["distance"] = np.repeat(best_distances, lengths)
        
        # want only the songs that are not in the playlist
        np.where(np.in1d(candidates_df["track_name"].values, 

In [348]:
X = test_df
distances, neighbors = model_knn.kneighbors(X)
predictions_list = []
for i in range(X.shape[0]):
    best_candidates = neighbors[i][distances[i] != 1]
    best_distances = distances[i][distances[i] != 1]
    if best_candidates.size != 0:
        candidate_idx = train_indicies[best_candidates]
        break
    else:
        predictions_list.append(np.array([]))
        continue   
    
df = complete_playlists.loc[pd.Index(candidate_idx)].copy()
lengths = df.groupby(level=[0,1])["pos"].count().values
df["distance"] = np.repeat(best_distances, lengths)


In [360]:
np.reshape(X[i], [X[i].shape[1],])

<1x3078 sparse matrix of type '<class 'numpy.float64'>'
	with 39 stored elements in Compressed Sparse Row format>