In [1]:
import os
import sys
import glob
import time
import datetime
import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import random
import scipy as sp

from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from functools import partial
from multiprocessing.pool import Pool
from time import time

# Read in CSVs | Add FID | Concatenate

In [2]:
file_zero = pd.read_csv("Songs/songs0.csv")
file_zero["fid"] = 0
file_one = pd.read_csv("Songs/songs1.csv")
file_one["fid"] = 1
complete_playlists = pd.DataFrame(np.concatenate([file_zero, file_one]), columns = file_zero.columns)

# Set index to ["fid", "pid"]

In [4]:
complete_playlists.set_index(["fid", "pid"], inplace=True)
complete_playlists.sort_index(inplace = True) # Sort index to make indexing faster later

# Get all unique songs, artists, and albums for Collaborative Filtering

In [6]:
all_unique_songs = complete_playlists['track_name'].unique()
all_unique_albums = complete_playlists['album_name'].unique()
all_unique_artists = complete_playlists['artist_name'].unique()

# Make unique playlist ID's and split into train and test

In [7]:
unique_playlist_ids = np.unique(complete_playlists.index.values)
np.random.shuffle(unique_playlist_ids)

# 0.8 Test and 0.2 Train Split
train_idx_cutoff = int(0.8 * unique_playlist_ids.shape[0])
train_playlist_ids = unique_playlist_ids[0:train_idx_cutoff]
test_playlist_ids = unique_playlist_ids[train_idx_cutoff:unique_playlist_ids.shape[0]]

# Create Sparse Matrix

In [8]:
# Create sparse matrix based on passed in filter methods
def createSparseMatrix(train_playlist_ids, all_candidates_filter, filter_method = "track_name"):    
    rows_list = []
    for ids in train_playlist_ids:
        playlist_filter = complete_playlists.loc[ids][filter_method]
        ones_idx = np.where(np.isin(all_candidates_filter, playlist_filter))[0]
        row = np.zeros(all_candidates_filter.shape)
        row[ones_idx] = 1 / playlist_filter.shape[0]
        rows_list.append(row)
    train_sparse = sp.sparse.csr_matrix(np.vstack(rows_list))
    return train_sparse

# kNN with Collaborative Filtering on Songs

In [9]:
song_sm = createSparseMatrix(train_playlist_ids, all_unique_songs, "track_name")
model_knn_songs = NearestNeighbors(metric='cosine', 
                                        algorithm='brute', 
                                        n_neighbors=20, n_jobs=-1).fit(song_sm)

# kNN with Collaborative Filtering on Artists

In [62]:
artist_sm = createSparseMatrix(train_playlist_ids, all_unique_artists, "artist_name")
model_knn_artists = NearestNeighbors(metric='cosine', 
                                        algorithm='brute', 
                                        n_neighbors=20, n_jobs=-1).fit(artist_sm)

# kNN with Collaborative Filtering on Albums

In [11]:
album_sm = createSparseMatrix(train_playlist_ids, all_unique_albums, "album_name")
model_knn_albums = NearestNeighbors(metric='cosine', 
                                        algorithm='brute', 
                                        n_neighbors=20, n_jobs=-1).fit(album_sm)

# Create Sparse Vector for Test Playlists

In [46]:
def createSparseVector(test_playlist, all_candidates_filter, filter_method = "track_name"):
    # Takes a playlist and creates a sparse vector
    test_playlist_filter = test_playlist[filter_method]
    ones_idx = np.where(np.isin(all_candidates_filter, test_playlist_filter))
    row = np.zeros(all_candidates_filter.shape)
    row[ones_idx] = 1
    return row

# Split Test Set into Train and Validation

test_playlist_ids: These are the ids for the playlists that we want to make predictions on
- We split each test playlist into a "train" and "validation" set. 
- We create sparse matrices from the train part, to create a similarity value for the test playlist to existing playlists in our database. 3 are created for our 3 collaborative filtering techniques
- Then, we use the similarity value to choose playlists that are most similar and recommend the songs from that playlist

In [140]:
random.seed(109)
# Sparse matrices for the three collaborative filtering techniques
song_sm_list = []
album_sm_list = []
artist_sm_list = []
# Creating a list of validation songs that we later test against
val_list = []

# For each playlist in test playlists
for ids in test_playlist_ids:
    test_playlist = complete_playlists.loc[ids]
    
    # Split into test songs and validation songs
    idx = np.random.choice(np.arange(len(test_playlist)), int(0.8 * len(test_playlist)), replace=False)
    test_idx = np.where(np.isin(np.arange(len(test_playlist)), idx, invert=True))[0]
    predictor_playlist = test_playlist.iloc[idx]
    evaluation_playlist = test_playlist.iloc[test_idx]
    val_list.append(evaluation_playlist)
    
    # Make the sparse vector for each test playlist - cf on songs
    sparse_song_row = createSparseVector(predictor_playlist, all_unique_songs, "track_name")
    song_sm_list.append(sparse_song_row)
    
    # Make the sparse vector for each test playlist - cf on album
    sparse_album_row = createSparseVector(predictor_playlist, all_unique_albums, "album_name")
    album_sm_list.append(sparse_album_row)
    
    # Make the sparse vector for each test playlist - cf on artist
    sparse_artist_row = createSparseVector(predictor_playlist, all_unique_artists, "artist_name")
    artist_sm_list.append(sparse_artist_row)

# Compile all vectors for test playlists into a sparse matrix
song_sm = sp.sparse.csr_matrix(np.vstack(song_sm_list))
album_sm = sp.sparse.csr_matrix(np.vstack(album_sm_list))
artist_sm = sp.sparse.csr_matrix(np.vstack(artist_sm_list))

# Store the validation sets of each playlist
val_df = pd.concat(val_list)
test_indices = test_playlist_ids

# Prediction Function

In [133]:
def getSongPredictions(X, model, train_indices, candidate_playlists, top_n, test_indices):
    distances, neighbors = model.kneighbors(X)
    predictions_list = []
    # For each playlist in the sparse matrix passed in
    for i in range(test_indices.shape[0]):
        best_candidates = neighbors[i][distances[i] != 1]
        best_distances = distances[i][distances[i] != 1]
        if best_candidates.size != 0:
            # get the IDs in candidate_playlists [our database of playlists] with the lowest distance
            candidate_idx = train_indices[best_candidates]
        else:
            # TODO: Change
            predictions_list.append(pd.DataFrame(columns = candidate_playlists.columns))
            continue
            
        candidates_df = candidate_playlists.loc[candidate_idx]
        
        # transform the best_distances into a form that can be concatenated
        lengths = candidates_df.groupby(level=[0,1])["pos"].count().values
        candidates_df["distance"] = np.repeat(best_distances, lengths)
        
        # want only the songs that are not in the test playlist
        idx_songs_in_playlist = np.where(np.isin(candidates_df["track_name"].values,
                                                 complete_playlists.loc[test_indices[i]]["track_name"].values))[0]
        not_in_train_df = candidates_df.iloc[idx_songs_in_playlist]
        
        # drop duplicates from final recommendation list
        no_duplicates_df = not_in_train_df.drop_duplicates("track_name")
        if no_duplicates_df.shape[0] < top_n:
            predictions_list.append(no_duplicates_df)
        else:
            predictions_list.append(no_duplicates_df.iloc[0:top_n])
    return predictions_list

# Prediction with CF on Songs

In [141]:
predictions_cf_song = getSongPredictions(song_sm, model_knn_songs, 
                                         train_playlist_ids, complete_playlists, 
                                         500, test_playlist_ids)

# Prediction with CF on Artists

In [142]:
predictions_cf_artist = getSongPredictions(artist_sm, model_knn_artists, 
                                           train_playlist_ids, complete_playlists, 
                                           500, test_playlist_ids)

# Prediction with CF on Album

In [143]:
predictions_cf_album = getSongPredictions(album_sm, model_knn_albums, 
                                          train_playlist_ids, complete_playlists, 
                                          500, test_playlist_ids)

# Evaluate CF on Songs

In [105]:
def clickEvaluation(validation_playlist_df, predictions_df, test_indices):
    clicks = []
    validation_playlist_df.sort_index(inplace = True)
    for i in range(test_indices.shape[0]):
        predictions = predictions_df[i]
        idx = test_indices[i]
        val_playlist = validation_playlist_df.loc[idx]
        positions_matching = np.where(np.isin(predictions["track_name"].values, val_playlist['track_name']))[0]
        # CLICKS: min pos / 10
        if positions_matching.size == 0:
            clicks.append(50)
        else:
            clicks.append(math.ceil(positions_matching[0] / 10))
    return np.mean(clicks)

In [144]:
clickEvaluation(val_df, predictions_cf_song, test_indices)

6.9225

# Evaluate CF on Artists

In [145]:
clickEvaluation(val_df, predictions_cf_artist, test_indices)

6.1725

# Evaluate CF on Albums

In [146]:
clickEvaluation(val_df, predictions_cf_album, test_indices)

5.955