In [1]:
## IMPORTS ##
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.neighbors import NearestNeighbors
from collections import Counter

In [2]:
## DATA PROCESSING ##

# load the per-user artist play counts, dropping the two 'Unnamed' columns
# (presumably stale index columns left over from earlier CSV saves - TODO confirm)
user_df = pd.read_csv('filtered_user_df.csv').drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])

# total plays per user across all of their artists
user_totalplay = (
    user_df.groupby('user')['play_count']
    .sum()
    .rename('total_play_count')
    .reset_index()
)

# keep only users with at least 125k plays in total
heavy_listener_mask = user_totalplay['total_play_count'] >= 125000
filtered_users = user_totalplay.loc[heavy_listener_mask, 'user']
filtered_df = user_df.loc[user_df['user'].isin(filtered_users)]

# display data
filtered_df.head(5)

Unnamed: 0,user,artist_name,play_count,artist_url
1259,ext_beck,Linkin Park,51318,https://www.last.fm/music/Linkin+Park
1260,ext_beck,Coldplay,21753,https://www.last.fm/music/Coldplay
1261,ext_beck,Moby,17404,https://www.last.fm/music/Moby
1262,ext_beck,Avril Lavigne,11340,https://www.last.fm/music/Avril+Lavigne
1263,ext_beck,Muse,8118,https://www.last.fm/music/Muse


In [3]:
## FILTERED SPARSE MATRIX ##
# rebuilt here because it's unclear whether the matrix Josh made
# was filtered by total play count

# every distinct user gets a row position and every distinct artist a column position
user_ids = filtered_df['user'].unique()
artist_names = filtered_df['artist_name'].unique()

# label -> matrix position
user_to_index = dict(zip(user_ids, range(len(user_ids))))
artist_to_index = dict(zip(artist_names, range(len(artist_names))))

# matrix position -> label (used to turn results back into readable names)
index_to_user = dict(enumerate(user_ids))
index_to_artist = dict(enumerate(artist_names))

# turn each DataFrame row into a (row index, column index, value) triple
rows = filtered_df['user'].map(user_to_index)
cols = filtered_df['artist_name'].map(artist_to_index)
values = filtered_df['play_count']

# users x artists matrix of play counts, stored sparsely so it can actually be kept on disk
matrix_shape = (len(user_ids), len(artist_names))
sparse_matrix = sparse.coo_matrix((values, (rows, cols)), shape=matrix_shape)

# save the sparse matrix into a file
sparse.save_npz('baseline-user-data.npz', sparse_matrix)

In [4]:
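## SPARSE MATRIX PROVIDED BY JOSH ##
# NOTE: everything in this cell is commented out, so nothing here runs.
# The file name below uses underscores ('baseline_user_data.npz'), unlike the
# hyphenated 'baseline-user-data.npz' saved by the FILTERED SPARSE MATRIX cell.

# load baseline user data
# baseline_data = np.load('baseline_user_data.npz')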

# check available data
# print(baseline_data.files)

# access data
# user_row = baseline_data['row']
# print(f'row:\n{user_row}\n')

# user_col = baseline_data['col']
# print(f'col:\n{user_col}\n')

# user_format = baseline_data['format']
# print(f'format:\n{user_format}\n')

# user_shape = baseline_data['shape']
# print(f'shape:\n{user_shape}\n')

# user_data = baseline_data['data']
# print(f'data:\n{user_data}\n')

In [5]:
## KNN (USER-BASED FILTERING) ##

# cosine is our similarity metric: it suits high-dimensional sparse matrices
# because it compares the direction of the vectors rather than their length

k = 20 # a decently large number of neighbors - not sure what's good

# build the model and fit it on the sparse matrix in one step
model = NearestNeighbors(n_neighbors=k, metric='cosine', n_jobs=-1).fit(sparse_matrix)

# nearest neighbors for every user; because the query is the same matrix the model
# was fitted on, each user's own row normally shows up among its k neighbors
distances, indices = model.kneighbors(sparse_matrix, return_distance=True)

In [6]:
## EXAMPLE ##

# recommending artists for user 0
user_id = 0
top_n = 5  # how many artists to recommend
similar_users = indices[user_id]
similar_distances = distances[user_id]  # not used below yet

# artists the user has already interacted with - these are never recommended
current_items = set(sparse_matrix.getrow(user_id).nonzero()[1])

# counter for artists: how many of the similar users have interacted with each
# artist that the user doesn't know yet
artist_counter = Counter()
for neighbor in similar_users:
    # the user's own row can't add anything (all of its artists are in current_items),
    # so skip it; it is matched by id, not by its position in the neighbor list
    if neighbor == user_id:
        continue
    # get all artists this neighbor has interacted with
    neighbor_artists = sparse_matrix.getrow(neighbor).nonzero()[1]
    # update counter, ignoring artists the user already knows
    artist_counter.update(
        artist_index for artist_index in neighbor_artists
        if artist_index not in current_items
    )

# NOTE(review): getrow() is called on the matrix as built (COO format); if this is
# ever run for many users, consider converting to CSR once first - TODO confirm benefit

# recommending the top_n most common artists
most_common_artists = artist_counter.most_common(top_n)
recommended_items = [artist_index for artist_index, _ in most_common_artists]

# convert items back to artist names for readability
recommended_artist_names = [index_to_artist[index] for index in recommended_items]

print(f'Top {top_n} Recommended artists for User {user_id}:\n{recommended_artist_names}')

Top 5 Recommended artists for User 0:
['Slipknot', 'Red Hot Chili Peppers', 'Bring Me the Horizon', 'System of a Down', 'Limp Bizkit']


In [7]:
## EVALUATION ##
# TODO: not implemented yet - still need to decide how to evaluate the recommendations

In [8]:
## HYPERPARAMETER TUNING ##
# TODO: not implemented yet - still need to decide how to tune the hyperparameters (e.g. k)