In [4]:
## IMPORTS ##
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.neighbors import NearestNeighbors

In [2]:
## DATA PROCESSING ##

# minimum total play count (summed over all of a user's artists) required
# for a user to be kept; users below this are filtered out
MIN_TOTAL_PLAY_COUNT = 125000

# read in user data and drop the unnecessary columns
# (presumably leftover index columns from earlier CSV exports -- TODO confirm).
# drop() returns a new frame here, so nothing is mutated in place.
user_df = pd.read_csv('filtered_user_df.csv').drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])

# total play count per user, as a two-column frame: user, total_play_count
user_totalplay = (
    user_df
    .groupby('user')['play_count']
    .sum()
    .reset_index(name='total_play_count')
)

# users at or above the threshold, and every row of user_df that belongs to them
filtered_users = user_totalplay.loc[
    user_totalplay['total_play_count'] >= MIN_TOTAL_PLAY_COUNT, 'user'
]
filtered_df = user_df[user_df['user'].isin(filtered_users)]

# display data
filtered_df.head(5)

Unnamed: 0,user,artist_name,play_count,artist_url
1259,ext_beck,Linkin Park,51318,https://www.last.fm/music/Linkin+Park
1260,ext_beck,Coldplay,21753,https://www.last.fm/music/Coldplay
1261,ext_beck,Moby,17404,https://www.last.fm/music/Moby
1262,ext_beck,Avril Lavigne,11340,https://www.last.fm/music/Avril+Lavigne
1263,ext_beck,Muse,8118,https://www.last.fm/music/Muse


In [7]:
## MAKING MY OWN FILTERED SPARSE MATRIX ##
# B/C IDK IF THE ONE JOSH MADE FILTERED OUT BY TOTAL PLAY COUNT

# distinct users and artists, in order of first appearance in filtered_df;
# an entry's position in these arrays is its row / column index in the matrix
user_ids = filtered_df['user'].unique()
artist_names = filtered_df['artist_name'].unique()

# index -> label lookups (used to turn matrix indices back into readable names)
index_to_user = dict(enumerate(user_ids))
index_to_artist = dict(enumerate(artist_names))

# label -> index lookups, the inverse of the two dictionaries above
user_to_index = {user: i for i, user in index_to_user.items()}
artist_to_index = {artist: j for j, artist in index_to_artist.items()}

# one (row index, column index, value) triplet per DataFrame row
rows = filtered_df['user'].map(user_to_index)
cols = filtered_df['artist_name'].map(artist_to_index)
values = filtered_df['play_count']

# assemble the users x artists play-count matrix in COO (triplet) format;
# a sparse representation keeps it small enough to actually store on disk
matrix_shape = (len(user_ids), len(artist_names))
sparse_matrix = sparse.coo_matrix((values, (rows, cols)), shape=matrix_shape)

# save the sparse matrix into a file
sparse.save_npz('baseline-user-data.npz', sparse_matrix)

In [None]:
## THIS IS THE SPARSE MATRIX JOSH PROVIDED ##
# Everything in this cell is intentionally commented out: it is a reference
# snippet for inspecting the arrays stored inside the provided .npz archive
# and is not needed by the rest of the notebook.
# NOTE: the file name below uses underscores ('baseline_user_data.npz'),
# while the matrix built in this notebook is saved with hyphens
# ('baseline-user-data.npz').

# load baseline user data
# baseline_data = np.load('baseline_user_data.npz')

# check available data
# print(baseline_data.files)

# access data
# user_row = baseline_data['row']
# print(f'row:\n{user_row}\n')

# user_col = baseline_data['col']
# print(f'col:\n{user_col}\n')

# user_format = baseline_data['format']
# print(f'format:\n{user_format}\n')

# user_shape = baseline_data['shape']
# print(f'shape:\n{user_shape}\n')

# user_data = baseline_data['data']
# print(f'data:\n{user_data}\n')

In [8]:
## KNN (USER-BASED FILTERING) ##

# cosine is used as the similarity metric: it compares the direction of two
# play-count vectors rather than their length, which suits high-dimensional
# sparse matrices
k = 2

# build the model and fit it on the sparse user x artist matrix in one step
# (fit() returns the fitted estimator itself)
model = NearestNeighbors(n_neighbors=k, metric='cosine', n_jobs=-1).fit(sparse_matrix)

# compute the k nearest neighbors (and the distances to them) for each user.
# NOTE: the users queried here are the same ones the model was fitted on, so
# each user's own row is among the candidates (normally at distance 0); with
# k = 2 that typically leaves a single *other* user per row.
distances, indices = model.kneighbors(X=sparse_matrix, n_neighbors=k, return_distance=True)

In [11]:
## EXAMPLE ##
# THE RECOMMENDATIONS ARE NOT IN ANY PARTICULAR ORDER 
# WE NEED TO DECIDE HOW MANY ARTISTS WE ARE RECOMMENDING (5?)
# IF SO WE NEED TO KEEP TRACK OF SIMILARITY SCORES

# recommending artists for user 0
user_id = 0
similar_users = indices[user_id]
similar_distances = distances[user_id]  # not used yet; kept for future ranking by similarity

# rows are read repeatedly below, so convert to CSR once instead of calling
# getrow() on the COO matrix each time (sparse_matrix itself is left unchanged)
user_item_csr = sparse_matrix.tocsr()

# artists the target user has already interacted with
current_items = set(user_item_csr.getrow(user_id).nonzero()[1])

# find items recommended by similar users but not already interacted with by user 0.
# The user is excluded by id rather than by position: the first neighbor is
# only the user itself when there are no distance ties (e.g. another user with
# an identical listening profile), so skipping index 0 could drop a real
# neighbor and keep the user's own row instead.
recommended_items = set()
for neighbor in similar_users:
    if neighbor == user_id:
        continue
    recommended_items.update(user_item_csr.getrow(neighbor).nonzero()[1])

recommended_items -= current_items

print(f'Recommended items for user {user_id}:\n{recommended_items}\n')

# convert items back to artist names for readability
recommended_artist_names = [index_to_artist[index] for index in recommended_items]

print(f'\nRecommended artists for user {user_id}:\n{recommended_artist_names}')

Recommended items for user 0:
{386, 776, 779, 1547, 1548, 410, 1562, 931, 552, 940, 1455, 559, 438, 1473, 1992, 201, 85, 9305, 9306, 9307, 9308, 350, 95, 357, 491, 364, 879, 113, 755, 116, 758, 122, 765}


Recommended artists for user 0:
['The Neighbourhood', 'My Chemical Romance', 'Panic! at the Disco', 'Good Charlotte', 'Avicii', 'Nickelback', 'OneRepublic', 'The Kooks', 'Kings of Leon', 'The All-American Rejects', 'Café Tacvba', 'Foster the People', 'The Strokes', 'Zoé', 'Hoobastank', 'Ellie Goulding', 'The xx', 'Jamie xx', 'Saybia', 'Nevada Tan', 'Café Tacuba', 'Oasis', 'Radiohead', 'Jimmy Eat World', 'Daft Punk', 'Red Hot Chili Peppers', 'Bring Me the Horizon', 'Arctic Monkeys', 'alt-J', 'Keane', 'M83', 'Lana Del Rey', 'Fall Out Boy']


In [None]:
## EVALUATION ##

In [None]:
## HYPERPARAMETER TUNING ##
# TODO: not sure how to go about this yet -- which metric should be optimized?