This notebook addresses our third research question:

`Can we predict an optimal list of artists for a group of arbitrary users?`

## Imports and Data

In [1]:
import pickle
import scipy.sparse
import random
import collections

import pandas as pd
import numpy as np

from helper_functions import *
from annoy import AnnoyIndex

In [2]:
# Lookup tables mapping users/artists to their index (and back)
user2id = load_pickle('../data/user2id.pickle')
id2user = load_pickle('../data/id2user.pickle')
artist2id = load_pickle('../data/artist2id.pickle')
id2artist = load_pickle('../data/id2artist.pickle')

# Sparse matrix of plays
matrix_plays = scipy.sparse.load_npz('../data/matrix_plays.npz')

# Plays table, reduced to the first (artist_id, artist_name) row of every
# artist id so that an artist id can be mapped to its name
col_names_consumption = ['user-mboxsha1', 'musicbrainz-artist-id', 'artist-name', 'plays']
df_plays = (
    pd.read_csv('../data/usersha1-artmbid-artname-plays.tsv', sep='\t', names=col_names_consumption)
    .rename({'user-mboxsha1':'user_id', 'musicbrainz-artist-id':'artist_id', 'artist-name': 'artist_name'}, axis=1)
    [['artist_id', 'artist_name']]
    .groupby('artist_id')
    .head(1)
)

# PCA-transformed embeddings of users and artists
user_embedding = load_pickle('../data/user_embeddings/pca/user_embedding_transformed_pca25.pickle')
artist_embedding = load_pickle('../data/artist_embeddings/pca/artist_embedding_transformed_pca25.pickle')

# Cluster labels, converted to np.array because they are indexed and compared element-wise later on
artist_cluster_labels = np.array(load_pickle('../data/clustering/gmm_artist_labels.pickle'))
user_cluster_labels = np.array(load_pickle('../data/clustering/hdbscan_user_labels.pickle'))



In [None]:
# The group of users for which we want to predict the list of artists.
# These are user indices: they are used below to index the user embedding,
# the user cluster labels and the columns of matrix_plays.
users_group_id = range(10)

## Methods

In [3]:
def retrieve_artists_by_using_knn(artist_embedding, artist_ref, k, annoy_index=None):
    '''
    Use KNN to retrieve the artists
    PARAMETERS:
        - artist_embedding: the artist embedding, indexed by artist index
          (only used when annoy_index is None)
        - artist_ref: index of the artist on which we want to compute knn on
        - k: the number of artists to retrieve
        - annoy_index: optional Annoy index already built on artist_embedding.
          When None, a new index is built on every call, which is expensive;
          pass a prebuilt index when calling this function repeatedly.
    RETURN:
        - k_nearest_neighbor: list of the k artist indices closest to artist_ref
          (NOTE: Annoy normally reports the query item itself as its own
          closest neighbour, so artist_ref is expected to be part of the
          result -- confirm against the Annoy version in use)
    '''
    if annoy_index is None:
        annoy_index = get_annoy_index(artist_embedding, range(len(artist_embedding)))
    return annoy_index.get_nns_by_item(artist_ref, k)
    
    
def retrieve_artists_by_picking_in_cluster(artist_cluster_labels, k, artist_ref = None, cluster_ref = None):
    '''
    Retrieve the artists by selecting artists belonging to a cluster
    PARAMETERS:
        - artist_cluster_labels: cluster label of every artist, indexed by artist index
        - k: the number of artists to retrieve
        - artist_ref: if not None -> the reference artist representing the group of users
          (its cluster is used; takes precedence over cluster_ref)
        - cluster_ref: if not None -> the reference cluster representing the group of users
    RETURN:
        - list of artist indices belonging to the reference cluster: the whole
          cluster when it holds at most k artists, otherwise k artists drawn
          at random without replacement
    RAISES:
        - ValueError: if both artist_ref and cluster_ref are None
    '''
    labels = np.asarray(artist_cluster_labels)

    # Compare against None explicitly: 0 is a valid artist index / cluster label
    if artist_ref is not None:
        target_cluster = labels[artist_ref]
    elif cluster_ref is not None:
        target_cluster = cluster_ref
    else:
        raise ValueError("Either artist_ref or cluster_ref must be provided")

    artists_in_same_cluster = list(np.where(labels == target_cluster)[0])
    if k >= len(artists_in_same_cluster):
        return artists_in_same_cluster
    return random.sample(artists_in_same_cluster, k)

In [4]:
def get_annoy_index(X, items_id, extra_item = None, last_item_id = None, nb_trees = 100, metric = "euclidean"):
    '''
    Build the Annoy index in order to compute afterwards the nearest neighbors
    PARAMETERS:
        - X: array of shape (number of samples, number of features)
        - items_id: list of ids referencing each row of X (items_id[i] is the id of X[i])
        - extra_item: extra vector to add to the index if needed
        - last_item_id: id of the extra added item (required when extra_item is given)
        - nb_trees: number of trees built by Annoy (default 100)
        - metric: distance metric passed to AnnoyIndex (default "euclidean")
    RETURN:
        - The built annoy index
    RAISES:
        - ValueError: if extra_item is given without last_item_id
    '''
    if extra_item is not None and last_item_id is None:
        raise ValueError("last_item_id must be provided when extra_item is given")

    index = AnnoyIndex(X.shape[1], metric)
    # NOTE: per the Annoy documentation, memory is allocated for max(id) + 1 items,
    # so small contiguous ids are preferable
    for i in range(X.shape[0]):
        index.add_item(items_id[i], X[i])
    if extra_item is not None:
        index.add_item(last_item_id, extra_item)
    index.build(nb_trees)
    return index

### Retrieve artists from single cluster

#### Mean user vector

In [29]:
def compute_by_mean(user_embedding, artist_embedding, artist_cluster_labels, users_group_id, matrix_plays, k):
    '''
    Mean user vector method.
    - Compute the mean user vector
    - Select the nearest neighbor of the user mean among the group -> user_ref
    - Select the artist with the max number of plays user_ref listened to
    - Compute knn OR randomly retrieve artists that belong to the cluster

    PARAMETERS:
        - user_embedding: the user embedding
        - artist_embedding: the artist embedding
        - artist_cluster_labels: the cluster labels of all the data samples appearing in the artist embedding
        - users_group_id: indices (ids) representing our reference group of users
        - matrix_plays: matrix of shape (artists, users) representing the number of plays.
        - k: the number of artists to retrieve
    RETURN:
        - artists_knn: artists selected by using the KNN method
        - artists_cluster: artists selected by randomly picking artists inside the cluster
    RAISES:
        - ValueError: if users_group_id is empty
    '''
    group_ids = np.asarray(users_group_id)
    if group_ids.size == 0:
        raise ValueError("users_group_id must contain at least one user")

    users_group_embedding = user_embedding[group_ids]
    mean_user = users_group_embedding.mean(axis=0)

    # Index the group members by their position inside the group and query the
    # mean vector directly. This avoids inserting the mean under a sentinel id,
    # which would clash with a real user index equal to the sentinel.
    annoy_index = get_annoy_index(users_group_embedding, range(len(group_ids)))
    nearest_position = annoy_index.get_nns_by_vector(mean_user, 1)[0]
    nearest_neighbor = int(group_ids[nearest_position])

    # Most played artist of the user closest to the group mean
    artist_idx = matrix_plays[:, nearest_neighbor].argmax()

    artists_knn = retrieve_artists_by_using_knn(artist_embedding, artist_idx, k)
    artists_cluster = retrieve_artists_by_picking_in_cluster(artist_cluster_labels, k, artist_ref=artist_idx)
    return artists_knn, artists_cluster

In [6]:
# Mean user vector method on the whole group: KNN variant and cluster variant
nb_selected_artists = 10
artists_knn, artists_cluster = compute_by_mean(
    user_embedding,
    artist_embedding,
    artist_cluster_labels,
    users_group_id,
    matrix_plays,
    nb_selected_artists,
)

In [7]:
# Names of the artists picked inside the cluster
df_plays.loc[df_plays['artist_id'].isin([id2artist[artist_idx] for artist_idx in artists_cluster])]

Unnamed: 0,artist_id,artist_name
2200,d7e28bb6-366a-46d8-92a9-ae574375c75d,スキマスイッチ
16225,80a0ab61-c33d-4a59-a950-aaf3b69d3251,mark de clive-lowe
27295,ce7b75bb-3744-4af6-aa4a-65d9b5c92913,saez
38283,9f35bff0-a843-4303-a416-2c5303e4b080,debout sur le zinc
38670,38f9065e-38e5-410c-a7e7-734dad015f6e,ritam nereda
51290,f48c1319-315f-4d92-9596-29151c2cdf38,zero one
179325,f6258f8f-aaf8-4846-bf6b-f6e5e1ac5356,sbk
222463,a7c3f0e1-027a-4f1f-800a-c115422f96f3,pansy division
495581,295ba06a-d49d-4867-b15c-66ccef993f48,alex bugnon
514881,7c322e9e-9dc6-4e71-afd7-84f870510d25,electronic eric


In [8]:
# Names of the artists retrieved with KNN
df_plays.loc[df_plays['artist_id'].isin([id2artist[artist_idx] for artist_idx in artists_knn])]

Unnamed: 0,artist_id,artist_name
2182,b128a994-2400-432d-b26a-8feede87daa8,do as infinity
8379,16b3f3fc-6c76-4fec-8f0b-9fa9a4ec8e91,ce ce peniston
10205,ec2bcb77-b9a1-49e2-bfe7-419586bbef48,big bang
36346,112c9045-67fa-4a5e-a183-1af05ead65fc,seamo
67902,aea213c9-5ba7-4160-a80e-84d9eed4a833,楊丞琳
67910,6e1b6de7-d392-461d-8e07-33fa21ac1588,ss501
84415,ff0ecbbe-34b0-49c7-a36d-81a47700ac13,孫燕姿
105068,ec93e078-ef9c-4f0b-aab0-6eec009443dc,シド
105993,26116544-8da6-4680-bea8-2d47d09ea781,school food punishment
635017,3a71842d-b291-4991-a30d-9d8328a26d3e,陳綺貞


#### Majority vote

In [9]:
def compute_by_majority_vote(users_group_id, matrix_plays, artist_cluster_labels):
    '''
    Majority vote method.
    - Look at the artists played by the users of the group: every
      (artist, user) pair with a non-zero number of plays is one vote for the
      cluster of that artist.
    - Return the artist cluster that collected the most votes.

    PARAMETERS:
        - users_group_id: indices (ids) representing our reference group of users
        - matrix_plays: matrix of shape (artists, users) representing the number of plays.
        - artist_cluster_labels: the cluster label of every artist, indexed by artist index
    RETURN:
        - the label of the artist cluster with the most votes. Ties are broken
          in favour of the smallest label; if the group played no artist at
          all, the smallest existing label is returned.
    RAISES:
        - ValueError: if artist_cluster_labels is empty
    '''
    labels = np.asarray(artist_cluster_labels)
    if labels.size == 0:
        raise ValueError("artist_cluster_labels must not be empty")

    # Row index of every non-zero (artist, user) entry restricted to the group
    group_columns = np.asarray(users_group_id, dtype=int)
    played_artists = matrix_plays[:, group_columns].nonzero()[0]
    # Artists without a cluster label cannot vote
    played_artists = played_artists[played_artists < labels.shape[0]]

    if played_artists.size == 0:
        return np.unique(labels)[0]

    # np.unique returns the labels sorted, so argmax picks the smallest label on ties
    voted_labels, votes = np.unique(labels[played_artists], return_counts=True)
    return voted_labels[np.argmax(votes)]

In [10]:
# Majority vote: find the dominant artist cluster of the group, then pick artists in it
nb_selected_artists = 10
cluster_value = compute_by_majority_vote(
    users_group_id,
    matrix_plays,
    artist_cluster_labels,
)

artists_cluster_maj = retrieve_artists_by_picking_in_cluster(
    artist_cluster_labels,
    nb_selected_artists,
    cluster_ref=cluster_value,
)

In [11]:
# Names of the artists picked inside the majority-vote cluster
df_plays.loc[df_plays['artist_id'].isin([id2artist[artist_idx] for artist_idx in artists_cluster_maj])]

Unnamed: 0,artist_id,artist_name
1570,3b8018b4-09e8-477c-b223-4097bd0fcc39,rubik
10497,e1283576-51be-498c-ae30-b0c1506e4bd8,los piratas
21050,3c829973-0eab-4387-b8d1-678fb476a2ec,dark day
54177,b36d22e9-7f2c-42ff-98a6-b0e3f2cb36f5,ciccone youth
63848,5b5dacaf-c490-41f8-9142-129fbdfaddc1,the stance brothers
146990,8788a0c8-649b-479b-888d-0efdbe01723f,plushgun
279846,edc682ae-a731-430a-9d3b-0145794d76a9,devendra banhart and jana hunter
366984,fba33875-f103-4f98-9e4a-dc6cfe09652b,hjaltalín
1084929,13b14a2e-470c-434c-8f89-c8763f849e2c,"tipton, glenn"
1265719,0856875c-4ee7-41db-8c10-248c3124148d,daniel wesley


### Retrieve artists from multiple clusters

#### Mean user vector per cluster

- Identify the user clusters that the users of the group belong to
- For each user cluster: 
    - compute the mean user vector
    - select the nearest neighbor of the user mean
    - select the artist with the max number of plays for that user
    - compute knn OR randomly retrieve artists that belong to the cluster 
    

In [12]:
# Total number of artists to recommend to the group
nb_selected_artists = 10

# User cluster label of every user of the group, and how many distinct clusters they span
cluster_labels = user_cluster_labels[users_group_id]
nb_clusters = len(set(cluster_labels))

In [32]:
# NOTE(review): leftover scratch variable; in the visible notebook it is only
# used by the list-concatenation check in the following cell
haha = [1, 2, 3, 4]

In [33]:
# NOTE(review): scratch check that `+` concatenates two lists into a new list
haha + [5, 5, 6, 7]

[1, 2, 3, 4, 5, 5, 6, 7]

In [34]:
selected_artists_knn = []
selected_artists_clusters = []
# Number of artists to select for each cluster (at least one per cluster)
nb_artists_per_cluster = max(1, nb_selected_artists // nb_clusters)

# cluster_labels is aligned with users_group_id: a position found in
# cluster_labels is a position inside the group, not a user id, so it has to be
# translated back to the real user ids before calling compute_by_mean
group_user_ids = np.asarray(users_group_id)

for label in set(cluster_labels):
    cluster_user_ids = group_user_ids[cluster_labels == label]
    artists_knn, artists_cluster = compute_by_mean(user_embedding, artist_embedding, artist_cluster_labels, cluster_user_ids, matrix_plays, nb_artists_per_cluster)
    selected_artists_knn.extend(artists_knn)
    selected_artists_clusters.extend(artists_cluster)

In [36]:
# Names of the artists retrieved with KNN over all user clusters
df_plays.loc[df_plays['artist_id'].isin([id2artist[artist_idx] for artist_idx in selected_artists_knn])]

Unnamed: 0,artist_id,artist_name
629,4e024037-14b7-4aea-99ad-c6ace63b9620,madvillain
2208,b56d532a-16d7-4e4c-aab4-486941ffc959,柴田淳
5750,917bc621-ad02-477d-9308-a0304c5f9727,noah and the whale
202086,b67babbc-b126-4e6f-996d-d1b6162e16b1,allister brimble
542247,eb0d5aaf-bb24-4c14-a18f-02deade92a86,dreaminfusion
1455031,4ea14f44-0dc4-441b-b9bf-c92141db4c26,eevil stöö
2709706,22d20d75-eaa9-45f9-b9d6-cc1b6b835a95,radius system


In [37]:
# Names of the artists picked inside the artist clusters over all user clusters
df_plays.loc[df_plays['artist_id'].isin([id2artist[artist_idx] for artist_idx in selected_artists_clusters])]

Unnamed: 0,artist_id,artist_name
34579,275d1fca-22e8-46b9-85e6-c3523098a599,paula abdul
157831,6082f427-055a-4b27-819f-06f17fd1bda9,passport
211925,8ed72443-bbad-4f97-95bf-9c57de31a94b,cheap wine
221324,1304e901-3593-4459-ad2f-34cdaa28729f,something happens
289561,386b69b4-e188-4f4f-8b2d-2a375fc6608d,kool shen
457703,c67f0ee7-7cd9-48f6-9eef-200b8e52aa2b,c.a. quintet
2193980,a51d4b6c-fe34-43a8-bf1c-cd6ddf2e7682,lerosa
