This notebook addresses our third research question:

`Can we predict an optimal list of artists for a group of arbitrary users?`

## Imports and Data

In [1]:
import pickle
import scipy.sparse
import random
import collections

import pandas as pd
import numpy as np

from helper_functions import *
from annoy import AnnoyIndex

In [23]:
# Lookup dictionaries translating users/artists to their index and back
user2id = load_pickle('../data/user2id.pickle')
id2user = load_pickle('../data/id2user.pickle')
artist2id = load_pickle('../data/artist2id.pickle')
id2artist = load_pickle('../data/id2artist.pickle')

# Sparse matrix holding the number of plays
matrix_plays = scipy.sparse.load_npz('../data/matrix_plays.npz')

# PCA-transformed embeddings of users and artists
user_embedding = load_pickle('../data/user_embeddings/pca/user_embedding_transformed_pca25.pickle')
artist_embedding = load_pickle('../data/artist_embeddings/pca/artist_embedding_transformed_pca25.pickle')

# Cluster labels (both must be np.array so that they support boolean masks).
# Artist labels are simply 0..39999, i.e. each artist index is its own cluster label.
artist_cluster_labels = np.arange(40000)
user_cluster_labels = np.array(load_pickle('../data/clustering/hdbscan_user_labels.pickle'))

# Ids of the users forming the group for which a list of artists is predicted
users_group_id = range(10)



## Methods

In [16]:
def retrieve_artists_by_using_knn(artist_embedding, artist_ref, k):
    '''
    Use KNN to retrieve the artists closest to a reference artist.
    PARAMETERS:
        - artist_embedding: the artist embedding
        - artist_ref: index of the artist whose neighbours are wanted
        - k: the number of artists to retrieve
    RETURN:
        - the list of k artist indices that Annoy reports as nearest to artist_ref
    '''
    # Each artist is indexed under its row position in the embedding.
    # Note that a fresh Annoy index is built on every call.
    all_artist_ids = range(len(artist_embedding))
    index = get_annoy_index(artist_embedding, all_artist_ids)
    return index.get_nns_by_item(artist_ref, k)
    
    
def retrieve_artists_by_picking_in_cluster(artist_cluster_labels, k, artist_ref = None, cluster_ref = None):
    '''
    Retrieve artists by picking them inside a single artist cluster.
    PARAMETERS:
        - artist_cluster_labels: np.array giving the cluster label of every artist index
        - k: the number of artists to retrieve
        - artist_ref: if not None -> the reference artist representing the group of users;
                      the cluster used is the one this artist belongs to
        - cluster_ref: if not None -> the reference cluster representing the group of users
                       (only used when artist_ref is None)
    RETURN:
        - every artist index of the cluster (np.array) when the cluster holds k artists or fewer,
          otherwise a list of k artist indices sampled at random without replacement
    RAISES:
        - ValueError if neither artist_ref nor cluster_ref is given
    '''
    if artist_ref is not None:
        target_cluster = artist_cluster_labels[artist_ref]
    elif cluster_ref is not None:
        # Use the parameter itself, not a global that happens to exist in the notebook.
        target_cluster = cluster_ref
    else:
        raise ValueError("Either artist_ref or cluster_ref must be provided")

    artists_in_same_cluster = np.where(artist_cluster_labels == target_cluster)[0]

    # Not enough artists to sample from: return the whole cluster.
    if k >= len(artists_in_same_cluster):
        return artists_in_same_cluster
    return random.sample(list(artists_in_same_cluster), k)

In [4]:
def get_annoy_index(X, items_id, extra_item = None, last_item_id = None, nb_trees = 100, metric = "euclidean"):
    '''
    Build the Annoy index used afterwards to compute nearest neighbors.
    PARAMETERS:
        - X: array of shape (number of samples, number of features)
        - items_id: sequence of ids, items_id[i] being the id under which X[i] is stored
        - extra_item: optional extra vector to add to the index
        - last_item_id: id under which extra_item is stored (required when extra_item is given)
        - nb_trees: number of trees built by Annoy (default 100)
        - metric: Annoy distance metric (default "euclidean")
    RETURN:
        - The built annoy index
    RAISES:
        - ValueError if extra_item is given without last_item_id
    '''
    # Fail early with a clear message instead of handing None to Annoy as an item id.
    if extra_item is not None and last_item_id is None:
        raise ValueError("last_item_id is required when extra_item is given")

    index = AnnoyIndex(X.shape[1], metric)
    for i in range(X.shape[0]):
        index.add_item(items_id[i], X[i])
    if extra_item is not None:
        index.add_item(last_item_id, extra_item)
    index.build(nb_trees)
    return index

### Retrieve artists from single cluster

#### Mean user vector

In [13]:
def compute_by_mean(user_embedding, artist_embedding, artist_cluster_labels, users_group_id, matrix_plays, k):
    '''
    Mean user vector method.
    - Compute the mean user vector of the group
    - Select the group member closest to that mean -> reference user
    - Select the artist with the max number of plays for the reference user
    - Compute knn AND randomly retrieve artists that belong to that artist's cluster

    PARAMETERS:
        - user_embedding: the user embedding
        - artist_embedding: the artist embedding
        - artist_cluster_labels: the cluster label of every artist appearing in the embedding
        - users_group_id: index (id) representing our reference group of users
        - matrix_plays: matrix of shape (artists, users) representing the number of plays.
        - k: the number of artists to retrieve
    RETURN:
        - artists_knn: artists selected by using the KNN method
        - artists_cluster: artists selected by randomly picking artists inside the cluster
    RAISES:
        - ValueError if users_group_id is empty
    '''
    if len(users_group_id) == 0:
        raise ValueError("users_group_id must contain at least one user")

    users_group_embedding = user_embedding[users_group_id]
    mean_user = users_group_embedding.mean(axis=0)

    # Index the group members by their position in the group and query with the mean
    # vector directly. This avoids storing the mean under a made-up item id (which could
    # collide with a real user id) and avoids assuming the mean is returned first.
    group_positions = range(users_group_embedding.shape[0])
    annoy_index = get_annoy_index(users_group_embedding, group_positions)
    nearest_position = annoy_index.get_nns_by_vector(mean_user, 1)[0]
    nearest_neighbor = users_group_id[nearest_position]

    # Row index of the largest play count in the reference user's column.
    artist_idx = matrix_plays[:, nearest_neighbor].argmax()

    artists_knn = retrieve_artists_by_using_knn(artist_embedding, artist_idx, k)
    artists_cluster = retrieve_artists_by_picking_in_cluster(artist_cluster_labels, k, artist_ref=artist_idx)
    return artists_knn, artists_cluster

In [14]:
# Number of artists requested for the whole group
nb_selected_artists = 10

# Mean-user method: one list obtained by KNN, one by sampling inside the artist cluster
artists_knn, artists_cluster = compute_by_mean(
    user_embedding,
    artist_embedding,
    artist_cluster_labels,
    users_group_id,
    matrix_plays,
    nb_selected_artists,
)


#### Majority vote

In [8]:
def compute_by_majority_vote(users_group_id, matrix_plays, artist_cluster_labels):
    '''
    Majority vote method.
    - Look at the set of artists played by the users of the group
    - Each (artist, user) pair with a non-zero play count is one vote for the artist's cluster
    - Select the artist cluster that received the most votes

    PARAMETERS:
        - users_group_id: index (id) representing our reference group of users
        - matrix_plays: matrix of shape (artists, users) representing the number of plays.
        - artist_cluster_labels: np.array giving the cluster label of every artist index
    RETURN:
        - the label of the artist cluster with the highest number of votes
    '''
    # For every artist, the number of group members having played it at least once.
    artist2occurences = collections.Counter(matrix_plays[:, users_group_id].nonzero()[0])

    # Every known cluster starts at 0 so that clusters nobody played remain candidates.
    cluster2occurences = dict.fromkeys(set(artist_cluster_labels), 0)

    # Single pass over the played artists instead of scanning all labels once per cluster.
    for artist_idx, nb_listeners in artist2occurences.items():
        cluster2occurences[artist_cluster_labels[artist_idx]] += nb_listeners

    return max(cluster2occurences, key=cluster2occurences.get)

In [17]:
# Number of artists requested for the whole group
nb_selected_artists = 10

# Artist cluster that received the most votes from the group
cluster_value = compute_by_majority_vote(users_group_id, matrix_plays, artist_cluster_labels)

# Pick the artists inside that cluster
artists_cluster = retrieve_artists_by_picking_in_cluster(
    artist_cluster_labels,
    nb_selected_artists,
    cluster_ref=cluster_value,
)

### Retrieve artists from multiple clusters

#### Mean user vector per cluster

- Select the clusters of users these users belong to
- For each user cluster: 
    - compute the mean user vector
    - select the nearest neighbor of the user mean
    - select the artist with the max number of plays
    - compute KNN OR randomly retrieve artists that belong to the cluster
    

In [25]:
# Total number of artists requested for the group
nb_selected_artists = 10

# User-cluster label of each member of the group, in group order
cluster_labels = user_cluster_labels[users_group_id]

# Number of distinct user clusters represented in the group
nb_clusters = len(set(cluster_labels))

In [26]:
selected_artists_knn = []
selected_artists_clusters = []

# Number of artists to select for each cluster: an even share, but at least one
nb_artists_per_cluster = max(1, nb_selected_artists // nb_clusters)

group_user_ids = np.asarray(users_group_id)
for label in set(cluster_labels):
    # Restrict the computation to the members of the group belonging to this user cluster
    # (plain Python ints so they can be used both as array indices and as item ids).
    cluster_user_ids = group_user_ids[cluster_labels == label].tolist()
    artists_knn, artists_cluster = compute_by_mean(
        user_embedding,
        artist_embedding,
        artist_cluster_labels,
        cluster_user_ids,
        matrix_plays,
        nb_artists_per_cluster,
    )
    selected_artists_knn.append(artists_knn)
    selected_artists_clusters.append(artists_cluster)