In [46]:
import pickle
import glob
import os

import pandas as pd
import numpy as np

from helper_functions import *
from annoy import AnnoyIndex
from scipy.spatial import distance

# Compute tuples of artists and users

## Compute random user jumper and user jumper for the artist embedding

In [115]:
# Number of pairs drawn for each jumper set built in the cells below.
NB_SAMPLE = 3000

# Seed NumPy's global RNG so the sampled pairs (and therefore every metric
# computed from the pickled tuples) are reproducible across kernel restarts.
SEED = 42
np.random.seed(SEED)

In [116]:
# Random user jumper: draw NB_SAMPLE pairs of artist indices (rows of S)
# uniformly at random, without ever reusing an index.
artist_ids = np.arange(S.shape[0])
selected_artists = np.random.choice(artist_ids, size=(NB_SAMPLE, 2), replace=False)

# Each sampled row becomes one (first_artist, second_artist) tuple.
artists_tuples = [(pair[0], pair[1]) for pair in selected_artists]
save_to_pickle(artists_tuples, 'random_user_jumper_tuples', '../data/artist_embeddings/')

In [117]:
# User jumper: sample candidate users (columns of S) and, for every user that has
# at least two non-zero artist entries, draw a pair of those artists.
# Up to 2 * NB_SAMPLE users are drawn, presumably because duplicates are dropped
# by the set() and users with fewer than two artists yield no pair.

# np.random.randint excludes its upper bound, so the bound must be S.shape[1]
# (not S.shape[1] - 1) for the last user to be selectable.
selected_users = list(set(np.random.randint(S.shape[1], size=int(2*NB_SAMPLE))))

S_filter = S[:, selected_users]

# Create and store artists tuples
artists_tuples = []
for i in range(S_filter.shape[1]):
    # Row indices (artists) holding a non-zero entry for the i-th selected user.
    idx = S_filter[:, i].nonzero()[0]
    if len(idx) >= 2:
        artist_pair = np.random.choice(idx, 2, replace=False)
        artists_tuples.append((artist_pair[0], artist_pair[1]))
    if len(artists_tuples) >= NB_SAMPLE:
        break

save_to_pickle(artists_tuples, 'user_jumper_tuples', '../data/artist_embeddings/')

## Compute random artist jumper and artist jumper for the user embedding

In [118]:
# Random artist jumper: draw NB_SAMPLE pairs of user indices (rows of S2 = S.T)
# uniformly at random, without ever reusing an index.
S2 = S.T
user_ids = np.arange(S2.shape[0])
selected_users = np.random.choice(user_ids, size=(NB_SAMPLE, 2), replace=False)

# Each sampled row becomes one (first_user, second_user) tuple.
users_tuples = [(pair[0], pair[1]) for pair in selected_users]
save_to_pickle(users_tuples, 'random_artist_jumper_tuples', '../data/user_embeddings/')

In [119]:
# Artist jumper: sample candidate artists (columns of S2) and, for every artist that
# has at least two non-zero user entries, draw a pair of those users.
# Up to 2 * NB_SAMPLE artists are drawn, presumably because duplicates are dropped
# by the set() and artists with fewer than two users yield no pair.

# np.random.randint excludes its upper bound, so the bound must be S2.shape[1]
# (not S2.shape[1] - 1) for the last artist to be selectable.
selected_artists = list(set(np.random.randint(S2.shape[1], size=int(2*NB_SAMPLE))))

S2_filter = S2[:, selected_artists]

# Create and store users tuples
users_tuples = []
for i in range(S2_filter.shape[1]):
    # Row indices (users) holding a non-zero entry for the i-th selected artist.
    idx = S2_filter[:, i].nonzero()[0]
    if len(idx) >= 2:
        selected_users = np.random.choice(idx, 2, replace=False)
        users_tuples.append((selected_users[0], selected_users[1]))
    if len(users_tuples) >= NB_SAMPLE:
        break

save_to_pickle(users_tuples, 'artist_jumper_tuples', '../data/user_embeddings/')

# Compute embedding metrics

In [58]:
def get_dataframe_in_embedding_space(model_path):
    '''
    Load a pickled embedding and expose it as a Pandas DataFrame whose column
    labels are prefixed with 'd_' (d_0, d_1, ... for an unlabeled matrix).
    PARAMETERS:
        - model_path: Path where the embedding is stored
    RETURN:
        - df: Pandas DataFrame holding the embedding
    '''
    embedding = load_pickle(model_path)
    return pd.DataFrame(embedding).add_prefix('d_')

def get_annoy_index(df, nb_trees=100):
    '''
    Build the Annoy index used afterwards to compute the nearest neighbors.
    PARAMETERS:
        - df: DataFrame representing the embedding space (one row per item,
              one column per dimension)
        - nb_trees: Number of trees passed to AnnoyIndex.build (default: 100)
    RETURN:
        - The built annoy index (euclidean metric)
    '''
    index = AnnoyIndex(df.shape[1], "euclidean")  # Length of item vector that will be indexed
    # Items are registered under their DataFrame index label. The rest of the
    # notebook addresses rows by position (.iloc); the two agree as long as df
    # keeps a default 0..n-1 index -- NOTE(review): confirm for custom indexes.
    for item_id, vector in zip(df.index, df.values):
        index.add_item(item_id, vector)
    index.build(nb_trees)
    return index


def get_random_walk(df_embedding, embedding_type, path):
    '''
    Get the random walk distance computed from the random_walk_channels pairs.
    The random walk distance is the sum of the euclidean distances between the
    two members of every pair of random_walk_channels.
    PARAMETERS:
        - df_embedding: DataFrame representing the channels in the embedding space
        - embedding_type: Our project contains 2 embeddings, a user embedding
                          ('users') and an artist embedding ('artists')
        - path: Folder prefix (string-concatenated with the file name, so it must
                end with a path separator) holding the pickled random pairs
    RETURN:
        - random_walk_distance: The distance of the random walk
    RAISES:
        - ValueError: if embedding_type is neither 'users' nor 'artists'
    '''
    if embedding_type == 'users':
        tuples_file = 'random_artist_jumper_tuples.pickle'
    elif embedding_type == 'artists':
        tuples_file = 'random_user_jumper_tuples.pickle'
    else:
        # Previously this fell through and failed later with an UnboundLocalError.
        raise ValueError("embedding_type must be 'users' or 'artists', got %r" % (embedding_type,))
    random_walk_channels = load_pickle(path + tuples_file)

    # Extract the row vectors once instead of building a Series per lookup;
    # positional indexing matches the original .iloc access.
    vectors = df_embedding.values

    random_walk_distance = 0
    for val in random_walk_channels:
        random_walk_distance += distance.euclidean(vectors[val[0]], vectors[val[1]])
    return random_walk_distance



def get_ranking_position_between_channels(ref_channel, second_channel, index, df_embedding):
    '''
    Get the position of second_channel in the nearest-neighbors ranking of ref_channel.
    PARAMETER:
        - ref_channel: The reference channel on which we compute the nearest neighbors
        - second_channel: The channel whose rank we look up relative to ref_channel
        - index: Annoy index used to compute the nearest neighbors
        - df_embedding: DataFrame representing the channels in the embedding space;
                        only its length is used, as the number of neighbors requested
    RETURN:
        - The 0-based position of second_channel in the neighbor list of ref_channel,
          or None when second_channel is not part of the returned neighbors
    '''
    nearest_neighbors_index = index.get_nns_by_item(ref_channel, len(df_embedding))
    # The distance to the farthest neighbor used to be computed here but was never
    # used; it cost one extra distance per call and crashed on an empty neighbor list.
    return next(
        (rank for rank, neighbor in enumerate(nearest_neighbors_index) if neighbor == second_channel),
        None,
    )
        
        
        
        
def get_user_walk_and_position_ratio(files, embedding_type, path):
    '''
    Compute the Jumper ratio as well as the position ratio for every embedding file.
    PARAMETER:
        - files: List of paths where the embeddings are stored
        - embedding_type: 'users' or 'artists'; selects which pickled jumper pairs
                          are loaded and prefixes the result file names
        - path: Folder prefix (string-concatenated, so it must end with a path
                separator) holding the pickled pairs; results are saved under
                path + 'embedding_reduced/'
    RETURN:
        - user_jumper_tab: One list per file; each holds, per jumper pair, the
          euclidean distance of the pair divided by the random walk distance
        - ranking_position_tab: One list per file; each holds, per jumper pair, the
          neighbor rank of the pair divided by (number of rows * number of pairs)
    RAISES:
        - ValueError: if embedding_type is neither 'users' nor 'artists'
    '''
    if embedding_type == 'users':
        tuples_file = 'artist_jumper_tuples.pickle'
    elif embedding_type == 'artists':
        tuples_file = 'user_jumper_tuples.pickle'
    else:
        # Previously this fell through and failed later with an UnboundLocalError.
        raise ValueError("embedding_type must be 'users' or 'artists', got %r" % (embedding_type,))
    channels_tuple = load_pickle(path + tuples_file)

    len_random_set = len(channels_tuple)

    user_jumper_tab = []
    ranking_position_tab = []

    for file in files:
        print('file ', file)
        df_embedding = get_dataframe_in_embedding_space(file)
        n_comp = df_embedding.shape[1]
        print('n_comp ', n_comp)
        random_walk_distance = get_random_walk(df_embedding, embedding_type, path)
        index = get_annoy_index(df_embedding)
        # Same normaliser for every pair of this embedding, so compute it once.
        ranking_normaliser = df_embedding.shape[0] * len_random_set

        temp_user_jumper_tab = []
        temp_ranking_position_tab = []

        for channel in channels_tuple:
            ref_channel = channel[0]
            second_channel = channel[1]
            pair_distance = distance.euclidean(df_embedding.iloc[ref_channel], df_embedding.iloc[second_channel])
            temp_user_jumper_tab.append(pair_distance / random_walk_distance)
            # NOTE: the ranking helper yields None when second_channel is absent
            # from the neighbor list, which makes this division raise TypeError.
            pair_rank = get_ranking_position_between_channels(ref_channel, second_channel, index, df_embedding)
            temp_ranking_position_tab.append(pair_rank / ranking_normaliser)

        user_jumper_tab.append(temp_user_jumper_tab)
        ranking_position_tab.append(temp_ranking_position_tab)

    save_to_pickle(user_jumper_tab, embedding_type + '_jumper_results', path + 'embedding_reduced/')
    save_to_pickle(ranking_position_tab, embedding_type + '_ranking_position_results', path + 'embedding_reduced/')

    return user_jumper_tab, ranking_position_tab
    

In [59]:
# Remove results left over from a previous run: they live in the same folder as
# the reduced embeddings and would otherwise be matched by the glob below.
for stale_result in (
    "../data/user_embeddings/embedding_reduced/users_jumper_results.pickle",
    "../data/user_embeddings/embedding_reduced/users_ranking_position_results.pickle",
):
    if os.path.exists(stale_result):
        os.remove(stale_result)

# glob order depends on the file system; the result lists are positional (one entry
# per file, no file name stored), so sort to keep the file -> result mapping stable.
user_files = sorted(glob.glob("../data/user_embeddings/embedding_reduced/*.pickle"))
user_jumper_tab, user_ranking_position_tab = get_user_walk_and_position_ratio(user_files, 'users', '../data/user_embeddings/')

file  ../data/user_embeddings/embedding_reduced/user_embedding_norm_transformed_pca100.pickle
n_comp  100
file  ../data/user_embeddings/embedding_reduced/user_embedding_norm_transformed_pca25.pickle
n_comp  25
file  ../data/user_embeddings/embedding_reduced/user_embedding_norm_transformed_pca50.pickle
n_comp  50
file  ../data/user_embeddings/embedding_reduced/user_embedding_transformed_pca100.pickle
n_comp  100
file  ../data/user_embeddings/embedding_reduced/user_embedding_transformed_pca25.pickle
n_comp  25
file  ../data/user_embeddings/embedding_reduced/user_embedding_transformed_pca50.pickle
n_comp  50


In [60]:
# Remove results left over from a previous run. The artist results are saved under
# ../data/artist_embeddings/embedding_reduced/ (not under user_embeddings/), i.e. in
# the folder globbed below, so they must be deleted from there or they would be
# loaded as if they were embeddings.
for stale_result in (
    "../data/artist_embeddings/embedding_reduced/artists_jumper_results.pickle",
    "../data/artist_embeddings/embedding_reduced/artists_ranking_position_results.pickle",
):
    if os.path.exists(stale_result):
        os.remove(stale_result)

# glob order depends on the file system; the result lists are positional (one entry
# per file, no file name stored), so sort to keep the file -> result mapping stable.
artist_files = sorted(glob.glob("../data/artist_embeddings/embedding_reduced/*.pickle"))
artist_jumper_tab, artist_ranking_position_tab = get_user_walk_and_position_ratio(artist_files, 'artists', '../data/artist_embeddings/')

file  ../data/artist_embeddings/embedding_reduced/artist_embedding_norm_transformed_pca100.pickle
n_comp  100
file  ../data/artist_embeddings/embedding_reduced/artist_embedding_norm_transformed_pca25.pickle
n_comp  25
file  ../data/artist_embeddings/embedding_reduced/artist_embedding_norm_transformed_pca50.pickle
n_comp  50
file  ../data/artist_embeddings/embedding_reduced/artist_embedding_transformed_pca100.pickle
n_comp  100
file  ../data/artist_embeddings/embedding_reduced/artist_embedding_transformed_pca25.pickle
n_comp  25
file  ../data/artist_embeddings/embedding_reduced/artist_embedding_transformed_pca50.pickle
n_comp  50


## User embedding

In [62]:
# Number of entries in the user jumper results; get_user_walk_and_position_ratio
# appends one entry per embedding file it processes.
len(user_jumper_tab)

6

In [9]:
# Position-ratio results for the user embedding.
# NOTE(review): the saved output shows one float per file, whereas
# get_user_walk_and_position_ratio appends one list of per-pair values per file --
# the output looks stale; re-run to confirm.
user_ranking_position_tab

[0.30599426539423935,
 0.26668201157182664,
 0.28762464972728063,
 0.267672144734661,
 0.21927256150986288,
 0.24153307363817914]

## Artist embedding

In [10]:
# Jumper-ratio results for the artist embedding.
# NOTE(review): the saved output shows one float per file, whereas
# get_user_walk_and_position_ratio appends one list of per-pair values per file --
# the output looks stale; re-run to confirm.
artist_jumper_tab

[0.27994425186460686,
 0.27103268519426194,
 0.27767645820374864,
 0.2772808688676532,
 0.26398939194861554,
 0.2744653738416886]

In [11]:
# Position-ratio results for the artist embedding.
# NOTE(review): the saved output shows one float per file, whereas
# get_user_walk_and_position_ratio appends one list of per-pair values per file --
# the output looks stale; re-run to confirm.
artist_ranking_position_tab

[0.1578612495751671,
 0.1806129017031079,
 0.16770055889128055,
 0.16007017295419357,
 0.1858899305162192,
 0.17114596314338582]

## Compute random user jumper and user jumper for the artist embedding

In [115]:
# Number of pairs drawn for each jumper set built in the cells below.
NB_SAMPLE = 3000

# Seed NumPy's global RNG so the sampled pairs (and therefore every metric
# computed from the pickled tuples) are reproducible across kernel restarts.
SEED = 42
np.random.seed(SEED)

In [116]:
# Random user jumper: draw NB_SAMPLE pairs of artist indices (rows of S)
# uniformly at random, without ever reusing an index.
artist_ids = np.arange(S.shape[0])
selected_artists = np.random.choice(artist_ids, size=(NB_SAMPLE, 2), replace=False)

# Each sampled row becomes one (first_artist, second_artist) tuple.
artists_tuples = [(pair[0], pair[1]) for pair in selected_artists]
save_to_pickle(artists_tuples, 'random_user_jumper_tuples', '../data/artist_embeddings/')

In [117]:
# User jumper: sample candidate users (columns of S) and, for every user that has
# at least two non-zero artist entries, draw a pair of those artists.
# Up to 2 * NB_SAMPLE users are drawn, presumably because duplicates are dropped
# by the set() and users with fewer than two artists yield no pair.

# np.random.randint excludes its upper bound, so the bound must be S.shape[1]
# (not S.shape[1] - 1) for the last user to be selectable.
selected_users = list(set(np.random.randint(S.shape[1], size=int(2*NB_SAMPLE))))

S_filter = S[:, selected_users]

# Create and store artists tuples
artists_tuples = []
for i in range(S_filter.shape[1]):
    # Row indices (artists) holding a non-zero entry for the i-th selected user.
    idx = S_filter[:, i].nonzero()[0]
    if len(idx) >= 2:
        artist_pair = np.random.choice(idx, 2, replace=False)
        artists_tuples.append((artist_pair[0], artist_pair[1]))
    if len(artists_tuples) >= NB_SAMPLE:
        break

save_to_pickle(artists_tuples, 'user_jumper_tuples', '../data/artist_embeddings/')

## Compute random artist jumper and artist jumper for the user embedding

In [118]:
# Random artist jumper: draw NB_SAMPLE pairs of user indices (rows of S2 = S.T)
# uniformly at random, without ever reusing an index.
S2 = S.T
user_ids = np.arange(S2.shape[0])
selected_users = np.random.choice(user_ids, size=(NB_SAMPLE, 2), replace=False)

# Each sampled row becomes one (first_user, second_user) tuple.
users_tuples = [(pair[0], pair[1]) for pair in selected_users]
save_to_pickle(users_tuples, 'random_artist_jumper_tuples', '../data/user_embeddings/')

In [119]:
# Artist jumper: sample candidate artists (columns of S2) and, for every artist that
# has at least two non-zero user entries, draw a pair of those users.
# Up to 2 * NB_SAMPLE artists are drawn, presumably because duplicates are dropped
# by the set() and artists with fewer than two users yield no pair.

# np.random.randint excludes its upper bound, so the bound must be S2.shape[1]
# (not S2.shape[1] - 1) for the last artist to be selectable.
selected_artists = list(set(np.random.randint(S2.shape[1], size=int(2*NB_SAMPLE))))

S2_filter = S2[:, selected_artists]

# Create and store users tuples
users_tuples = []
for i in range(S2_filter.shape[1]):
    # Row indices (users) holding a non-zero entry for the i-th selected artist.
    idx = S2_filter[:, i].nonzero()[0]
    if len(idx) >= 2:
        selected_users = np.random.choice(idx, 2, replace=False)
        users_tuples.append((selected_users[0], selected_users[1]))
    if len(users_tuples) >= NB_SAMPLE:
        break

save_to_pickle(users_tuples, 'artist_jumper_tuples', '../data/user_embeddings/')