In [1]:
import pickle
import glob

import pandas as pd
import numpy as np

from helper_functions import *
from annoy import AnnoyIndex
from scipy.spatial import distance

In [12]:
'''
Create a Pandas DataFrame from the embedding
PARAMETERS:
    - model_path: Path where the embedding is stored
RETURN:
    - df: Pandas DataFrame
'''
def get_dataframe_in_embedding_space(model_path):
    """Load a pickled embedding and wrap it in a Pandas DataFrame.

    PARAMETERS:
        - model_path: Path of the pickle file, handed to load_pickle
    RETURN:
        - A DataFrame built from the loaded object, with every column label
          prefixed by 'd_' (e.g. 0 -> 'd_0')
    """
    embedding = load_pickle(model_path)
    # Rename only the column axis; row labels are left untouched.
    prefixed = pd.DataFrame(embedding).rename(columns=lambda label: 'd_' + str(label))
    return prefixed

'''
Build the Annoy index in order to compute afterwards the nearest neighbors
PARAMETERS:
    - df: DataFrame representing the embedding space
RETURN:
    - The annoy index
'''
def get_annoy_index(df, nb_trees=100):
    """Build an Annoy index (euclidean metric) over the rows of df.

    PARAMETERS:
        - df: DataFrame representing the embedding space, one item per row
        - nb_trees: Number of trees passed to the index build step
          (defaults to 100, the value that used to be hard-coded)
    RETURN:
        - The built Annoy index; each row is stored under its index label
    """
    index = AnnoyIndex(df.shape[1], "euclidean")  # vector length = number of columns
    # Plain loop instead of DataFrame.apply: add_item is called purely for its
    # side effect, so collecting its return values into a Series was wasted work.
    for item_id, vector in zip(df.index, df.values):
        index.add_item(item_id, vector)
    index.build(nb_trees)
    return index


'''
Get the random walk distance computed from the random_walk_channels pairs. The random walk distance is the sum of the euclidean distance between the pairs of random_walk_channels.
PARAMETERS:
    - df_embedding: DataFrame representing the channels in the embedding space
    - embedding_type: Our project contains 2 embeddings, a user embedding and an artist embedding
RETURN:
    - random_walk_distance: The distance of the random walk
'''
def get_random_walk(df_embedding, embedding_type):
    """Sum the euclidean distances between the stored random pairs.

    PARAMETERS:
        - df_embedding: DataFrame representing the channels in the embedding space;
          pair members are used as positional (iloc) row indices
        - embedding_type: 'users' loads the random user pairs, any other value
          loads the random artist pairs
    RETURN:
        - The accumulated distance over all loaded pairs (0 when there are none)
    """
    pairs_path = ('../data/random_users_tuples.pickle'
                  if embedding_type == 'users'
                  else '../data/random_artists_tuples.pickle')
    random_pairs = load_pickle(pairs_path)

    # sum() starts from 0 and adds left to right, i.e. the same accumulation
    # order as an explicit running total.
    return sum(
        distance.euclidean(df_embedding.iloc[pair[0]], df_embedding.iloc[pair[1]])
        for pair in random_pairs
    )



'''
Get the position of second_channel in the nearest neighbors ranking of ref_channel.
PARAMETER:
    - ref_channel: The reference channel on which we compute the k nearest neighbors
    - second_channel: The channel whose rank is looked up among the neighbors of ref_channel
    - index: Annoy index used to compute the nearest neighbors
    - df_embedding: DataFrame representing the channels in the embedding space
RETURN: 
    - The position of second_channel relative to ref_channel in terms of its ranking
'''
def get_ranking_position_between_channels(ref_channel, second_channel, index, df_embedding):
    """Return the rank of second_channel among the nearest neighbors of ref_channel.

    PARAMETERS:
        - ref_channel: Item whose nearest neighbors are queried from the index
        - second_channel: Item whose position in that neighbor list is wanted
        - index: Object exposing get_nns_by_item(item, n) (the Annoy index)
        - df_embedding: DataFrame of the embedding; only its length is used, as
          the number of neighbors to request
    RETURN:
        - The 0-based position of second_channel in the returned neighbor list,
          or None when it is not part of that list
    """
    # Ask for as many neighbors as there are rows so that the ranking can cover
    # the whole embedding.
    neighbors = index.get_nns_by_item(ref_channel, len(df_embedding))

    # The distance to the farthest neighbor used to be computed here but was
    # never used; it only cost time and crashed on an empty neighbor list.
    for rank, neighbor in enumerate(neighbors):
        if neighbor == second_channel:
            return rank

    # Explicit "not found" result (previously an implicit fall-through).
    return None
        
        
        
        
'''
Compute the Jumper ratio as well as the position ratio.
PARAMETER:
    - files: List of paths where the embeddings are stored
    - embedding_type: 'users' to load the user jumper channel pairs, any other value to load the artist ones
    - save_path: Location passed to save_to_pickle to store both result lists
RETURN: 
    - user_jumper_tab: List of Jumper ratios, one per embedding contained in files
    - ranking_position_tab: List of Position ratios, one per embedding contained in files
'''
def get_user_walk_and_position_ratio(files, embedding_type, save_path):
    """Compute the Jumper ratio and the position ratio for each embedding file.

    For every file the embedding is loaded, an Annoy index is built, and over all
    stored channel pairs two totals are accumulated: the euclidean distance
    between the pair members and the neighbor rank of the second member relative
    to the first.

    PARAMETERS:
        - files: Iterable of paths to the pickled embeddings
        - embedding_type: 'users' loads the user pairs, any other value loads
          the artist pairs (also forwarded to get_random_walk)
        - save_path: Forwarded to save_to_pickle for both result lists
    RETURN:
        - user_jumper_tab: Per file, total pair distance / random walk distance
        - ranking_position_tab: Per file, total rank / (rows * number of pairs)
    """
    tuples_path = ('../data/users_tuples_artist_walk.pickle'
                   if embedding_type == 'users'
                   else '../data/artists_tuples_user_walk.pickle')
    channels_tuple = load_pickle(tuples_path)
    nb_pairs = len(channels_tuple)

    user_jumper_tab, ranking_position_tab = [], []

    for file in files:
        print('file ', file)
        df_embedding = get_dataframe_in_embedding_space(file)
        nb_rows, n_comp = df_embedding.shape
        print('n_comp ', n_comp)
        random_walk_distance = get_random_walk(df_embedding, embedding_type)
        index = get_annoy_index(df_embedding)

        # Running totals over all channel pairs for this embedding.
        walk_total = 0
        rank_total = 0
        for pair in channels_tuple:
            ref_channel, second_channel = pair[0], pair[1]
            walk_total += distance.euclidean(df_embedding.iloc[ref_channel],
                                             df_embedding.iloc[second_channel])
            rank_total += get_ranking_position_between_channels(
                ref_channel, second_channel, index, df_embedding)

        user_jumper_tab.append(walk_total / random_walk_distance)
        ranking_position_tab.append(rank_total / (nb_rows * nb_pairs))

    # Both lists are always stored under the 'user_*' names, whatever embedding_type is.
    save_to_pickle(user_jumper_tab, 'user_jumper_tab', save_path)
    save_to_pickle(ranking_position_tab, 'user_ranking_position_tab', save_path)

    return user_jumper_tab, ranking_position_tab
    

In [13]:
# glob.glob returns paths in arbitrary order; sort them so that position i of the
# result lists always refers to the same embedding file.
# NOTE(review): the results are saved into the directory that is globbed here; if
# save_to_pickle writes '*.pickle' files, a re-run would treat them as embeddings — confirm.
user_files = sorted(glob.glob("../data/user_embeddings/*.pickle"))
user_jumper_tab, user_ranking_position_tab = get_user_walk_and_position_ratio(
    user_files, 'users', '../data/user_embeddings/')

file  ../data/user_embeddings/user_embedding_transformed_pca100.pickle
n_comp  100
file  ../data/user_embeddings/user_embedding_transformed_pca200.pickle
n_comp  200


KeyboardInterrupt: 

In [None]:
# glob.glob returns paths in arbitrary order; sort them so that position i of the
# result lists always refers to the same embedding file.
# NOTE(review): the results are saved into the directory that is globbed here; if
# save_to_pickle writes '*.pickle' files, a re-run would treat them as embeddings — confirm.
artist_files = sorted(glob.glob("../data/artist_embeddings/*.pickle"))
artist_jumper_tab, artist_ranking_position_tab = get_user_walk_and_position_ratio(
    artist_files, 'artists', '../data/artist_embeddings/')

In [7]:
# Jumper ratios of the user embeddings, one value per embedding file.
# NOTE(review): this cell's execution count (In [7]) precedes the definition cell
# (In [12]) and the interrupted run (In [13]); the output below is from an earlier
# run — re-check after Restart & Run All.
user_jumper_tab

[0.8802545346905915,
 0.9117009061461726,
 0.9342720251387152,
 0.7645890741687803,
 0.8161118158661519,
 0.8389139952430437]

In [8]:
# Position ratios of the user embeddings, one value per embedding file.
# NOTE(review): this cell's execution count (In [8]) precedes the definition cell
# (In [12]); the output below is from an earlier run — re-check after Restart & Run All.
user_ranking_position_tab

[0.29570588616201515,
 0.32108627800973566,
 0.33904781398072575,
 0.2262154909076576,
 0.2534895337411913,
 0.2661860812131006]

In [9]:
# Jumper ratios of the artist embeddings, one value per embedding file.
# NOTE(review): the cell that computes this variable is shown unexecuted (In [None]);
# the output below is from an earlier run — re-check after Restart & Run All.
artist_jumper_tab

[0.11762417644235337,
 0.11864663748741885,
 0.12090121307717669,
 0.10096131499168012,
 0.10482059826160789,
 0.10976425009713582]

In [10]:
# Position ratios of the artist embeddings, one value per embedding file.
# NOTE(review): the cell that computes this variable is shown unexecuted (In [None]);
# the output below is from an earlier run — re-check after Restart & Run All.
artist_ranking_position_tab

[0.07904092684367718,
 0.07633384751714857,
 0.0751011848107913,
 0.13934444845580754,
 0.1158905444878401,
 0.10788427582132659]