## Import

In [1]:
import pickle
import scipy.sparse
import time

import numpy as np
import pandas as pd

from helper_functions import *
from scipy.sparse import dok_matrix
from sklearn.preprocessing import normalize

In [2]:
# Load precomputed data
user2id = load_pickle('../data/user2id.pickle')
id2user = load_pickle('../data/id2user.pickle')
artist2id = load_pickle('../data/artist2id.pickle')
id2artist = load_pickle('../data/id2artist.pickle')

matrix_plays = scipy.sparse.load_npz('../data/matrix_plays.npz')

In [3]:
# Number of artists and users
nb_artists = len(artist2id)
nb_users = len(user2id)

# Artist-Level embedding

## Tf-IDF

In [4]:
plays_per_artists = matrix_plays.sum(axis=1)

In [5]:
# Dictionnary containing artitst, user tuple
graph_dict = {}

start = time.time()
for user_idx in range(nb_users):

    user_row = matrix_plays.T[user_idx]
    _, non_zero_artists = user_row.nonzero()
    idf = np.log(nb_artists / len(non_zero_artists))

    for artist_idx in non_zero_artists:
        tf = user_row[0, artist_idx] / plays_per_artists[artist_idx, 0]
        graph_dict[artist_idx, user_idx] = tf * idf

    if (user_idx % 10000 == 0) and (user_idx != 0):
        print('Processed {} users... ({:.2f} s)'.format(user_idx,
                                                        time.time() - start))
        start = time.time()


graph_matrix = dok_matrix((nb_artists, nb_users))
dict.update(graph_matrix, graph_dict)
graph_dict = {}

# Save sparse matrix
scipy.sparse.save_npz('../data/artist_embeddings/artists_embedding.npz', graph_matrix.tocsr())

Processed 10000 users... (85.73 s)
Processed 20000 users... (95.31 s)
Processed 30000 users... (90.31 s)
Processed 40000 users... (92.60 s)
Processed 50000 users... (84.06 s)
Processed 60000 users... (73.93 s)


## Number of occurence of plays (normalized)

In [6]:
artists_emb_normalized = normalize(matrix_plays, norm='l1', axis=1)
scipy.sparse.save_npz('../data/artist_embeddings/artists_embedding_norm.npz',
                      artists_emb_normalized.tocsr())

# User-Level embedding

## TF-IDF

In [7]:
matrix_plays_t = matrix_plays.T

In [8]:
plays_per_user = matrix_plays_t.sum(axis=1)

In [9]:
# Dictionnary containing user, artist tuple
graph_dict = {}

start = time.time()
for artist_idx in range(nb_artists):
    artist_row = matrix_plays_t.T[artist_idx]
    _, non_zero_users = artist_row.nonzero()
    idf = np.log((nb_users + 1) / (len(non_zero_users) + 1))

    for user_idx in non_zero_users:
        tf = (artist_row[0, user_idx] + 1) / (plays_per_user[user_idx, 0] + 1)
        graph_dict[user_idx, artist_idx] = tf * idf

    if (artist_idx % 10000 == 0) and (artist_idx != 0):
        print('Processed {} artists... ({:.2f} s)'.format(
            artist_idx,
            time.time() - start))
        start = time.time()


graph_matrix = dok_matrix((nb_users, nb_artists))
dict.update(graph_matrix, graph_dict)
graph_dict = {}

# Save sparse matrix
scipy.sparse.save_npz('../data/user_embeddings/users_embedding.npz', graph_matrix.tocsr())

Processed 10000 artists... (60.91 s)
Processed 20000 artists... (64.65 s)


## Number of occurence of plays (normalized)

In [10]:
user_emb_normalized = normalize(matrix_plays_t, norm='l1', axis=1)
scipy.sparse.save_npz('../data/user_embeddings/users_embedding_norm.npz',
                      user_emb_normalized.tocsr())