## Import

In [1]:
import pickle
import scipy.sparse
import time

import numpy as np
import pandas as pd

from helper_functions import *
from scipy.sparse import dok_matrix
from sklearn.preprocessing import normalize

# Artist-Level embedding

In [2]:
# Load precomputed data
user2id_am = load_pickle('../data/artist_embeddings/user2id_am.pickle')
id2user_am = load_pickle('../data/artist_embeddings/id2user_am.pickle')
artist2id_am = load_pickle('../data/artist_embeddings/artist2id_am.pickle')
id2artist_am = load_pickle('../data/artist_embeddings/id2artist_am.pickle')

matrix_plays_am = scipy.sparse.load_npz('../data/artist_embeddings/matrix_plays_am.npz')

## Tf-IDF

In [3]:
plays_per_artists = matrix_plays_am.sum(axis=1)

In [4]:
# Number of artists and users
nb_artists = len(artist2id_am)
nb_users = len(user2id_am)

# Dictionnary containing artitst, user tuple
graph_dict = {}

start = time.time()
for user_idx in range(nb_users):

    user_row = matrix_plays_am.T[user_idx]
    _, non_zero_artists = user_row.nonzero()
    idf = np.log(nb_artists / len(non_zero_artists))

    for artist_idx in non_zero_artists:
        tf = user_row[0, artist_idx] / plays_per_artists[artist_idx, 0]
        graph_dict[artist_idx, user_idx] = tf * idf

    if (user_idx % 10000 == 0) and (user_idx != 0):
        print('Processed {} users... ({:.2f} s)'.format(user_idx,
                                                        time.time() - start))
        start = time.time()


graph_matrix = dok_matrix((nb_artists, nb_users))
dict.update(graph_matrix, graph_dict)
graph_dict = {}

# Save sparse matrix
scipy.sparse.save_npz('../data/artist_embeddings/artists_embedding.npz', graph_matrix.tocsr())

Processed 10000 users... (93.86 s)
Processed 20000 users... (96.20 s)
Processed 30000 users... (100.00 s)
Processed 40000 users... (104.09 s)
Processed 50000 users... (100.69 s)
Processed 60000 users... (101.52 s)
Processed 70000 users... (99.40 s)
Processed 80000 users... (93.61 s)


## Number of occurence of plays (normalized)

In [5]:
artists_emb_normalized = normalize(matrix_plays_am, norm='l1', axis=1)
scipy.sparse.save_npz('../data/artist_embeddings/artists_embedding_norm.npz',
                      artists_emb_normalized.tocsr())

# User-Level embedding

In [6]:
# Load precomputed data
user2id_um = load_pickle('../data/user_embeddings/user2id_um.pickle')
id2user_um = load_pickle('../data/user_embeddings/id2user_um.pickle')
artist2id_um = load_pickle('../data/user_embeddings/artist2id_um.pickle')
id2artist_um = load_pickle('../data/user_embeddings/id2artist_um.pickle')

matrix_plays_um = scipy.sparse.load_npz('../data/user_embeddings/matrix_plays_um.npz')

## TF-IDF

In [16]:
plays_per_user = matrix_plays_um.sum(axis=1)

In [17]:
# Number of artists and users
nb_artists = len(artist2id_um)
nb_users = len(user2id_um)

# Dictionnary containing user, artist tuple
graph_dict = {}

start = time.time()
for artist_idx in range(nb_artists):
    artist_row = matrix_plays_um.T[artist_idx]
    _, non_zero_users = artist_row.nonzero()
    idf = np.log((nb_users + 1) / (len(non_zero_users) + 1))

    for user_idx in non_zero_users:
        tf = (artist_row[0, user_idx] + 1) / (plays_per_user[user_idx, 0] + 1)
        graph_dict[user_idx, artist_idx] = tf * idf

    if (artist_idx % 10000 == 0) and (artist_idx != 0):
        print('Processed {} artists... ({:.2f} s)'.format(
            artist_idx,
            time.time() - start))
        start = time.time()


graph_matrix = dok_matrix((nb_users, nb_artists))
dict.update(graph_matrix, graph_dict)
graph_dict = {}

# Save sparse matrix
scipy.sparse.save_npz('../data/user_embeddings/users_embedding.npz', graph_matrix.tocsr())

Processed 10000 artists... (41.02 s)
Processed 20000 artists... (35.31 s)
Processed 30000 artists... (37.27 s)
Processed 40000 artists... (36.60 s)
Processed 50000 artists... (36.39 s)
Processed 60000 artists... (33.71 s)


## Number of occurence of plays (normalized)

In [18]:
user_emb_normalized = normalize(matrix_plays_um, norm='l1', axis=1)
scipy.sparse.save_npz('../data/user_embeddings/users_embedding_norm.npz',
                      user_emb_normalized.tocsr())