In [1]:
import pickle
import scipy.sparse

import numpy as np
import pandas as pd

from helper_functions import *
from scipy.sparse import dok_matrix

In [2]:
# Column layout of the raw TSV file, which is read without a header row.
col_names_consumption = [
    'user-mboxsha1',
    'musicbrainz-artist-id',
    'artist-name',
    'plays',
]

# Read the file, shorten the two id column names and drop the artist name column.
df = (
    pd.read_csv('../data/usersha1-artmbid-artname-plays.tsv', sep='\t', names=col_names_consumption)
      .rename(columns={'user-mboxsha1': 'user_id', 'musicbrainz-artist-id': 'artist_id'})
      .loc[:, ['user_id', 'artist_id', 'plays']]
)
df.head()

Unnamed: 0,user_id,artist_id,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,3bd73256-3905-4f3a-97e2-8b341527f805,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,f2fb0ff0-5679-42ec-a55c-15109ce6e320,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,b3ae82c2-e60b-4551-a76d-6620f1b456aa,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,706


# Data Processing

## Filter the data

In [3]:
# Count the missing (NaN) values in each column of df.
df.isna().sum()

user_id           0
artist_id    226137
plays             0
dtype: int64

In [4]:
# Dimensions of df as (number of rows, number of columns).
df.shape

(17535655, 3)

In [6]:
# About 1.29% of the rows have no artist id: discard every row that holds a NaN.
df = df.dropna(axis=0, how='any')
df.shape

(17309518, 3)

In [7]:
# A play count of 0 is not a valid listening record, so keep only rows with at least one play.
df = df[df['plays'].ge(1)]

In [8]:
# User-side thresholds:
#   T1 - number of plays per user
#   T2 - number of artists per user
T1, T2 = 1000, 50

# Artist-side thresholds:
#   T3 - number of plays per artist
#   T4 - number of users per artist
T3, T4 = 100, 10

In [87]:
# Per-user statistics: total plays (sum) and number of rows, i.e. artists (count).
plays_grouped_by_user = df.groupby('user_id')[['plays']]
plays_per_user = plays_grouped_by_user.sum()
artists_per_user = plays_grouped_by_user.count().rename(columns={'plays': 'n_artists'})

# Per-artist statistics: total plays (sum) and number of rows, i.e. users (count).
plays_grouped_by_artist = df.groupby('artist_id')[['plays']]
plays_per_artist = plays_grouped_by_artist.sum()
users_per_artist = plays_grouped_by_artist.count().rename(columns={'plays': 'n_users'})

In [13]:
# Keep the interesting users / artists in sets: membership tests and intersections are fast.

# Users above the plays threshold (T1), then also above the artists threshold (T2).
users_above_t1 = plays_per_user.loc[plays_per_user['plays'] > T1].index
users_filtered = set(users_above_t1)
print('Number of users after filtering with T1: {}'.format(len(users_filtered)))
users_above_t2 = artists_per_user.loc[artists_per_user['n_artists'] > T2].index
users_filtered = users_filtered.intersection(users_above_t2)
print('Number of users after filtering with T1 and T2: {}'.format(len(users_filtered)))

# Artists above the plays threshold (T3), then also above the users threshold (T4).
artists_above_t3 = plays_per_artist.loc[plays_per_artist['plays'] > T3].index
artists_filtered = set(artists_above_t3)
print('Number of artists after filtering with T3: {}'.format(len(artists_filtered)))
artists_above_t4 = users_per_artist.loc[users_per_artist['n_users'] > T4].index
artists_filtered = artists_filtered.intersection(artists_above_t4)
print('Number of artists after filtering with T3 and T4: {}'.format(len(artists_filtered)))

Number of users after filtering with T1: 315799
Number of users after filtering with T1 and T2: 89008
Number of artists after filtering with T3: 123227
Number of artists after filtering with T3 and T4: 60938


However, we would like a single matrix in which every remaining user and every remaining artist satisfies all four thresholds. Because the thresholds are interdependent, removing users can push some artists below their thresholds, and the other way around. We therefore alternate between filtering the users and filtering the artists until we converge to a subset of users and a subset of artists that both satisfy the four thresholds.

In [103]:
# Alternately re-apply the user thresholds (T1, T2) and the artist thresholds (T3, T4).
# After each pass the per-artist / per-user statistics are recomputed on the rows that
# survive, and the loop stops as soon as neither set changes between two iterations.
num_iterations = 15  # upper bound on the number of passes
users_filtered_iteration = None
artists_filtered_iteration = None

# Start from the statistics computed on the whole (cleaned) frame.
plays_per_user_filtered = plays_per_user.copy()
artists_per_user_filtered = artists_per_user.copy()

plays_per_artist_filtered = plays_per_artist.copy()
users_per_artist_filtered = users_per_artist.copy()

previous_users_filtered_iteration = set(plays_per_user_filtered.index)
previous_artists_filtered_iteration = set(plays_per_artist_filtered.index)

for i in range(num_iterations):

    # Filter users: above both user thresholds and still present after the previous pass.
    users_filtered_iteration = set(plays_per_user_filtered[plays_per_user_filtered['plays'] > T1].index)
    users_filtered_iteration = users_filtered_iteration & set(artists_per_user_filtered[artists_per_user_filtered['n_artists'] > T2].index)
    users_filtered_iteration = users_filtered_iteration & previous_users_filtered_iteration

    # Rows of the surviving users. This scan over the full frame is the expensive step,
    # so it is done once per iteration and reused for both sets of statistics below.
    rows_of_kept_users = df[df['user_id'].isin(users_filtered_iteration)]

    # Per-artist statistics restricted to the surviving users and the previous artist set.
    rows_for_artist_stats = rows_of_kept_users[rows_of_kept_users['artist_id'].isin(previous_artists_filtered_iteration)]
    grouped_by_artist = rows_for_artist_stats[['artist_id', 'plays']].groupby('artist_id')
    plays_per_artist_filtered = grouped_by_artist.sum()
    users_per_artist_filtered = grouped_by_artist.count().rename({'plays':'n_users'}, axis=1)

    # Filter artists: above both artist thresholds and still present after the previous pass.
    artists_filtered_iteration = set(plays_per_artist_filtered[plays_per_artist_filtered['plays'] > T3].index)
    artists_filtered_iteration = artists_filtered_iteration & set(users_per_artist_filtered[users_per_artist_filtered['n_users'] > T4].index)
    artists_filtered_iteration = artists_filtered_iteration & previous_artists_filtered_iteration

    # Per-user statistics restricted to the surviving users and the new artist set.
    rows_for_user_stats = rows_of_kept_users[rows_of_kept_users['artist_id'].isin(artists_filtered_iteration)]
    grouped_by_user = rows_for_user_stats[['user_id', 'plays']].groupby('user_id')
    plays_per_user_filtered = grouped_by_user.sum()
    artists_per_user_filtered = grouped_by_user.count().rename({'plays':'n_artists'}, axis=1)

    print("iteration {}, num users {}, num artists {}".format(i+1, len(users_filtered_iteration), len(artists_filtered_iteration)))

    # Stop condition: both sets are unchanged with respect to the previous iteration.
    if previous_users_filtered_iteration == users_filtered_iteration and previous_artists_filtered_iteration == artists_filtered_iteration:
        print("Converged!")
        break

    previous_users_filtered_iteration = users_filtered_iteration.copy()
    previous_artists_filtered_iteration = artists_filtered_iteration.copy()
else:
    # Only reached when the loop ran out of iterations without hitting the break above.
    print("Did not converge after {} iterations".format(num_iterations))
    

iteration 1, num users 89008, num artists 33333
iteration 2, num users 68472, num artists 26454
iteration 3, num users 62687, num artists 24492
iteration 4, num users 60636, num artists 23752
iteration 5, num users 59796, num artists 23426
iteration 6, num users 59427, num artists 23301
iteration 7, num users 59278, num artists 23238
iteration 8, num users 59212, num artists 23211
iteration 9, num users 59169, num artists 23194
iteration 10, num users 59147, num artists 23187
iteration 11, num users 59139, num artists 23182
iteration 12, num users 59135, num artists 23178
iteration 13, num users 59131, num artists 23178
iteration 14, num users 59131, num artists 23178
Converged!


## Storing data in sparse matrix

- One user per column
- One artist per row

In [10]:
# Lookup dictionaries giving the matrix index of a user / artist (and back) in O(1).
# Ids are sorted first, so the index assignment is deterministic.

ordered_users = sorted(users_filtered)
id2user = dict(enumerate(ordered_users))
user2id = {user_id: position for position, user_id in id2user.items()}

ordered_artists = sorted(artists_filtered)
id2artist = dict(enumerate(ordered_artists))
artist2id = {artist_id: position for position, artist_id in id2artist.items()}

In [11]:
# Create the sparse matrix: one row per artist, one column per user.
# Play counts go well beyond 255 (e.g. 2137 in the first rows of the data), so an
# 8-bit dtype cannot represent them; a 32-bit unsigned integer is used instead.
S = dok_matrix((len(artists_filtered), len(users_filtered)), dtype=np.uint32)

In [12]:
# Fill the data dictionary: (artist index, user index) -> play count
data = {}

# Walking the three columns in lockstep visits the same rows, in the same order, as
# df.iterrows(), but without building a Series object for every one of the rows.
for i, (user, artist, plays) in enumerate(zip(df['user_id'], df['artist_id'], df['plays'])):

    if i % 1000000 == 0:
        print('Processed {} lines...'.format(i))

    # Keep only interesting users and artists
    if (user in users_filtered) and (artist in artists_filtered):

        data[(artist2id[artist], user2id[user])] = plays

Processed 0 lines...
Processed 1000000 lines...
Processed 2000000 lines...
Processed 3000000 lines...
Processed 4000000 lines...
Processed 5000000 lines...
Processed 6000000 lines...
Processed 7000000 lines...
Processed 8000000 lines...
Processed 9000000 lines...
Processed 10000000 lines...
Processed 11000000 lines...
Processed 12000000 lines...
Processed 13000000 lines...
Processed 14000000 lines...
Processed 15000000 lines...
Processed 16000000 lines...
Processed 17000000 lines...


In [13]:
# Store the collected entries in the sparse matrix.
# The matrix is built through the public coordinate-format constructor rather than by
# writing into the dok_matrix with dict.update, which depends on how dok_matrix keeps
# its entries internally. Shape and dtype are taken from the existing S.
n_entries = len(data)
entry_rows = np.fromiter((row for row, _ in data), dtype=np.int64, count=n_entries)
entry_cols = np.fromiter((col for _, col in data), dtype=np.int64, count=n_entries)
entry_vals = np.fromiter(data.values(), dtype=S.dtype, count=n_entries)

S = scipy.sparse.coo_matrix((entry_vals, (entry_rows, entry_cols)), shape=S.shape).tocsr()

# free the memory
del data, entry_rows, entry_cols, entry_vals

Because thresholds are applied to both users and artists, some of the kept artists have been played by only a few of the kept users, and vice versa. In order to fulfill the thresholds, we use independent matrices for the artist and user embeddings.

## Artist matrix plays

In [14]:
# The artist-embedding matrix reuses the user indexing of S unchanged.
user2id_am = user2id
id2user_am = id2user

# Number of stored entries in each row of S (one row per artist).
counts_per_artist = S.getnnz(axis = 1)

# Row indices of the artists with at least T4 entries.
# NOTE(review): the earlier filtering cells compare with a strict `> T4` - confirm `>=` is intended here.
artists_to_keep = np.flatnonzero(counts_per_artist >= T4).tolist()
counter_artist_idx = len(artists_to_keep)

# Re-index the kept artists from 0 and build the reverse lookup.
id2artist_am = {new_idx: id2artist[old_idx] for new_idx, old_idx in enumerate(artists_to_keep)}
artist2id_am = {artist: new_idx for new_idx, artist in id2artist_am.items()}

matrix_plays_am = S[artists_to_keep]

In [15]:
# Persist the four lookup dictionaries of the artist-embedding matrix, then the matrix itself.
artist_embedding_lookups = [
    (user2id_am, 'user2id_am'),
    (id2user_am, 'id2user_am'),
    (artist2id_am, 'artist2id_am'),
    (id2artist_am, 'id2artist_am'),
]
for lookup, lookup_name in artist_embedding_lookups:
    save_to_pickle(lookup, lookup_name, '../data/artist_embeddings/')

scipy.sparse.save_npz('../data/artist_embeddings/matrix_plays_am.npz', matrix_plays_am)

## User matrix plays

In [16]:
# The user-embedding matrix reuses the artist indexing of S unchanged.
artist2id_um = artist2id
id2artist_um = id2artist

# Number of stored entries in each row of S.T (one row per user once transposed).
counts_per_user = (S.T).getnnz(axis = 1)

# Row indices of the users with at least T2 entries.
# NOTE(review): the earlier filtering cells compare with a strict `> T2` - confirm `>=` is intended here.
users_to_keep = np.flatnonzero(counts_per_user >= T2).tolist()
counter_user_idx = len(users_to_keep)

# Re-index the kept users from 0 and build the reverse lookup.
id2user_um = {new_idx: id2user[old_idx] for new_idx, old_idx in enumerate(users_to_keep)}
user2id_um = {user: new_idx for new_idx, user in id2user_um.items()}

matrix_plays_um = S.T[users_to_keep]

In [17]:
# Persist the four lookup dictionaries of the user-embedding matrix, then the matrix itself.
user_embedding_lookups = [
    (user2id_um, 'user2id_um'),
    (id2user_um, 'id2user_um'),
    (artist2id_um, 'artist2id_um'),
    (id2artist_um, 'id2artist_um'),
]
for lookup, lookup_name in user_embedding_lookups:
    save_to_pickle(lookup, lookup_name, '../data/user_embeddings/')

scipy.sparse.save_npz('../data/user_embeddings/matrix_plays_um.npz', matrix_plays_um)

## Compute random user jumper and user jumper for the artist embedding

In [18]:
# Number of pairs to draw in each of the sampling cells below.
NB_SAMPLE = 5000

In [19]:
'''
Random user jumper: Selects pair of random artists.
'''

# Draw 2 * NB_SAMPLE distinct row indices of the artist matrix, arranged as NB_SAMPLE pairs.
n_artists_am = matrix_plays_am.shape[0]
selected_artists = np.random.choice(np.arange(n_artists_am), size = (NB_SAMPLE, 2), replace=False)

# One (first, second) tuple per sampled row.
artists_tuples = list(zip(selected_artists[:, 0], selected_artists[:, 1]))
save_to_pickle(artists_tuples, 'random_user_jumper_tuples', '../data/artist_embeddings/')

In [20]:
'''
User jumper: Draw up to 2*NB_SAMPLE distinct random users and, for each drawn user with at
least 2 artists, select 2 of that user's artists, until NB_SAMPLE pairs have been collected.
'''

# np.random.randint excludes its upper bound, so the number of columns itself is passed:
# every user index, including the last one, can then be drawn.
# The set removes users that were drawn more than once.
selected_users = list(set(np.random.randint(matrix_plays_am.shape[1], size=int(2*NB_SAMPLE))))

# Keep only the columns (users) that were drawn.
matrix_plays_am_filter = matrix_plays_am[:, selected_users]

# Create and store artists tuples
artists_tuples = []
for i in range(matrix_plays_am_filter.shape[1]):
    # Row indices (artists) with a non-zero play count for this user
    idx = matrix_plays_am_filter[:, i].nonzero()[0]

    # A pair can only be drawn from a user with at least two artists
    if len(idx) >= 2:
        selected_artists = np.random.choice(idx, 2, replace=False)
        artists_tuples.append((selected_artists[0], selected_artists[1]))
    if len(artists_tuples) >= NB_SAMPLE:
        break

save_to_pickle(artists_tuples, 'user_jumper_tuples', '../data/artist_embeddings/')

## Compute random artist jumper and artist jumper for the user embedding

In [21]:
'''
Random artist jumper: Selects pair of random users.
'''
# Draw 2 * NB_SAMPLE distinct row indices of the user matrix, arranged as NB_SAMPLE pairs.
n_users_um = matrix_plays_um.shape[0]
selected_users = np.random.choice(np.arange(n_users_um), size = (NB_SAMPLE, 2), replace=False)

# One (first, second) tuple per sampled row.
users_tuples = list(zip(selected_users[:, 0], selected_users[:, 1]))
save_to_pickle(users_tuples, 'random_artist_jumper_tuples', '../data/user_embeddings/')

In [22]:
'''
Artist jumper: Draw up to 2*NB_SAMPLE distinct random artists and, for each drawn artist with
at least 2 users, select 2 of that artist's users, until NB_SAMPLE pairs have been collected.
'''

# np.random.randint excludes its upper bound, so the number of columns itself is passed:
# every artist index, including the last one, can then be drawn.
# The set removes artists that were drawn more than once.
selected_artists = list(set(np.random.randint(matrix_plays_um.shape[1], size=int(2*NB_SAMPLE))))

# Keep only the columns (artists) that were drawn.
matrix_plays_um_filter = matrix_plays_um[:, selected_artists]

# Create and store users tuples
users_tuples = []
for i in range(matrix_plays_um_filter.shape[1]):
    # Row indices (users) with a non-zero play count for this artist
    idx = matrix_plays_um_filter[:, i].nonzero()[0]
    # A pair can only be drawn from an artist with at least two users
    if len(idx) >= 2:
        selected_users = np.random.choice(idx, 2, replace=False)
        users_tuples.append((selected_users[0], selected_users[1]))
    if len(users_tuples) >= NB_SAMPLE:
        break

save_to_pickle(users_tuples, 'artist_jumper_tuples', '../data/user_embeddings/')