In [16]:
import pickle
import scipy.sparse

import numpy as np
import pandas as pd

from helper_functions import *
from scipy.sparse import dok_matrix

In [2]:
col_names_consumption = ['user-mboxsha1', 'musicbrainz-artist-id', 'artist-name', 'plays']

df = pd.read_csv('../data/usersha1-artmbid-artname-plays.tsv', sep='\t', names=col_names_consumption)
df = df.rename({'user-mboxsha1':'user_id', 'musicbrainz-artist-id':'artist_id'}, axis=1)
df = df[['user_id', 'artist_id', 'plays']]
df.head()

Unnamed: 0,user_id,artist_id,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,3bd73256-3905-4f3a-97e2-8b341527f805,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,f2fb0ff0-5679-42ec-a55c-15109ce6e320,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,b3ae82c2-e60b-4551-a76d-6620f1b456aa,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,706


In [None]:
# Useful to compute tf-idf weights: compute the number of artists for each user
nb_artists_per_user = {}
for user in df['user_id'].unique():
    nb_artists_per_user[user] = df[df['user_id'] == user]['plays'].sum()
# Store data
with open('nb_artists_per_user.pkl', 'wb') as handle:
    pickle.dump(nb_artists_per_user, handle)
    
# Free memory
nb_artists_per_user = 0

# Data Processing

## Filter the data

In [4]:
df.isna().sum()

user_id           0
artist_id    226137
plays             0
dtype: int64

In [5]:
df.shape

(17535655, 3)

In [6]:
# Since there are 1.29% of NaN for the artists, we remove all the rows from df that contains a NaN
df = df.dropna()
df.shape

(17309518, 3)

In [13]:
T1 = 1000  # Number of plays per user threshold
T2 = 50    # Number of artists per user threshold

T3 = 100 # Number of plays per artist threshold
T4 = 10  # Number of users per artist threshold

In [14]:
plays_per_user = df[['user_id', 'plays']].groupby('user_id').sum()
artists_per_user = df[['user_id', 'plays']].groupby('user_id').count().rename({'plays':'n_artists'}, axis=1)

plays_per_artist = df[['artist_id', 'plays']].groupby('artist_id').sum()
users_per_artist = df[['artist_id', 'plays']].groupby('artist_id').count().rename({'plays':'n_users'}, axis=1)

In [15]:
# Store the list of intesting users in a set for performance (membership of elem and intersection)

users_filtered = set(plays_per_user[plays_per_user['plays'] > T1].index)
print('Number of users after filtering with T1: {}'.format(len(users_filtered)))
users_filtered = users_filtered & set(artists_per_user[artists_per_user['n_artists'] > T2].index)
print('Number of users after filtering with T1 and T2: {}'.format(len(users_filtered)))

artists_filtered = set(plays_per_artist[plays_per_artist['plays'] > T3].index)
print('Number of artists after filtering with T3: {}'.format(len(artists_filtered)))
artists_filtered = artists_filtered & set(users_per_artist[users_per_artist['n_users'] > T4].index)
print('Number of artists after filtering with T3 and T4: {}'.format(len(artists_filtered)))

Number of users after filtering with T1: 315799
Number of users after filtering with T1 and T2: 89008
Number of artists after filtering with T3: 123227
Number of artists after filtering with T3 and T4: 60938


## Storing data in sparse matrix

- One user per column
- One artist per row

In [17]:
# Dictionnary for retrieving the index of a user in the matrix in O(1)

user2id = {}
id2user = {}
for i, user in enumerate(sorted(users_filtered)):
    user2id[user] = i
    id2user[i] = user

artist2id = {}
id2artist = {}
for i, artist in enumerate(sorted(artists_filtered)):
    artist2id[artist] = i
    id2artist[i] = artist

In [18]:
# Create the sparse matrix
S = dok_matrix((len(artists_filtered), len(users_filtered)), dtype=np.uint8)

In [19]:
# Fill the data dictionnary
data = {}


for i, (index, row) in enumerate(df.iterrows()):
    
    if i % 1000000 == 0:
        print('Processed {} lines...'.format(i))
    
    user = row['user_id']
    artist = row['artist_id']
    
    # Keep only interesting users and artists
    if (user in users_filtered) and (artist in artists_filtered):
        
        data[(artist2id[artist], user2id[user])] = row['plays']

Processed 0 lines...
Processed 1000000 lines...
Processed 2000000 lines...
Processed 3000000 lines...
Processed 4000000 lines...
Processed 5000000 lines...
Processed 6000000 lines...
Processed 7000000 lines...
Processed 8000000 lines...
Processed 9000000 lines...
Processed 10000000 lines...
Processed 11000000 lines...
Processed 12000000 lines...
Processed 13000000 lines...
Processed 14000000 lines...
Processed 15000000 lines...
Processed 16000000 lines...
Processed 17000000 lines...


In [20]:
# store it in the sparse matrix
dict.update(S, data)

# free the memory
data = {}
del data

In [21]:
S = S.tocsr()
scipy.sparse.save_npz('../data/matrix_plays.npz', S)

In [22]:
save_to_pickle(user2id, 'user2id', '../data/')
save_to_pickle(id2user, 'id2user', '../data/')

save_to_pickle(artist2id, 'artist2id', '../data/')
save_to_pickle(id2artist, 'id2artist', '../data/')

In [23]:
# Load data
user2id = load_pickle('../data/user2id.pickle')
id2user = load_pickle('../data/id2user.pickle')
artist2id = load_pickle('../data/artist2id.pickle')
id2artist = load_pickle('../data/id2artist.pickle')