# Word2vec to embed the artists and the users

## Setup

In [2]:
import pickle
import scipy.sparse
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm
from gensim.models import Word2Vec
import hdbscan

from helper_functions import load_pickle



In [3]:
# Play-count table: the raw column names are replaced by snake_case ones right after loading.
col_names_consumption = ['user-mboxsha1', 'musicbrainz-artist-id', 'artist-name', 'plays']
column_renames = {
    'user-mboxsha1': 'user_id',
    'musicbrainz-artist-id': 'artist_id',
    'artist-name': 'artist_name',
}

df = pd.read_csv(
    '../data/usersha1-artmbid-artname-plays.tsv',
    sep='\t',
    names=col_names_consumption,
).rename(columns=column_renames)
df.head()

Unnamed: 0,user_id,artist_id,artist_name,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,3bd73256-3905-4f3a-97e2-8b341527f805,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,f2fb0ff0-5679-42ec-a55c-15109ce6e320,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,b3ae82c2-e60b-4551-a76d-6620f1b456aa,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,juliette & the licks,706


In [25]:
# User profile table. The columns keep their raw names (e.g. 'user-mboxsha1'),
# which is the key `most_similar_users` filters on further down.
col_names_user = ['user-mboxsha1', 'gender', 'age', 'country', 'signup']
df_user = pd.read_csv('../data/usersha1-profile.tsv', sep='\t', names=col_names_user)
df_user.head()

Unnamed: 0,user-mboxsha1,gender,age,country,signup
0,00000c289a1829a808ac09c00daf10bc3c4e223b,f,22.0,Germany,"Feb 1, 2007"
1,00001411dc427966b17297bf4d69e7e193135d89,f,,Canada,"Dec 4, 2007"
2,00004d2ac9316e22dc007ab2243d6fcb239e707d,,,Germany,"Sep 1, 2006"
3,000063d3fe1cf2ba248b9e3c3f0334845a27a6bf,m,19.0,Mexico,"Apr 28, 2008"
4,00007a47085b9aab8af55f52ec8846ac479ac4fe,m,28.0,United States,"Jan 27, 2006"


In [5]:
# Load the precomputed lookup tables and the sparse play-count matrix.
# NOTE(review): `load_pickle` comes from helper_functions and presumably unpickles the
# file; only load pickle files from a trusted source.
# NOTE(review): the cells below look up sampled *column positions* of this matrix in
# `id2artist` / `id2user` — confirm which axis holds users and which holds artists.
user2id = load_pickle('../data/user2id.pickle')
id2user = load_pickle('../data/id2user.pickle')
artist2id = load_pickle('../data/artist2id.pickle')
id2artist = load_pickle('../data/id2artist.pickle')

matrix_plays = scipy.sparse.load_npz('../data/matrix_plays.npz')

In [6]:
# Densify the matrix
# NOTE(review): the dense result has 45076 x 104897 cells (see the shape printed below),
# so this step is very memory hungry; it is only needed because `create_texts` indexes
# dense rows.
dense_matrix_plays = matrix_plays.todense()
dense_matrix_plays.shape

(45076, 104897)

In [7]:
window_size = 10

def create_texts(matrix, dict_=id2artist, window_size=window_size, seed=None):
    """Adapt the matrix to the format of word2vec, i.e., a list of lists of words.

    For every row of `matrix`, `window_size` column indices are drawn with replacement,
    each with probability proportional to its value in that row, and every drawn index
    is mapped to a token through `dict_`.

    Parameters
    ----------
    matrix : 2-D numpy matrix / ndarray, or a row-indexable scipy sparse matrix (e.g. CSR)
        Non-negative weights; one sentence is produced per row.
    dict_ : dict
        Maps a column index to the token put into the sentence. Indices missing from
        the mapping yield `None` (behaviour of `dict.get`).
    window_size : int
        Number of tokens sampled per row, i.e. the sentence length.
    seed : int or None
        When given, sampling uses a dedicated `RandomState(seed)` so the output is
        reproducible. When None, NumPy's global random state is used, as before.

    Returns
    -------
    list of list
        One sentence per row that has at least one non-zero entry. Rows without any
        non-zero entry are skipped (they used to make `np.random.choice` raise).
    """
    sampler = np.random if seed is None else np.random.RandomState(seed)
    texts = []

    # `shape[0]` instead of `len()` so that sparse matrices are accepted as well.
    for i in tqdm(range(matrix.shape[0])):
        row = matrix[i]
        row = row.toarray() if scipy.sparse.issparse(row) else np.asarray(row)
        row = row.ravel()  # works for np.matrix rows (1, n) and ndarray rows (n,)

        indices_non_zero = row.nonzero()[0]
        if indices_non_zero.size == 0:
            # Nothing to sample from: no sentence for this row.
            continue

        # Turn the non-zero values into a probability distribution over the columns.
        weights = row[indices_non_zero]
        weights = weights / weights.sum()

        indices = sampler.choice(indices_non_zero, size=window_size, replace=True, p=weights)

        texts.append([dict_.get(id_) for id_ in indices])

    return texts

## artist2vec

In [89]:
# Transpose so that `create_texts` iterates over the other axis: each row of `matrix`
# becomes one sentence and the sampled column positions are looked up in `id2artist`.
# NOTE(review): the name `matrix` is reassigned in the user2vec section.
matrix = dense_matrix_plays.T
matrix.shape

(104897, 45076)

In [90]:
# Build one sentence per row of `matrix`; the tokens are random draws, so the result
# changes between runs unless NumPy's random state is seeded beforehand.
texts_artists = create_texts(matrix, id2artist)

100%|██████████| 104897/104897 [03:49<00:00, 456.28it/s]


In [91]:
# Create the Word2Vec model
# NOTE(review): the global name `model` is reused for the user2vec model further down,
# so every cell that reads `model` depends on which training cell was executed last.
# NOTE(review): the model is saved under a relative "models/" directory — presumably
# that directory has to exist already.
model = Word2Vec(sentences=texts_artists, window=window_size, workers=8)
model.save("models/artist2vec.model")

In [92]:
def k_most_similar(artist_name, k=10, w2v_model=None, lookup=None):
    """Print the k most similar artists based on the word2vec model.

    One line is printed per neighbour: the list of all names that `lookup` associates
    with the neighbour's artist id.

    Parameters
    ----------
    artist_name : str
        Name to search for in the "artist_name" column of `lookup`.
    k : int
        Number of neighbours to print.
    w2v_model : trained Word2Vec model or None
        Model whose `wv` vectors are queried. Defaults to the global `model`; pass the
        artist model explicitly because `model` is rebound in the user2vec section.
    lookup : pandas.DataFrame or None
        Table with "artist_id" and "artist_name" columns. Defaults to the global
        `artistid2artistname`.
        NOTE(review): that global is not defined in the visible part of this notebook —
        confirm where it is built.
    """
    if w2v_model is None:
        w2v_model = model
    if lookup is None:
        lookup = artistid2artistname

    # Plain list instead of a pandas Series: all ids carrying this name are used together.
    artist_ids = lookup.loc[lookup["artist_name"] == artist_name, "artist_id"].tolist()

    if len(artist_ids) == 0:
        print("No artist with the name {} found...".format(artist_name))
        return

    # Ids that did not make it into the model vocabulary would raise a KeyError below.
    known_ids = [candidate for candidate in artist_ids if candidate in w2v_model.wv]
    if not known_ids:
        print("Artist {} is not in the model vocabulary...".format(artist_name))
        return

    for similar_id, _similarity in w2v_model.wv.most_similar(positive=known_ids, topn=k):
        print(list(lookup.loc[lookup["artist_id"] == similar_id, "artist_name"]))

In [93]:
# Qualitative check: print the nearest neighbours of one artist in the embedding space.
k_most_similar('keith jarrett')

['bill evans', 'bill evans.']
['duke ellington']
['chick corea']
['dave brubeck quartet', 'davve brubeck quartet', 'the 1987 dave brubeck quartet', 'the dave brubeck quartet']
['oascar peterson', 'oscar peterson']
['basie count', 'cont basie', 'count basie']
['stan getz']
['sonny rollins']
['montgomery wes', 'wes montgomery']
['john scofield']


In [94]:
# Qualitative check with a second artist.
k_most_similar('jeff beck')

['alvin lee & ten years after', 'ten years after']
['free']
['rory gallagher']
['buddy guy']
['derek & the dominos', 'derek and the dominos']
['cream']
['johnny winter']
['traffic']
['sá, rodrix & guarabyra', 'sá, rodrix e guarabyra']
["gov't mule", 'govt mule', 'gov´t mule']


In [95]:
# Qualitative check with a third artist.
k_most_similar('eminem')

['50 cent']
['snoog dogg', 'snoop', 'snoop dogg', 'snoop doggy dog', 'snoop doggy dogg', 'snoop doggy dogg featuring jd']
['2 pac', '2pac', 'tupac', 'tupak shakur']
['will smith']
['petey pablo']
['peter gelderblom']
['ababy bash', 'baby bash']
['run dmc', 'run-d.m.c.', 'run-dmc']
['fort minor']
['bruce faulconer']


In [96]:
# Stack the embedding of every vocabulary entry into one array, and keep the matching
# vocabulary keys (same order) in a second array.
artist_vocab_keys = list(model.wv.vocab)

artists_vectors = np.array([model.wv[vocab_key] for vocab_key in artist_vocab_keys])
artists_vectors_id = np.array(artist_vocab_keys)

In [97]:
# Standardise every embedding dimension: subtract its mean, divide by its standard deviation
feature_means = artists_vectors.mean(axis=0)
feature_stds = artists_vectors.std(axis=0)
artists_vectors_norm = (artists_vectors - feature_means) / feature_stds

In [98]:
# Cluster the artists
# HDBSCAN is fitted on the standardised vectors; `labels_` holds one label per vector.
# NOTE(review): `clusterer`, `hdbscan_labels` and `hdbscan_centers` are overwritten by
# the user clustering cell further down.
clusterer = hdbscan.HDBSCAN(min_cluster_size=10, min_samples=15)
clusterer.fit(artists_vectors_norm)
hdbscan_labels = clusterer.labels_
# NOTE(review): despite the variable name, `exemplars_` is presumably a set of
# representative points per cluster rather than centroids — confirm in the hdbscan docs.
hdbscan_centers = clusterer.exemplars_

In [62]:
# Number of artists per cluster label (-1 is HDBSCAN's noise label).
pd.Series(hdbscan_labels).value_counts()

-1    19346
dtype: int64

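The clusters obtained are not convincing: in the output above, HDBSCAN assigns all 19,346 artist vectors to the noise label (-1), i.e. it finds no cluster at all.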

## user2vec

In [8]:
# Use the dense matrix in its original orientation: `create_texts` will iterate over its
# rows and look the sampled column positions up in `id2user`.
# No `.copy()` here: `create_texts` only reads from the matrix, and duplicating a
# 45076 x 104897 dense matrix would double the memory footprint for nothing.
matrix = dense_matrix_plays
matrix.shape

(45076, 104897)

In [9]:
# Build one sentence per row of `matrix`, with tokens taken from `id2user`.
# `window_size` is left at its default, so every sentence has 10 tokens.
texts_users = create_texts(matrix, id2user)

100%|██████████| 45076/45076 [00:23<00:00, 1913.04it/s]


In [10]:
# Create the Word2Vec model
# NOTE(review): this rebinds the global `model` that the artist2vec cells also use.
# NOTE(review): `window=30` is larger than the 10-token sentences produced by
# `create_texts`, so the whole sentence is always the context — confirm this is intended.
model = Word2Vec(sentences=texts_users, window=30, workers=8)
model.save("models/user2vec.model")

In [None]:
def most_similar_users(user_id, k=10, w2v_model=None, profiles=None):
    """Return a dataframe containing the demographic information of the users most similar to user_id.

    Parameters
    ----------
    user_id : str
        Key of the query user in the word2vec vocabulary.
    k : int
        Number of nearest neighbours to retrieve.
    w2v_model : trained Word2Vec model or None
        Model whose `wv` vectors are queried. Defaults to the global `model`; pass the
        user model explicitly because `model` is also used for the artist embedding.
    profiles : pandas.DataFrame or None
        Profile table with a 'user-mboxsha1' key column and 'gender', 'age', 'country'
        columns. Defaults to the global `df_user`.

    Returns
    -------
    pandas.DataFrame
        The 'gender', 'age' and 'country' columns of the neighbours found in `profiles`,
        in the row order of `profiles` (not ranked by similarity). Empty when the model
        returns no neighbour.

    Raises
    ------
    KeyError
        If `user_id` is not part of the model vocabulary.
    """
    if w2v_model is None:
        w2v_model = model
    if profiles is None:
        profiles = df_user

    if user_id not in w2v_model.wv:
        raise KeyError("user {!r} is not in the model vocabulary".format(user_id))

    neighbours = w2v_model.wv.most_similar(positive=[user_id], topn=k)
    # Keep only the keys; the similarity scores are not used.
    neighbour_ids = [neighbour_id for neighbour_id, _similarity in neighbours]

    is_neighbour = profiles['user-mboxsha1'].isin(neighbour_ids)
    return profiles.loc[is_neighbour, ['gender', 'age', 'country']]

In [69]:
# Inspect the demographics of the 100 nearest neighbours of one user.
# The query user is simply the fifth key of `user2id`.
user_id = list(user2id.keys())[4]
# Profile of the query user, for comparison with the distributions below.
display(df_user[df_user['user-mboxsha1'] == user_id])
df_similar_users = most_similar_users(user_id, k=100)
# Distribution of gender, age and country among the neighbours
# (`value_counts` leaves out missing values by default).
display(df_similar_users.gender.value_counts())
print("")
display(df_similar_users.age.value_counts().sort_index())
print("")
display(df_similar_users.country.value_counts().sort_index())

Unnamed: 0,user-mboxsha1,gender,age,country,signup
11,0001a57568309b287363e72dc682e9a170ba6dc2,,23.0,United States,"May 12, 2007"


m    68
f    25
Name: gender, dtype: int64




12.0     1
15.0     1
17.0     2
18.0     4
19.0     5
20.0     5
21.0    13
22.0     6
23.0     4
24.0     5
25.0     8
26.0     6
27.0     3
28.0     1
29.0     3
30.0     2
31.0     2
32.0     2
33.0     1
35.0     1
36.0     1
39.0     2
41.0     1
45.0     2
47.0     1
Name: age, dtype: int64




Australia                1
Belarus                  1
Belgium                  2
Brazil                   4
Bulgaria                 1
Canada                   1
Chile                    1
Christmas Island         1
Croatia                  2
Czech Republic           4
Estonia                  1
Finland                  5
France                   2
Germany                 10
Guatemala                1
Ireland                  3
Japan                    2
Mexico                   1
Myanmar                  1
Netherlands              3
Netherlands Antilles     1
New Zealand              1
Norway                   3
Paraguay                 1
Poland                   3
Portugal                 1
Russian Federation       8
Serbia                   2
Spain                    4
Sweden                   2
Switzerland              2
Turkey                   3
United Kingdom           7
United States           15
Name: country, dtype: int64

In [38]:
# Stack the embedding of every vocabulary entry into one array, and keep the matching
# vocabulary keys (same order) in a second array.
user_vocab_keys = list(model.wv.vocab)

users_vectors = np.array([model.wv[vocab_key] for vocab_key in user_vocab_keys])
user_vectors_id = np.array(user_vocab_keys)

In [39]:
# Cluster the users
# NOTE(review): unlike the artist section, the vectors are passed to HDBSCAN without
# being standardised first, and `min_samples` is 10 instead of 15 — confirm this is intended.
clusterer = hdbscan.HDBSCAN(min_cluster_size=10, min_samples=10)
clusterer.fit(users_vectors)
hdbscan_labels = clusterer.labels_
# NOTE(review): despite the variable name, `exemplars_` is presumably a set of
# representative points per cluster rather than centroids — confirm in the hdbscan docs.
hdbscan_centers = clusterer.exemplars_

In [40]:
# Number of users per cluster label (-1 is HDBSCAN's noise label).
pd.Series(hdbscan_labels).value_counts()

-1    36471
 0      200
 1       11
dtype: int64

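The clusters obtained are not convincing here either: 36,471 of the 36,682 user vectors are labelled as noise (-1), and only two small clusters (200 and 11 users) are found.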