### Exploring GCN's for node prediction in a music playlist.  Predicting if an artist is part of playlist.  

Given 10 random artists from the 'rock classics' playlist and individual graphs of size 400, can node classification methods outperform PageRank with respect to predicting/labeling artists in the graph that are also in the original playlist.  Given the same parameters, ranking artists by PageRank will correctly label anywhere from ~35% to ~40% of artists in this playlist (see 'Exploring Weighted Edges'.)


In [1]:
import music_graph as mg    # contains all necessary functions, data and dependencies

from music_graph import artist_dictionary
from music_graph import tags_dictionary
from music_graph import filtered_tags_dictionary
from music_graph import ratings
from music_graph import build_net
from music_graph import layer_graphs
from music_graph import new_centrality
from music_graph import spotify_rankings
from music_graph import add_tag_edges

from datetime import datetime
import networkx as nx
import configparser
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn import model_selection

import tensorflow as tf

from stellargraph import StellarGraph as sg
from tensorflow import keras
from stellargraph.layer import RGCN
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model
from stellargraph.mapper import RelationalFullBatchNodeGenerator
# Spotify client id & secret for API access

config = configparser.ConfigParser()
config.read('spotify.ini')

client_id = config['spotify']['client_id']
client_secret = config['spotify']['client_secret']

In [2]:
playlists = {
'4C9mWYjVobPsfFXesGxYNf':'all classic hits', ### $$$
#'5TF1ki4BzMFfotw57juFTY':'coffee shop mix',
'3pM6OEFeTo4L1yd4eRltjL':'2004 hits', ### $$$
'73gFKr9Moh1T3jXM8TVSdC':'80s summer mix',
'17xTDziQjqAZANvg6C6FlX':'songs from your old ipod',
'6j3URIMhpf62LXpDcbi98b':'rap playlists',
#'1DcX9eK1MaPlwwBIsEfnBX':'country playlist to end all playlists',
#'5VhEpH7stoZiX4v8gIb77P':'playlists',
'37i9dQZF1DWXRqgorJj26U':'rock classics', ### $$$
'37i9dQZF1DWWJOmJ7nRx0C':'rock hard',  ### $$$
'37i9dQZF1DWWRktbhJiuqL':'pure rock n roll', ### $$$
'37i9dQZF1DX5W4wuxak2hE':'power pop', ###
'7Jfcy1H82lsTIzhpL4MZXu':'punk playlist', ### $$$
'07jaITMGGL6QYmT8yZVxQt':'metal playlist'
}  

"""
Request tracks from a playlist in Spotify
Then pull artist out of the track container.
"""
url = 'https://open.spotify.com/playlist'
sp = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials(client_id, client_secret))

playlistid = '37i9dQZF1DWXRqgorJj26U'
playlist_name = playlists[playlistid]

# playlists with more than 100 artists will require two requests
playlist1 = sp.playlist_tracks(playlistid, offset = 0, limit = 100) # get the first 100 tracks
playlist2 = sp.playlist_tracks(playlistid, offset = 100, limit = 100) # get the next 100 tracks

artists = []

for index, item in enumerate(playlist1['items']):
    artists.append(item['track']['album']['artists'][0]['name'])
    
for index, item in enumerate(playlist2['items']):
    artists.append(item['track']['album']['artists'][0]['name'])
    
artists = [i.lower() for i in artists]

num_artists = len(set(artists))      # number of unique artists in the playlist

print(len(set(artists)), ' unique artists')
print(len((artists)), ' total artists') 

81  unique artists
150  total artists


In [3]:
rand_artist_count = 0
rand_artists = []
while rand_artist_count < 10:                  # number of random artists
    rand = np.random.randint(0,num_artists,1)
    rand = rand[0]
    if artists[rand] not in rand_artists: #and artists[rand] not in ['mumford & sons','bløf','lil jon & the east side boyz']:
        rand_artists.append(artists[rand].lower())
        rand_artist_count +=1

print("multigraph iteration: ")
print(rand_artists)
print(datetime.now())
multigraph = layer_graphs(rand_artists, add_tags = True, size = 400) # node count of each graph

print(list(multigraph.edges.data())[1])  # print an edge w/ data to make sure edge weight is correct.


multigraph iteration: 
['the who', 'bruce springsteen', 'john fogerty', "guns n' roses", 'queen', 'the rolling stones', 'the hollies', 'heart', 'van halen', 'ac/dc']
2020-06-17 19:09:53.144088
guns n' roses  not in artist_dictionary
('the who', 'the who', {'kind': 'Artist', 'link': 'kenney jones', 'weight': 0.14263571429482658})


In [4]:
print(len(multigraph.nodes()))
print(len(multigraph.edges.data()))

2462
857426


In [5]:
# Stellargraph's implementation of GCN wants to see attributes for each node in the graph.
# several graph metrics will be calculated and used as node features.

scores = new_centrality(multigraph)

In [6]:
scores_ = scores  # maintain oroginal scores DF

scores_.head()

Unnamed: 0,artist,deg_cent,load_cent,page_rank,ev_cent,close_cent,cf_close_cent,harm_cent,mean_cent
0,the who,2.54165,0.004364,0.003631,0.079079,0.549576,0.005668,1522.5,217.954852
1,spooky tooth,0.44616,0.006228,0.000983,0.012822,0.494375,0.005605,1348.833333,192.828501
2,pete townshend,1.964242,0.00074,0.002103,0.062641,0.543507,0.005664,1508.166667,215.820795
3,ben townshend,0.002438,0.0,6.2e-05,7.3e-05,0.354918,0.001682,913.666667,130.57512
4,the casbah club,0.016254,1.7e-05,6.3e-05,0.00025,0.363569,0.003991,940.416667,134.400116


In [7]:
scores_ = scores_.set_index('artist')

In [8]:
# convert to a dictionary of dictionaries to be assigned as node attributes by networkx.
#node_scores = scores_.to_dict('index')
#node_scores['the sonics']

# convert DF to a dictionary of key:list(values)

node_scores = {}
for i in scores_.index:
    node_scores[i] = list(scores_.loc[i])
    
node_scores['led zeppelin']

[2.9049167005282404,
 0.0017330945883184024,
 0.0033224858640377745,
 0.08828818304070167,
 0.5564096766900294,
 0.005669418048112229,
 1552.9999999999834,
 222.36576279410613]

In [9]:
multigraph_ = multigraph.copy()
multigraph_
print(len(multigraph_.nodes()))
print(len(multigraph_.edges.data()))

2462
857426


In [10]:
# assign scores tyo each 
nx.set_node_attributes(multigraph_, node_scores, 'features')


# check that they match
print(multigraph_.nodes['the firm'])

print(scores_.loc['the firm'])

{'features': [1.4782608695652173, 0.01491769335571129, 0.0020513295715999637, 0.043075514433539526, 0.5537803780378038, 0.00565877849555681, 1544.499999999983, 220.94253493763463]}
deg_cent            1.478261
load_cent           0.014918
page_rank           0.002051
ev_cent             0.043076
close_cent          0.553780
cf_close_cent       0.005659
harm_cent        1544.500000
mean_cent         220.942535
Name: the firm, dtype: float64


In [11]:
multigraph_.nodes.data()



In [12]:
"""stellargraph from networkx graph
The networkx graph is a heterogeneous (has both artist and user-tag type edges) and undirected multigraph.
Node attributes consist of various graph metrics, although the user-tag edges added to the graph via the layer_graph 
function probably could/should be node attributes instead of edges."""

graph = sg.from_networkx(multigraph_,
                         node_features = 'features',
                         edge_type_attr = 'kind')

print(graph.info())


StellarGraph: Undirected multigraph
 Nodes: 2462, Edges: 857426

 Node types:
  default: [2462]
    Features: float32 vector, length 8
    Edge types: default-Artist->default, default-User-Tag->default

 Edge types:
    default-User-Tag->default: [801297]
        Weights: all 0.05
        Features: none
    default-Artist->default: [56129]
        Weights: range=[6.3786e-07, 0.731488], mean=0.0145673, std=0.0416824
        Features: none


In [13]:
# identity matrix for multigraph

id_mtx = np.identity(n = len(multigraph_.nodes()))

id_mtx

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [46]:
# labels for node classification.  will have just 10 labeled nodes and |V | - 10 unlabeled nodes.
labels = pd.DataFrame()
labels['artist'] = list(multigraph.nodes())
labels['in_playlist'] = np.nan

for i in np.arange(0,len(labels)):
    if labels['artist'][i] in  artists:
        labels['in_playlist'][i] = 1
    else:
        labels['in_playlist'][i] = 0

In [47]:
# check that randomly selected artists are correctly labeled
print(labels[labels['in_playlist'] == 1])

labels = labels.set_index('artist') 
labels.head(n = 20)

                            artist  in_playlist
0                          the who          1.0
16                            free          1.0
34                       the clash          1.0
83                     the beatles          1.0
200             the rolling stones          1.0
286                             u2          1.0
368                         eagles          1.0
389                        journey          1.0
406                    bad company          1.0
413                          cream          1.0
435                  the guess who          1.0
453                     supertramp          1.0
467                     pink floyd          1.0
488                          heart          1.0
498                the moody blues          1.0
502                          queen          1.0
577                     whitesnake          1.0
618                  black sabbath          1.0
635                  fleetwood mac          1.0
683            derek & the dominos      

Unnamed: 0_level_0,in_playlist
artist,Unnamed: 1_level_1
the who,1.0
spooky tooth,0.0
pete townshend,0.0
ben townshend,0.0
the casbah club,0.0
the high numbers,0.0
plastic ono band,0.0
the lightning seeds,0.0
a.s.a.p.,0.0
oasis,0.0


In [48]:
train_targets, test_targets = model_selection.train_test_split(
    labels, train_size=0.8, test_size=None
)

print(type(train_targets))
print(type(test_targets))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [49]:
tf.compat.v1.disable_eager_execution() # needed to fix conversion error, but why?



generator = RelationalFullBatchNodeGenerator(graph, sparse=True)

train_gen = generator.flow(train_targets.index, targets=train_targets)
test_gen = generator.flow(test_targets.index, targets=test_targets)

print(type(train_gen))
print(type(test_gen))

<class 'stellargraph.mapper.sequences.RelationalFullBatchNodeSequence'>
<class 'stellargraph.mapper.sequences.RelationalFullBatchNodeSequence'>


In [50]:
rgcn = RGCN(
    layer_sizes=[32, 32],
    activations=["relu", "relu"],
    generator=generator,
    bias=True,
    num_bases=20,
    dropout=0.5,
)

In [51]:
x_in, x_out = rgcn.in_out_tensors()
#x_in = np.asarray(x_in)
predictions = Dense(2, activation="softmax")(x_out)
model = Model(inputs=x_in, outputs=predictions)
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=keras.optimizers.Adam(0.01),
    metrics=["acc"],
)


print(type(x_in))
print(type(x_out))

<class 'list'>
<class 'tensorflow.python.framework.ops.Tensor'>


In [52]:
# 
history = model.fit(train_gen, validation_data=test_gen, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [53]:
test_metrics = model.evaluate(test_gen)
print("\nTest Set Metrics:")
for name, val in zip(model.metrics_names, test_metrics):
    print("\t{}: {:0.4f}".format(name, val))


Test Set Metrics:
	loss: 0.3269
	acc: 0.9797
