### Exploring GCN's for node prediction in a music playlist.  Predicting if an artist is part of playlist.  

Given 10 random artists from the 'rock classics' playlist and individual graphs of size 400, can node classification methods outperform PageRank with respect to predicting/labeling artists in the graph that are also in the original playlist.  Given the same parameters, ranking artists by PageRank will correctly label anywhere from ~35% to ~40% of artists in this playlist (see 'Exploring Weighted Edges'.)


In [1]:
import music_graph as mg    # contains all necessary functions, data and dependencies

from music_graph import artist_dictionary
from music_graph import tags_dictionary
from music_graph import filtered_tags_dictionary
from music_graph import ratings
from music_graph import build_net
from music_graph import layer_graphs
from music_graph import new_centrality
from music_graph import spotify_rankings
from music_graph import add_tag_edges

from datetime import datetime
import networkx as nx
import configparser
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn import model_selection

import tensorflow as tf

from stellargraph import StellarGraph as sg
from tensorflow import keras
from stellargraph.layer import RGCN
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model
from stellargraph.mapper import RelationalFullBatchNodeGenerator
# Spotify client id & secret for API access

config = configparser.ConfigParser()
config.read('spotify.ini')

client_id = config['spotify']['client_id']
client_secret = config['spotify']['client_secret']

In [2]:
playlists = {
'4C9mWYjVobPsfFXesGxYNf':'all classic hits', ### $$$
#'5TF1ki4BzMFfotw57juFTY':'coffee shop mix',
'3pM6OEFeTo4L1yd4eRltjL':'2004 hits', ### $$$
'73gFKr9Moh1T3jXM8TVSdC':'80s summer mix',
'17xTDziQjqAZANvg6C6FlX':'songs from your old ipod',
'6j3URIMhpf62LXpDcbi98b':'rap playlists',
#'1DcX9eK1MaPlwwBIsEfnBX':'country playlist to end all playlists',
#'5VhEpH7stoZiX4v8gIb77P':'playlists',
'37i9dQZF1DWXRqgorJj26U':'rock classics', ### $$$
'37i9dQZF1DWWJOmJ7nRx0C':'rock hard',  ### $$$
'37i9dQZF1DWWRktbhJiuqL':'pure rock n roll', ### $$$
'37i9dQZF1DX5W4wuxak2hE':'power pop', ###
'7Jfcy1H82lsTIzhpL4MZXu':'punk playlist', ### $$$
'07jaITMGGL6QYmT8yZVxQt':'metal playlist'
}  

"""
Request tracks from a playlist in Spotify
Then pull artist out of the track container.
"""
url = 'https://open.spotify.com/playlist'
sp = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials(client_id, client_secret))

playlistid = '37i9dQZF1DWXRqgorJj26U'
playlist_name = playlists[playlistid]

# playlists with more than 100 artists will require two requests
playlist1 = sp.playlist_tracks(playlistid, offset = 0, limit = 100) # get the first 100 tracks
playlist2 = sp.playlist_tracks(playlistid, offset = 100, limit = 100) # get the next 100 tracks

artists = []

for index, item in enumerate(playlist1['items']):
    artists.append(item['track']['album']['artists'][0]['name'])
    
for index, item in enumerate(playlist2['items']):
    artists.append(item['track']['album']['artists'][0]['name'])
    
artists = [i.lower() for i in artists]

num_artists = len(set(artists))      # number of unique artists in the playlist

print(len(set(artists)), ' unique artists')
print(len((artists)), ' total artists') 

81  unique artists
150  total artists


In [3]:
rand_artist_count = 0
rand_artists = []
while rand_artist_count < 10:                  # number of random artists
    rand = np.random.randint(0,num_artists,1)
    rand = rand[0]
    if artists[rand] not in rand_artists: #and artists[rand] not in ['mumford & sons','bløf','lil jon & the east side boyz']:
        rand_artists.append(artists[rand].lower())
        rand_artist_count +=1

print("multigraph iteration: ")
print(rand_artists)
print(datetime.now())
multigraph = layer_graphs(rand_artists, add_tags = True, size = 100) # node count of each graph

print(list(multigraph.edges.data())[1])  # print an edge w/ data to make sure edge weight is correct.


multigraph iteration: 
['ted nugent', 'led zeppelin', 'mötley crüe', 'stevie nicks', 'journey', 'ac/dc', 'david bowie', 'the rolling stones', 'talking heads', "guns n' roses"]
2020-06-21 15:48:36.258618
guns n' roses  not in artist_dictionary
('ted nugent', 'the amboy dukes', {'kind': 'Artist', 'link': 'rob grange', 'weight': 0.008995740370545664})


In [4]:
print(len(multigraph.nodes()))
print(len(multigraph.edges.data()))

743
135712


In [5]:
# Stellargraph's implementation of GCN wants to see attributes for each node in the graph.
# several graph metrics will be calculated and used as node features.

scores = new_centrality(multigraph)

In [6]:
scores_ = scores  # maintain oroginal scores DF

scores_.head()

Unnamed: 0,artist,deg_cent,load_cent,page_rank,ev_cent,close_cent,cf_close_cent,harm_cent,mean_cent
0,ted nugent,2.696765,0.015508,0.006697,0.094492,0.595028,0.018222,501.166667,72.084768
1,the amboy dukes,1.402965,0.006179,0.003382,0.052868,0.531519,0.018109,441.333333,63.335479
2,thin lizzy,2.525606,0.026881,0.007091,0.091698,0.593125,0.018215,499.833333,71.87085
3,black star riders,0.386792,0.002912,0.000925,0.01329,0.5218,0.017511,430.333333,61.610938
4,ozzy osbourne,3.458221,0.01459,0.010884,0.124839,0.599838,0.018249,504.5,72.675232


In [7]:
scores_ = scores_.set_index('artist')

In [8]:
# convert to a dictionary of dictionaries to be assigned as node attributes by networkx.
#node_scores = scores_.to_dict('index')
#node_scores['the sonics']

# convert DF to a dictionary of key:list(values)

node_scores = {}
for i in scores_.index:
    node_scores[i] = list(scores_.loc[i])
    
node_scores['led zeppelin']

[3.743935309973046,
 0.018499085685456634,
 0.01056474351444009,
 0.13822143683731125,
 0.6037428803905615,
 0.018257334637989436,
 506.83333333333286,
 73.05236487491024]

In [9]:
multigraph_ = multigraph.copy()
multigraph_
print(len(multigraph_.nodes()))
print(len(multigraph_.edges.data()))

2456
862193


In [10]:
# assign scores tyo each 
nx.set_node_attributes(multigraph_, node_scores, 'features')


# check that they match
print(multigraph_.nodes['the firm'])

print(scores_.loc['the firm'])

{'features': [1.4024439918533604, 0.014546490077776643, 0.002011960796505514, 0.042233217176769526, 0.5615279048490394, 0.0064215797252844785, 1556.166666666653, 222.59940740159027]}
deg_cent            1.402444
load_cent           0.014546
page_rank           0.002012
ev_cent             0.042233
close_cent          0.561528
cf_close_cent       0.006422
harm_cent        1556.166667
mean_cent         222.599407
Name: the firm, dtype: float64


In [None]:
multigraph_.nodes.data()

In [11]:
"""stellargraph from networkx graph
The networkx graph is a heterogeneous (has both artist and user-tag type edges) and undirected multigraph.
Node attributes consist of various graph metrics, although the user-tag edges added to the graph via the layer_graph 
function probably could/should be node attributes instead of edges."""

graph = sg.from_networkx(multigraph_,
                         node_features = 'features',
                         edge_type_attr = 'kind')

print(graph.info())


StellarGraph: Undirected multigraph
 Nodes: 2456, Edges: 862193

 Node types:
  default: [2456]
    Features: float32 vector, length 8
    Edge types: default-Artist->default, default-User-Tag->default

 Edge types:
    default-User-Tag->default: [798562]
        Weights: all 0.05
        Features: none
    default-Artist->default: [63631]
        Weights: range=[6.3786e-07, 0.731488], mean=0.015429, std=0.0420871
        Features: none


In [12]:
# identity matrix for multigraph

id_mtx = np.identity(n = len(multigraph_.nodes()))

id_mtx

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [13]:
# labels for node classification.  will have just 10 labeled nodes and |V | - 10 unlabeled nodes.
labels = pd.DataFrame()
labels['artist'] = list(multigraph.nodes())
labels['in_playlist'] = np.nan

for i in np.arange(0,len(labels)):
    if labels['artist'][i] in  artists:
        labels['in_playlist'][i] = 1
    else:
        labels['in_playlist'][i] = 0

In [14]:
# check that randomly selected artists are correctly labeled
print(labels[labels['in_playlist'] == 1])

labels = labels.set_index('artist') 
labels.head(n = 20)

                            artist  in_playlist
0                           kansas          1.0
2                      deep purple          1.0
46                      whitesnake          1.0
64                   black sabbath          1.0
124                  ozzy osbourne          1.0
246                     quiet riot          1.0
268               blue öyster cult          1.0
284                     ted nugent          1.0
301                        journey          1.0
355                      van halen          1.0
472             the rolling stones          1.0
486                     pink floyd          1.0
489                          queen          1.0
523                    bad company          1.0
524                           free          1.0
531                           rush          1.0
584                         eagles          1.0
585                  the guess who          1.0
600                  fleetwood mac          1.0
624                          ac/dc      

Unnamed: 0_level_0,in_playlist
artist,Unnamed: 1_level_1
kansas,1.0
flying colors,0.0
deep purple,1.0
steve morse band,0.0
dixie dregs,0.0
living loud,0.0
angelfire,0.0
shooting star,0.0
sun ra arkestra,0.0
loft,0.0


In [15]:
train_targets, test_targets = model_selection.train_test_split(
    labels, train_size=0.8, test_size=None
)

print(type(train_targets))
print(type(test_targets))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [16]:
tf.compat.v1.disable_eager_execution() # needed to fix conversion error, but why?



generator = RelationalFullBatchNodeGenerator(graph, sparse=True)

train_gen = generator.flow(train_targets.index, targets=train_targets)
test_gen = generator.flow(test_targets.index, targets=test_targets)

print(type(train_gen))
print(type(test_gen))

<class 'stellargraph.mapper.sequences.RelationalFullBatchNodeSequence'>
<class 'stellargraph.mapper.sequences.RelationalFullBatchNodeSequence'>


In [17]:
rgcn = RGCN(
    layer_sizes=[32, 32],
    activations=["relu", "relu"],
    generator=generator,
    bias=True,
    num_bases=20,
    dropout=0.5,
)

In [19]:
x_in, x_out = rgcn.in_out_tensors()
#x_in = np.asarray(x_in)
predictions = Dense(2, activation="relu")(x_out)
model = Model(inputs=x_in, outputs=predictions)
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=keras.optimizers.Adam(0.01),
    metrics=["acc"],
)


print(type(x_in))
print(type(x_out))

<class 'list'>
<class 'tensorflow.python.framework.ops.Tensor'>


In [20]:
# 
history = model.fit(train_gen, validation_data=test_gen, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [21]:
test_metrics = model.evaluate(test_gen)
print("\nTest Set Metrics:")
for name, val in zip(model.metrics_names, test_metrics):
    print("\t{}: {:0.4f}".format(name, val))


Test Set Metrics:
	loss: 0.2621
	acc: 0.9837
