### Exploring GCN's for node prediction in a music playlist.  Predicting if an artist is part of playlist.  

Given 10 random artists from the 'rock classics' playlist and individual graphs of size 400, can node classification methods outperform PageRank with respect to predicting/labeling artists in the graph that are also in the original playlist.  Given the same parameters, ranking artists by PageRank will correctly label anywhere from ~35% to ~40% of artists in this playlist (see 'Exploring Weighted Edges'.)


In [43]:
import music_graph as mg    # contains all necessary functions, data and dependencies

from music_graph import artist_dictionary
from music_graph import tags_dictionary
from music_graph import filtered_tags_dictionary
from music_graph import ratings
from music_graph import build_net
from music_graph import layer_graphs
from music_graph import new_centrality
from music_graph import spotify_rankings
from music_graph import add_tag_edges

from datetime import datetime
import networkx as nx
import configparser
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn import model_selection, preprocessing

import tensorflow as tf
from tensorflow.keras import layers, optimizers, losses, metrics, Model

from stellargraph import StellarGraph as sg
from tensorflow import keras
from stellargraph.layer import RGCN, GCN
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model
from stellargraph.mapper import RelationalFullBatchNodeGenerator,FullBatchNodeGenerator
# Spotify client id & secret for API access

config = configparser.ConfigParser()
config.read('spotify.ini')

client_id = config['spotify']['client_id']
client_secret = config['spotify']['client_secret']

In [2]:
playlists = {
'4C9mWYjVobPsfFXesGxYNf':'all classic hits', ### $$$
#'5TF1ki4BzMFfotw57juFTY':'coffee shop mix',
'3pM6OEFeTo4L1yd4eRltjL':'2004 hits', ### $$$
'73gFKr9Moh1T3jXM8TVSdC':'80s summer mix',
'17xTDziQjqAZANvg6C6FlX':'songs from your old ipod',
'6j3URIMhpf62LXpDcbi98b':'rap playlists',
#'1DcX9eK1MaPlwwBIsEfnBX':'country playlist to end all playlists',
#'5VhEpH7stoZiX4v8gIb77P':'playlists',
'37i9dQZF1DWXRqgorJj26U':'rock classics', ### $$$
'37i9dQZF1DWWJOmJ7nRx0C':'rock hard',  ### $$$
'37i9dQZF1DWWRktbhJiuqL':'pure rock n roll', ### $$$
'37i9dQZF1DX5W4wuxak2hE':'power pop', ###
'7Jfcy1H82lsTIzhpL4MZXu':'punk playlist', ### $$$
'07jaITMGGL6QYmT8yZVxQt':'metal playlist'
}  

"""
Request tracks from a playlist in Spotify
Then pull artist out of the track container.
"""
url = 'https://open.spotify.com/playlist'
sp = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials(client_id, client_secret))

playlistid = '37i9dQZF1DWXRqgorJj26U'
playlist_name = playlists[playlistid]

# playlists with more than 100 artists will require two requests
playlist1 = sp.playlist_tracks(playlistid, offset = 0, limit = 100) # get the first 100 tracks
playlist2 = sp.playlist_tracks(playlistid, offset = 100, limit = 100) # get the next 100 tracks

artists = []

for index, item in enumerate(playlist1['items']):
    artists.append(item['track']['album']['artists'][0]['name'])
    
for index, item in enumerate(playlist2['items']):
    artists.append(item['track']['album']['artists'][0]['name'])
    
artists = [i.lower() for i in artists]

num_artists = len(set(artists))      # number of unique artists in the playlist

print(len(set(artists)), ' unique artists')
print(len((artists)), ' total artists') 

81  unique artists
150  total artists


In [160]:
"""
rand_artist_count = 0
rand_artists = []
while rand_artist_count < 10:                  # number of random artists
    rand = np.random.randint(0,num_artists,1)
    rand = rand[0]
    if artists[rand] not in rand_artists: #and artists[rand] not in ['mumford & sons','bløf','lil jon & the east side boyz']:
        rand_artists.append(artists[rand].lower())
        rand_artist_count +=1
"""

print("multigraph iteration: ")
#print(rand_artists)
print(datetime.now())
multigraph = layer_graphs(list(set(artists)), add_tags = False, size = 50) # node count of each graph, no user-tags

print(list(multigraph.edges.data())[1])  # print an edge w/ data to make sure edge weight is correct.


multigraph iteration: 
2020-06-25 13:51:28.522134
janis joplin  not in artist_dictionary
joe walsh  not in artist_dictionary
joan jett & the blackhearts  not in artist_dictionary
guns n' roses  not in artist_dictionary
('t. rex', 'the pretty things', {'kind': 'Artist', 'link': 'jack green', 'weight': 0.013434608500633395})


In [161]:
print(len(multigraph.nodes()))
print(len(multigraph.edges.data()))

2534
46965


In [162]:
# Stellargraph's implementation of GCN wants to see attributes for each node in the graph.
# several graph metrics will be calculated and used as node features.

scores = new_centrality(multigraph)

In [163]:
scores_ = scores  # maintain oroginal scores DF

scores_.head()

Unnamed: 0,artist,deg_cent,load_cent,page_rank,ev_cent,close_cent,harm_cent,mean_cent
0,t. rex,0.010265,0.006138,0.001481,4e-06,0.211225,583.209127,97.239707
1,the pretty things,0.029214,0.007836,0.001364,1.4e-05,0.233885,643.227778,107.250015
2,mickey finn’s t-rex,0.003158,0.000789,0.000184,2e-06,0.193245,522.353571,87.091825
3,sam gopal,0.017371,0.004001,0.000335,5.6e-05,0.22097,607.721825,101.327426
4,the mickey finn,0.006317,0.001508,0.000151,0.000217,0.224222,615.369841,102.600376


In [164]:
scores_ = scores_.set_index('artist')

In [165]:
# convert to a dictionary of dictionaries to be assigned as node attributes by networkx.
#node_scores = scores_.to_dict('index')
#node_scores['the sonics']

# convert DF to a dictionary of key:list(values)

node_scores = {}
for i in scores_.index:
    node_scores[i] = list(scores_.loc[i])
    
print(node_scores['journey'])
scores[scores['artist'] == 'journey']

[0.13501776549545993, 0.06892304266789966, 0.010237036874181453, 0.04268440852298308, 0.30598930177315814, 879.3940476190593, 146.65948319573218]


Unnamed: 0,artist,deg_cent,load_cent,page_rank,ev_cent,close_cent,harm_cent,mean_cent
1219,journey,0.135018,0.068923,0.010237,0.042684,0.305989,879.394048,146.659483


In [166]:
multigraph_ = multigraph.copy()
multigraph_
print(len(multigraph_.nodes()))
print(len(multigraph_.edges.data()))

2534
46965


In [167]:
# assign scores tyo each 
nx.set_node_attributes(multigraph_, node_scores, 'features')


# check that they match
print(multigraph_.nodes['the firm'])

print(scores_.loc['the firm'])

{'features': [0.06316620607974734, 0.0342643412089125, 0.0008403525283481212, 0.012948082140122183, 0.2827279574237174, 811.2309523809457, 135.27081655338776]}
deg_cent        0.063166
load_cent       0.034264
page_rank       0.000840
ev_cent         0.012948
close_cent      0.282728
harm_cent     811.230952
mean_cent     135.270817
Name: the firm, dtype: float64


In [None]:
multigraph_.nodes.data()

In [169]:
"""stellargraph from networkx graph
The networkx graph is a heterogeneous (has both artist and user-tag type edges) and undirected multigraph.
Node attributes consist of various graph metrics, although the user-tag edges added to the graph via the layer_graph 
function probably could/should be node attributes instead of edges."""

graph = sg.from_networkx(multigraph_,
                         node_features = 'features',
                         edge_type_attr = 'kind')

print(graph.info())


StellarGraph: Undirected multigraph
 Nodes: 2534, Edges: 46965

 Node types:
  default: [2534]
    Features: float32 vector, length 7
    Edge types: default-Artist->default

 Edge types:
    default-Artist->default: [46965]
        Weights: range=[6.3786e-07, 0.794869], mean=0.0184919, std=0.0520207
        Features: none


In [170]:
# identity matrix for multigraph

id_mtx = np.identity(n = len(multigraph_.nodes()))

id_mtx

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [171]:
# labels for node classification.  will have just 10 labeled nodes and |V | - 10 unlabeled nodes.
labels = pd.DataFrame()
labels['artist'] = list(multigraph.nodes())
labels['in_playlist'] = np.int

for i in np.arange(0,len(labels)):
    if labels['artist'][i] in  artists:
        labels['in_playlist'][i] = 1
    else:
        labels['in_playlist'][i] = 0

In [172]:
# check that randomly selected artists are correctly labeled
print(labels[labels['in_playlist'] == 1])

labels = labels.set_index('artist') 
labels.head(n = 20)

                  artist in_playlist
0                 t. rex           1
22             the kinks           1
24    the rolling stones           1
50           the hollies           1
69         black sabbath           1
...                  ...         ...
2315          don henley           1
2366   bruce springsteen           1
2416           radiohead           1
2470       grateful dead           1
2514          don mclean           1

[75 rows x 2 columns]


Unnamed: 0_level_0,in_playlist
artist,Unnamed: 1_level_1
t. rex,1
the pretty things,0
mickey finn’s t-rex,0
sam gopal,0
the mickey finn,0
pink fairies,0
john's children,0
toby tyler,0
legend,0
the dreamers,0


In [173]:
pos_class = labels[labels['in_playlist'] == 1]
pos_class

Unnamed: 0_level_0,in_playlist
artist,Unnamed: 1_level_1
t. rex,1
the kinks,1
the rolling stones,1
the hollies,1
black sabbath,1
...,...
don henley,1
bruce springsteen,1
radiohead,1
grateful dead,1


In [187]:
train_subjects, test_subjects = model_selection.train_test_split(
    labels, train_size=1000, test_size=None
)
val_subjects, test_subjects = model_selection.train_test_split(
    test_subjects, train_size=500, test_size=None
)
print(type(train_subjects))
print(type(test_subjects))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [188]:
train_subjects.describe()


Unnamed: 0,in_playlist
count,1000
unique,2
top,0
freq,968


In [189]:
tf.compat.v1.disable_eager_execution() # needed to fix conversion error, but why?

#target_encoding = preprocessing.LabelBinarizer()

#train_targets = target_encoding.fit_transform(train_subjects)
train_targets = train_subjects
#val_targets = target_encoding.transform(val_subjects)
val_targets = val_subjects
#test_targets = target_encoding.transform(test_subjects)
test_targets = test_subjects

generator = FullBatchNodeGenerator(graph, method="gcn")

train_gen = generator.flow(train_subjects.index, train_targets)

print(type(train_subjects))
print(type(test_subjects))

Using GCN (local pooling) filters...
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [190]:
gcn = GCN(
    layer_sizes=[16, 16], activations=["relu", "relu"], generator=generator, dropout=0.5
)

In [217]:
x_inp, x_out = gcn.in_out_tensors()

x_out
x_inp

[<tf.Tensor 'input_33:0' shape=(1, 2534, 7) dtype=float32>,
 <tf.Tensor 'input_34:0' shape=(1, None) dtype=int32>,
 <tf.Tensor 'input_35:0' shape=(1, None, 2) dtype=int64>,
 <tf.Tensor 'input_36:0' shape=(1, None) dtype=float32>]

In [213]:
#predictions = layers.Dense(units=2, activation="softmax")(x_out)
predictions = Dense(2, activation="softmax")(x_out)

model = Model(inputs=x_inp, outputs=predictions)
model.compile(
    optimizer=optimizers.Adam(lr=0.01),
    loss=losses.categorical_crossentropy,
    metrics=["acc"],
)


print(type(x_inp))
print(type(x_out))

<class 'list'>
<class 'tensorflow.python.framework.ops.Tensor'>


In [214]:
val_gen = generator.flow(val_subjects.index, val_targets)

In [215]:
from tensorflow.keras.callbacks import EarlyStopping

es_callback = EarlyStopping(monitor="val_acc", patience=50, restore_best_weights=True)

In [216]:
history = model.fit(
    train_gen,
    epochs=200,
    validation_data=val_gen,
    verbose=2,
    shuffle=False,  # this should be False, since shuffling data means shuffling the whole graph
    callbacks=[es_callback],
)

Epoch 1/200


ValueError: You are passing a target array of shape (1, 1000, 1) while using as loss `categorical_crossentropy`. `categorical_crossentropy` expects targets to be binary matrices (1s and 0s) of shape (samples, classes). If your targets are integer classes, you can convert them to the expected format via:
```
from keras.utils import to_categorical
y_binary = to_categorical(y_int)
```

Alternatively, you can use the loss function `sparse_categorical_crossentropy` instead, which does expect integer targets.

In [197]:
test_gen = generator.flow(test_subjects.index, test_targets)

In [198]:
test_metrics = model.evaluate(test_gen)
print("\nTest Set Metrics:")
for name, val in zip(model.metrics_names, test_metrics):
    print("\t{}: {:0.4f}".format(name, val))


Test Set Metrics:
	loss: 0.6929
	acc: 0.9720


In [199]:
labels.head()

Unnamed: 0_level_0,in_playlist
artist,Unnamed: 1_level_1
t. rex,1
the pretty things,0
mickey finn’s t-rex,0
sam gopal,0
the mickey finn,0


In [200]:
all_nodes = labels.index
all_gen = generator.flow(all_nodes)
all_predictions = model.predict(all_gen)

all_predictions

array([[[8.8084212e-31, 0.0000000e+00],
        [1.9031539e-38, 0.0000000e+00],
        [3.4484057e-21, 3.6150875e-34],
        ...,
        [1.6947372e-21, 1.1505955e-34],
        [1.6231037e-16, 1.5075910e-26],
        [1.5156935e-14, 2.4395998e-23]]], dtype=float32)

In [201]:
node_predictions = all_predictions.squeeze()

for i in node_predictions:
    if i[0] < 0.90:
        print(i[0])

8.808421e-31
1.9031539e-38
3.4484057e-21
4.653155e-35
3.4165658e-26
5.5587695e-17
4.5199234e-23
1.928114e-14
3.4652676e-12
7.655118e-36
1.2204423e-18
1.6787076e-38
2.7608274e-21
1.2204657e-18
6.6738568e-21
4.927672e-17
2.2031654e-28
9.1975666e-26
8.3949656e-15
1.7691559e-24
3.5529134e-13
2.2105253e-11
0.0
3.5851336e-20
0.0
0.0
3.4593307e-27
4.8096267e-19
7.1917974e-25
2.6862876e-13
5.2192307e-14
2.845253e-27
1.5024447e-29
4.698501e-18
2.279912e-26
5.6998182e-12
5.7004813e-12
2.033748e-31
6.5815243e-29
3.155769e-35
9.608982e-24
1.7212284e-21
1.7212284e-21
1.7212348e-21
6.643456e-20
2.2664976e-18
2.266489e-18
3.952552e-18
3.952552e-18
6.892558e-16
4.2087667e-27
1.7352527e-34
5.390794e-37
1.4005526e-14
1.4004672e-14
1.4569967e-11
1.2207587e-26
3.748475e-32
0.0
3.7485322e-32
7.7035286e-13
9.65565e-16
9.655282e-16
2.506533e-28
3.7867647e-12
5.8505426e-27
0.0
1.2929349e-25
1.01142905e-22
0.0
1.3274227e-19
1.0881265e-15
6.2120795e-12
1.5830528e-21
1.5830468e-21
1.5830468e-21
1.5830528e-21
1.4

5.702725e-22
4.4230117e-10
1.0220903e-24
5.410094e-23
5.4101356e-23
1.4877598e-20
1.7933863e-24
3.879284e-25
1.8514681e-10
4.0932524e-15
9.0323886e-23
1.0994496e-24
6.6958893e-21
6.9993717e-29
1.8467183e-24
1.4710579e-24
1.8467183e-24
9.411479e-26
2.1015074e-26
1.8467325e-24
1.8467325e-24
3.7565197e-17
3.1409967e-20
9.027935e-25
1.7460099e-26
1.570096e-24
3.446212e-27
3.446212e-27
1.570078e-24
3.446133e-27
1.570108e-24
2.4810233e-25
2.4810136e-25
2.4810136e-25
9.892003e-27
2.4810043e-25
9.891814e-27
2.4810136e-25
2.4809853e-25
3.9933265e-11
4.611116e-21
4.6107992e-21
2.5381148e-18
8.4529835e-19
8.453112e-19
8.453048e-19
1.0635168e-14
1.0635168e-14
4.6116785e-21
4.6116434e-21
2.5384055e-18
8.449695e-19
8.4500175e-19
8.450211e-19
8.453564e-19
8.453628e-19
8.453467e-19
1.9814788e-19
2.3137444e-15
5.9586e-23
5.959259e-23
3.1490756e-17
5.700927e-12
1.3759189e-16
3.3165868e-23
5.096119e-22
1.0662428e-21
5.7007203e-12
3.3730183e-13
2.313858e-16
2.313858e-16
3.981264e-15
5.840583e-17
2.3137256

In [196]:
all_nodes = labels.index
all_gen = generator.flow(all_nodes)
all_predictions = model.predict(all_gen)
node_predictions = all_predictions.squeeze()
df = pd.DataFrame({"Predicted": node_predictions, "True": labels['in_playlist']})
df.head(20)

Exception: Data must be 1-dimensional