Relevant imports and initialization code.

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from keras.utils import to_categorical

seed = 25
np.random.seed(seed)

This section reads in our data from our Spotify track dataset. This dataset has a large amount of data that's captured per track.

In [2]:
df = pd.read_csv('./datasets/spotify_songs.csv')
# we only want a certain number of columns. Reducing unneeded features will improve performance.
df = df[['genre','artist_name','track_id','popularity','acousticness','danceability','duration_ms','energy','instrumentalness','key','liveness','loudness','mode','speechiness','tempo','time_signature','valence']]
df.drop_duplicates('track_id', inplace=True)
df = df.sample(frac=0.02, random_state=seed)

print(len(df['genre'].unique()))
print(len(df['key'].unique()))
print(len(df['mode'].unique()))
print(len(df['time_signature'].unique()))

df.describe()

27
12
2
4


Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
count,3535.0,3535.0,3535.0,3535.0,3535.0,3535.0,3535.0,3535.0,3535.0,3535.0,3535.0
mean,35.586421,0.408049,0.537186,234653.5,0.557125,0.178248,0.224884,-10.167798,0.131284,116.940285,0.453908
std,17.704144,0.369598,0.193352,134667.0,0.279653,0.326769,0.211943,6.39707,0.209738,31.29785,0.268361
min,0.0,3e-06,0.057,34053.0,0.00125,0.0,0.0119,-47.046,0.0228,37.934,0.0225
25%,24.0,0.0458,0.403,174598.0,0.33,0.0,0.0973,-12.9835,0.0369,91.846,0.218
50%,36.0,0.278,0.556,216933.0,0.597,6.7e-05,0.131,-8.133,0.05,115.056,0.447
75%,48.0,0.804,0.683,266333.0,0.793,0.122,0.272,-5.655,0.1065,138.0305,0.673
max,91.0,0.996,0.954,3059427.0,0.998,0.985,0.996,-0.366,0.964,220.119,0.989


Let's read in our other dataset that contains user created playlist with songs with them. This will be crucial to generating a desired y label that we will want our neural network to train on.

In [3]:
playlist_df = pd.read_json('./datasets/challenge_set.json')
playlist_series = playlist_df['playlists']

Note that the code before does not output any information about the genre or track id. This is because in the dataset they are string values.

This code section rearranges the dataset to be more compatible with machine learning. One aspect of this is converting unique string values into a numeric equivalent. The Tensorflow normalization layer will handle proper normalization after that.

In [4]:
def encode_values(col):
    unique_items = col.unique().tolist()
    items_to_encoded = {x: i for i, x in enumerate(unique_items)}
    encoded_to_items = {i: x for i, x in enumerate(unique_items)}
    return (items_to_encoded, encoded_to_items)

genre_items_to_encoded, genre_encoded_to_items = encode_values(df['genre'])
df['genre'] = df['genre'].map(genre_items_to_encoded)

artist_name_items_to_encoded, artist_name_encoded_to_items = encode_values(df['artist_name'])
df['artist_name'] = df['artist_name'].map(artist_name_items_to_encoded)

track_id_items_to_encoded, track_id_encoded_to_items = encode_values(df['track_id'])
df['track_id'] = df['track_id'].map(track_id_items_to_encoded)

key_items_to_encoded, key_encoded_to_items = encode_values(df['key'])
df['key'] = df['key'].map(key_items_to_encoded)

mode_items_to_encoded, mode_encoded_to_items = encode_values(df['mode'])
df['mode'] = df['mode'].map(mode_items_to_encoded)

time_signature_items_to_encoded, time_signature_encoded_to_items = encode_values(df['time_signature'])
df['time_signature'] = df['time_signature'].map(time_signature_items_to_encoded)

df.describe()

Unnamed: 0,genre,artist_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
count,3535.0,3535.0,3535.0,3535.0,3535.0,3535.0,3535.0,3535.0,3535.0,3535.0,3535.0,3535.0,3535.0,3535.0,3535.0,3535.0,3535.0
mean,10.971429,912.655446,1767.0,35.586421,0.408049,0.537186,234653.5,0.557125,0.178248,5.695615,0.224884,-10.167798,0.342857,0.131284,116.940285,0.950778,0.453908
std,7.098429,651.637088,1020.610928,17.704144,0.369598,0.193352,134667.0,0.279653,0.326769,3.736454,0.211943,6.39707,0.474731,0.209738,31.29785,0.496574,0.268361
min,0.0,0.0,0.0,0.0,3e-06,0.057,34053.0,0.00125,0.0,0.0,0.0119,-47.046,0.0,0.0228,37.934,0.0,0.0225
25%,5.0,332.0,883.5,24.0,0.0458,0.403,174598.0,0.33,0.0,2.0,0.0973,-12.9835,0.0,0.0369,91.846,1.0,0.218
50%,10.0,811.0,1767.0,36.0,0.278,0.556,216933.0,0.597,6.7e-05,6.0,0.131,-8.133,0.0,0.05,115.056,1.0,0.447
75%,16.0,1448.5,2650.5,48.0,0.804,0.683,266333.0,0.793,0.122,9.0,0.272,-5.655,1.0,0.1065,138.0305,1.0,0.673
max,26.0,2233.0,3534.0,91.0,0.996,0.954,3059427.0,0.998,0.985,11.0,0.996,-0.366,1.0,0.964,220.119,3.0,0.989


Here we build a pseudo histogram where we capture the frequency in which a particular song is in a playlist with another song. In training, this will generate the y label based on how frequent a song is found with another one.

In [5]:
train_indices = int(0.9 * df.shape[0])
train_df = df.iloc[:train_indices]
train_track_id_items_to_encoded = {key: value for key, value in list(track_id_items_to_encoded.items())[:train_indices]}

encoded_track_histogram = {int(row['track_id']): set() for index, row in train_df.iterrows()}
for playlist in playlist_series:
    playlist_tracks = [track['track_uri'].split(':')[2] for track in playlist['tracks']]
    included_tracks = [track for track in playlist_tracks if track in train_track_id_items_to_encoded]
    encoded_tracks = [train_track_id_items_to_encoded[track] for track in included_tracks]
    
    if (len(encoded_tracks) > 1):
        for x in encoded_tracks:
            for y in encoded_tracks:
                encoded_track_histogram[x].add(y)

x_data = df.values.astype(np.float32)
y_data = []

for index, row in train_df.iterrows():
    probability = np.zeros(len(encoded_track_histogram))
    histogram_data = encoded_track_histogram[row['track_id']]
    for histogram_datapoint in histogram_data:
        probability[histogram_datapoint] = probability[histogram_datapoint] + (1 / len(histogram_data))
    y_data.append(probability)

x_train = x_data[:train_indices]
x_test = x_data[train_indices:]
y_train = np.array(y_data)[:train_indices]
y_test = np.array(y_data)[train_indices:]


This section builds the model.

In [6]:
def build_and_compile_model(features, num_classes):
    normal_layer = tf.keras.layers.Normalization(axis=-1)
    normal_layer.adapt(features)

    model = keras.Sequential([
      normal_layer,
      keras.layers.Dense(64, activation='relu'),
      keras.layers.Dense(num_classes, activation='softmax')
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [9]:
from numpy import savetxt
savetxt('data.npy', x_train)
model = build_and_compile_model(x_train, len(x_train))
model.summary()

model.fit(x_train, y_train, epochs=20, batch_size=200, verbose=2) #  validation_data=(x_test, y_test)

print(x_train.shape)
print(x_train.shape)
print(x_test.shape)
print(y_test.shape)
scores = model.evaluate(x_test, y_test)
print(scores)

(3181, 17)
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 normalization_3 (Normaliza  (None, 17)                35        
 tion)                                                           
                                                                 
 dense_4 (Dense)             (None, 64)                1152      
                                                                 
 dense_5 (Dense)             (None, 3181)              206765    
                                                                 
Total params: 207952 (812.32 KB)
Trainable params: 207917 (812.18 KB)
Non-trainable params: 35 (144.00 Byte)
_________________________________________________________________
Epoch 1/20
16/16 - 0s - loss: 0.6333 - accuracy: 3.1437e-04 - 325ms/epoch - 20ms/step
Epoch 2/20
16/16 - 0s - loss: 0.6096 - accuracy: 9.4310e-04 - 66ms/epoch - 4ms/step
Epoch 3/20
16/16 - 0s - loss

ValueError: Data cardinality is ambiguous:
  x sizes: 354
  y sizes: 0
Make sure all arrays contain the same number of samples.