Relevant imports and initialization code.

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import random
from numpy import savetxt

# Seed every random number generator this notebook relies on, not just NumPy:
# Python's `random` is used when picking a row to predict, and TensorFlow
# draws the initial layer weights and shuffles batches during training.
seed = 25
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

This section reads in our data from our Spotify track dataset. This dataset has a large amount of data that's captured per track.

Let's also read in our other dataset, which contains user-created playlists and the songs in them. This will be crucial for generating the desired y label that we want our neural network to train on.

You'll need to download the Spotify tracks dataset and the Spotify Million Playlist Dataset Challenge data from https://www.kaggle.com/datasets/zaheenhamidani/ultimate-spotify-tracks-db/ and https://www.aicrowd.com/challenges/spotify-million-playlist-dataset-challenge respectively, and put them into the `datasets` folder (as `spotify_songs.csv` and `challenge_set.json`) in order to run this code.

They are not in the git repo since they are too large.

In [2]:
sample_size = 4000

# Read the track table and keep a single row per track_id.
original_df = pd.read_csv('./datasets/spotify_songs.csv').drop_duplicates('track_id')

# We only want a certain number of columns. Reducing unneeded features will
# improve performance.
original_df = original_df[[
    'genre', 'artist_name', 'track_name', 'track_id', 'popularity',
    'acousticness', 'danceability', 'duration_ms', 'energy',
    'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
    'speechiness', 'tempo', 'time_signature', 'valence',
]]

# The playlist file is read as JSON; only its 'playlists' column is used.
playlist_df = pd.read_json('./datasets/challenge_set.json')
playlist_series = playlist_df['playlists']

Note that the code before does not output any information about the genre or track id. This is because in the dataset they are string values.

This code section rearranges the dataset to be more compatible with machine learning. One aspect of this is converting unique string values into a numeric equivalent. The Tensorflow normalization layer will handle proper normalization after that.

In [4]:
def encode_values(col):
    """Build integer lookup tables for the distinct values of a column.

    Codes are assigned in order of first appearance, starting at 0.

    Parameters
    ----------
    col : object exposing ``unique().tolist()`` (e.g. a pandas Series)
        The column whose distinct values should be numbered.

    Returns
    -------
    tuple of (dict, dict)
        ``(items_to_encoded, encoded_to_items)``: the first maps each distinct
        value to its integer code, the second maps each code back to its value.
    """
    distinct_values = col.unique().tolist()
    encoded_to_items = dict(enumerate(distinct_values))
    items_to_encoded = {value: code for code, value in encoded_to_items.items()}
    return (items_to_encoded, encoded_to_items)

def _encode_column(frame, column):
    """Overwrite frame[column] with integer codes and return both lookup tables."""
    to_code, to_item = encode_values(frame[column])
    frame[column] = frame[column].map(to_code)
    return to_code, to_item


# NOTE: each call rewrites a column of original_df with its integer codes, so
# re-running this cell without first re-running the loading cell would rebuild
# the lookup tables from the already-encoded integers.
genre_items_to_encoded, genre_encoded_to_items = _encode_column(original_df, 'genre')
artist_name_items_to_encoded, artist_name_encoded_to_items = _encode_column(original_df, 'artist_name')
track_name_items_to_encoded, track_name_encoded_to_items = _encode_column(original_df, 'track_name')
track_id_items_to_encoded, track_id_encoded_to_items = _encode_column(original_df, 'track_id')
key_items_to_encoded, key_encoded_to_items = _encode_column(original_df, 'key')
mode_items_to_encoded, mode_encoded_to_items = _encode_column(original_df, 'mode')
time_signature_items_to_encoded, time_signature_encoded_to_items = _encode_column(original_df, 'time_signature')

# Work on the second block of `sample_size` rows (selected by position) of a copy.
df = original_df.copy().iloc[sample_size:2 * sample_size]
df.describe()

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
count,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0
mean,3.44975,683.86075,5574.52625,5999.5,46.17125,0.221302,0.558485,225759.4,0.671749,0.033769,5.42125,0.197724,-6.972861,0.249,0.072259,123.099086,0.1465,0.487848
std,0.497531,441.645122,1342.866369,1154.844867,7.734816,0.264964,0.134234,60624.48,0.211033,0.131651,3.509113,0.161859,3.070843,0.432488,0.079433,30.358727,0.524029,0.225759
min,3.0,40.0,7.0,4000.0,0.0,3e-06,0.127,26717.0,0.0188,0.0,0.0,0.0214,-25.669,0.0,0.0224,47.811,0.0,0.0342
25%,3.0,323.75,4680.75,4999.75,39.0,0.010675,0.472,191458.2,0.515,0.0,2.0,0.0993,-8.506,0.0,0.0318,97.9905,0.0,0.315
50%,3.0,534.0,5653.5,5999.5,48.0,0.0934,0.5585,217006.5,0.7025,1e-05,5.0,0.132,-6.347,0.0,0.0413,120.1305,0.0,0.4765
75%,4.0,1018.0,6639.25,6999.25,52.0,0.376,0.649,249916.8,0.848,0.000937,8.0,0.254,-4.74425,0.0,0.0724,144.051,0.0,0.66025
max,4.0,1696.0,7634.0,7999.0,72.0,0.974,0.936,1355938.0,0.998,0.949,11.0,0.994,-0.259,1.0,0.918,213.788,3.0,0.975


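Here we build the training labels. The original plan was a pseudo-histogram capturing how frequently a particular song appears in a playlist together with another song, so that the y label reflects how often two songs are found together. That playlist-based histogram is currently commented out in the code below; instead, each track's label is a probability vector over the training tracks, weighted toward its five nearest neighbours in normalized feature space.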

In [5]:
train_indices = int(0.9 * df.shape[0])
train_df = df.iloc[:train_indices]
count = 0  # only incremented by the disabled playlist histogram below

# Disabled: playlist co-occurrence histogram.
# encoded_track_histogram = {int(row['track_id']): set() for index, row in df.iterrows()}
# for playlist in playlist_series:
#     playlist_tracks = [track['track_uri'].split(':')[2] for track in playlist['tracks']]
#     included_tracks = [track for track in playlist_tracks if track in track_id_items_to_encoded]
#     encoded_tracks = [track_id_items_to_encoded[track] for track in included_tracks]
#
#     if (len(encoded_tracks) > 1):
#         for x in encoded_tracks:
#             for y in encoded_tracks:
#                 encoded_track_histogram[x].add(y)
#                 encoded_track_histogram[y].add(x)
#                 count = count + 2

x_data = df.values.astype(np.float32)

# Z-score every column so no single feature dominates the Euclidean distance.
normalized_df = (df - df.mean()) / df.std()
normalized_train_df = normalized_df.iloc[:train_indices]

print(len(df))
print(df.to_numpy().size)
print(normalized_df.to_numpy().size)
#print(len(encoded_track_histogram))

# Probability mass given to the 1st..5th nearest training track of each row.
neighbor_weights = (0.5, 0.3, 0.1, 0.08, 0.02)


def build_neighbor_targets(points, num_reference, weights):
    """Build one target row per point from its nearest reference points.

    The reference set is the first ``num_reference`` rows of ``points``. For
    every row, the Euclidean distance to each reference row is computed and
    the closest ones receive ``weights[0]``, ``weights[1]``, ... in order of
    increasing distance (ties are broken by reference position).

    A row that is itself part of the reference set is excluded from its own
    neighbour list. Rows outside the reference set keep their true nearest
    neighbour; skipping the first sorted entry unconditionally would throw
    that neighbour away for those rows.

    Parameters
    ----------
    points : array-like of shape (n_points, n_features)
    num_reference : int
        Number of leading rows of ``points`` that form the reference set.
    weights : sequence of float
        Weight for each neighbour rank, nearest first.

    Returns
    -------
    numpy.ndarray of shape (n_points, num_reference)
    """
    points = np.asarray(points, dtype=np.float64)
    weights = np.asarray(weights, dtype=np.float64)
    reference = points[:num_reference]

    targets = np.zeros((len(points), len(reference)))
    for position, point in enumerate(points):
        distances = np.sqrt(((reference - point) ** 2).sum(axis=1))
        if position < len(reference):
            # Never recommend a track to itself.
            distances[position] = np.inf
        # A stable sort keeps the lowest reference position first on ties.
        nearest = np.argsort(distances, kind='stable')[:len(weights)]
        # Drop the masked self entry / undefined distances if they slipped in
        # because there were fewer usable neighbours than weights.
        nearest = nearest[np.isfinite(distances[nearest])]
        targets[position, nearest] = weights[:len(nearest)]
    return targets


y_data = build_neighbor_targets(normalized_df.to_numpy(), train_indices, neighbor_weights)

# Every target row should be a probability distribution.
test = y_data.sum(axis=1)
assert np.allclose(test, sum(neighbor_weights)), 'each target row should sum to the total neighbour weight'
savetxt('test.txt', test)
print(count)

# The first `train_indices` rows are the training split, the rest the test split.
x_train = x_data[:train_indices]
x_test = x_data[train_indices:]
y_train = y_data[:train_indices]
y_test = y_data[train_indices:]

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)


4000
72000
72000
0
(3600, 18)
(400, 18)
(3600, 3600)
(400, 3600)


This section builds the model.

In [9]:
def build_and_compile_model(features, num_classes):
    """Create and compile the classifier.

    The network is a Normalization layer adapted to ``features``, followed by
    two 2000-unit ReLU Dense layers and a softmax Dense layer with
    ``num_classes`` outputs. It is compiled with the Adam optimizer,
    categorical cross-entropy loss and the accuracy metric.

    Parameters
    ----------
    features : array-like
        Data the Normalization layer is adapted to.
    num_classes : int
        Width of the softmax output layer.

    Returns
    -------
    The compiled ``keras.Sequential`` model.
    """
    input_scaler = keras.layers.Normalization(axis=-1)
    input_scaler.adapt(features)

    network = keras.Sequential()
    network.add(input_scaler)
    for _ in range(2):
        network.add(keras.layers.Dense(2000, activation='relu'))
    network.add(keras.layers.Dense(num_classes, activation='softmax'))

    network.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return network

In [10]:

# One output class per training row: the model predicts which training track
# is the best match for the input track.
model = build_and_compile_model(x_train, len(x_train))
model.summary()

model.fit(x_train, y_train, epochs=50, batch_size=500, verbose=2) #  validation_data=(x_test, y_test)
scores = model.evaluate(x_test, y_test)
# scores[1] is the accuracy metric; report its complement with two decimals.
print('Baseline error: %.2f%%' % (100 - scores[1] * 100))



Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 normalization_1 (Normaliza  (None, 18)                37        
 tion)                                                           
                                                                 
 dense_3 (Dense)             (None, 2000)              38000     
                                                                 
 dense_4 (Dense)             (None, 2000)              4002000   
                                                                 
 dense_5 (Dense)             (None, 3600)              7203600   
                                                                 
Total params: 11243637 (42.89 MB)
Trainable params: 11243600 (42.89 MB)
Non-trainable params: 37 (152.00 Byte)
_________________________________________________________________
Epoch 1/50
8/8 - 1s - loss: 8.0366 - accuracy: 0.0119 - 1s/epoch - 144ms/st

In [120]:
def print_row_details(data):
    """Print the decoded genre, artist, track name, track id, key and mode.

    ``data`` is indexed by column name and holds the encoded values; each one
    is translated back through the matching ``*_encoded_to_items`` lookup and
    printed on its own line, in the order listed above.
    """
    decoders = (
        ('genre', genre_encoded_to_items),
        ('artist_name', artist_name_encoded_to_items),
        ('track_name', track_name_encoded_to_items),
        ('track_id', track_id_encoded_to_items),
        ('key', key_encoded_to_items),
        ('mode', mode_encoded_to_items),
    )
    for column, lookup in decoders:
        print(lookup[data[column]])

index = random.randrange(len(df))
track_id = '1EYl3ASOxlK4Fk4Q1bhDh4'
#data_to_predict = original_df.iloc[index]
# track_id codes were assigned in row order, so the code doubles as the row position.
data_to_predict = original_df.iloc[track_id_items_to_encoded[track_id]]

print('Selected data point')
print_row_details(data_to_predict)

# predict() treats the first axis as the batch, so a single track has to be
# passed as a (1, n_features) array rather than a flat (n_features,) vector.
features_to_predict = data_to_predict.to_numpy().astype(np.float64).reshape(1, -1)
prediction = model.predict(features_to_predict)

# print(prediction.shape)
# with np.printoptions(threshold=np.inf):
#     print(prediction)
# savetxt('data.npy', prediction[0])

for row in prediction:
    # Output class i corresponds to the i-th training row, i.e. df.iloc[i].
    prediction_index = row.argmax()
    print(row[prediction_index])
    print('Prediction:')
    print_row_details(df.iloc[prediction_index])
    print('')


Selected data point
Country
Johnny Cash
The One on the Right Is on the Left
1EYl3ASOxlK4Fk4Q1bhDh4
A
Major
0.18193291
Prediction:
Country
The Doobie Brothers
Another Park, Another Sunday - 2006 Remaster
5dEOntLHunr3jYzS1XBNmk
A
Major

