Loading up the song embeddings.

In [6]:
import pickle

with open('../data_storage/CBOW_run_100k_min_100_withPreproc@2024-04-25-12-41-01_con5_pl100000_emb32_ep1-track2idx.pkl', 'rb') as f:
    data_uris = pickle.load(f)

In [7]:
import torch

# Load the model
model_data = torch.load('../data_storage/CBOW_run_100k_min_100_withPreproc@2024-04-25-12-41-01_con5_pl100000_emb32_ep1.pt', map_location=torch.device('cpu'))

# Access embeddings and layers
embeddings = model_data['embedding.weight']

In [8]:
# Creating a dictionary of song uri's and their corresponding embeddings
embeddings_dict = {}

for uri, idx in iter(data_uris.items()):
    embeddings_dict[uri] = embeddings[idx]


In [9]:
# Average embedding calculated to use as PAD embedding

average_embedding = torch.mean(embeddings, dim=0)
average_embedding.unsqueeze(0)

tensor([[ 0.0057, -0.0095,  0.0026,  0.0261,  0.0150, -0.0255, -0.0067,  0.0178,
         -0.0108,  0.0072,  0.0199,  0.0076,  0.0339,  0.0441,  0.0301,  0.0022,
          0.0079,  0.0048,  0.0351, -0.0245, -0.0259, -0.0175, -0.0053,  0.0220,
          0.0117, -0.0124, -0.0049, -0.0146, -0.0169,  0.0167,  0.0250, -0.0023]])

In [15]:
# creation of dataframe where each row holds a playlist id, name, songs and artists

import sqlite3
import pandas as pd

# Connect to the SQLite database
"""conn = sqlite3.connect('../data_storage/spotify.db')

# Define the SQL query to retrieve playlist information
sql_query = '''
    SELECT playlist_track.pid, playlist.playlist_name, artist.artist_name, track.track_uri
    FROM playlist_track
    JOIN track ON playlist_track.track_uri = track.track_uri
    JOIN album ON track.album_uri = album.album_uri
    JOIN artist ON album.artist_uri = artist.artist_uri
    JOIN playlist ON playlist_track.pid = playlist.pid
'''

# Execute the SQL query and load results into a DataFrame
df = pd.read_sql_query(sql_query, conn)

# Group by playlist ID and aggregate artists and songs into lists
grouped = df.groupby(['pid', 'playlist_name']).agg({
    'artist_name': lambda x: list(x),
    'track_uri': lambda x: list(x)
}).reset_index()

# Rename columns for clarity
grouped.rename(columns={'artist_name': 'artists', 'track_uri': 'songs'}, inplace=True)

# Close the database connection
conn.close()"""
grouped = pd.read_csv('back_up.csv')
read_from_csv = True

In [16]:
# Save time by saving dataframe to a CSV file.
'''grouped.to_csv('back_up.csv', index=False)
read_from_csv = False'''

In [17]:
from sklearn.model_selection import train_test_split
import random
import ast

#first clean dataset and remove playlists with < 15 songs??????
filtered_grouped = grouped[grouped['songs'].apply(len) >= 15]

train_df, test_df = train_test_split(filtered_grouped, test_size=0.2, random_state=42)

# prepare inputs for model
k = 10
training_embeddings = []     # Holds the embeddings of k songs in the playlist
remaining_songs = []         # Holds the remaining songs in the playlist that are to be predicted
i = 1
for _, row in train_df.iterrows():
    p_songs = row['songs']                            # WHEN READING FROM CSV LISTS BECOME STRINGS - THIS CONVERTS BACK TO LIST
    if read_from_csv:
        p_songs = ast.literal_eval(p_songs)
    if len(p_songs) >= k:
        selected_songs = random.sample(p_songs, k)
        p_embeddings = [embeddings_dict.get(song, average_embedding) for song in selected_songs]
        training_embeddings.append(p_embeddings)
        remaining_songs.append([song for song in p_songs if song not in selected_songs])

testing_embeddings = []     # Holds the embeddings of k songs in the playlist
test_remaining_songs = []         # Holds the remaining songs in the playlist that are to be predicted
i = 1
for _, row in test_df.iterrows():
    p_songs = row['songs']                          # WHEN READING FROM CSV LISTS BECOME STRINGS - THIS CONVERTS BACK TO LIST
    if read_from_csv:
        p_songs = ast.literal_eval(p_songs)
    if len(p_songs) >= k:
        selected_songs = random.sample(p_songs, k)
        p_embeddings = [embeddings_dict.get(song, average_embedding) for song in selected_songs]
        testing_embeddings.append(p_embeddings)
        test_remaining_songs.append([song for song in p_songs if song not in selected_songs])


training_tensors = []
testing_tensors = []

for embedding_list in training_embeddings:
    # Check if embedding_list is empty
    if embedding_list:
        # Convert each embedding to a PyTorch tensor
        tensors = [torch.tensor(embedding) for embedding in embedding_list]
        training_tensors.append(torch.stack(tensors, dim=0))

for embedding_list in testing_embeddings:
    # Check if embedding_list is empty
    if embedding_list:
        # Convert each embedding to a PyTorch tensor
        tensors = [torch.tensor(embedding) for embedding in embedding_list]
        testing_tensors.append(torch.stack(tensors, dim=0))

# Stack PyTorch tensors along a new dimension to create a single tensor
training_tensor = torch.stack(training_tensors, dim=0)
testing_tensor = torch.stack(testing_tensors, dim=0)

  tensors = [torch.tensor(embedding) for embedding in embedding_list]
  tensors = [torch.tensor(embedding) for embedding in embedding_list]


In [18]:
#Definiton of simple loss function which assesses what percentage of the remaining songs in the playlist have been predicted by the model.
def playlist_loss(y_true, y_pred):
    total_percentage_incorrect = 0.0
    
    for y_predictions, y_batch_true in zip(y_pred, y_true):
        # Calculate the number of correctly guessed songs
        num_correctly_guessed = sum(song_id in y_predictions for song_id in y_batch_true)
        
        # Calculate the percentage of correctly guessed songs
        try:
            percentage_correct = (num_correctly_guessed / len(y_batch_true)) * 100.0
            total_percentage_incorrect += 100.0 - percentage_correct
        except:
            total_percentage_incorrect += 100
        
    # Calculate the average percentage incorrect across all batches
    average_percentage_incorrect = total_percentage_incorrect / len(y_true)
    print(average_percentage_incorrect)
    return average_percentage_incorrect


In [19]:
import torch
import torch.nn as nn
import torch.optim as optim

# Move model and tensors to CUDA device if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define your model using PyTorch
class CustomModel(nn.Module):
    def __init__(self, input_size):
        super(CustomModel, self).__init__()
        self.input_size = input_size
        self.dense1 = nn.Linear(input_size * 32, 16)
        self.dense2 = nn.Linear(16,32)

    def forward(self, x):
        # Reshape input tensor to have shape (batch_size, input_size * 32)
        x = x.view(-1, self.input_size * 32)
        
        x = torch.relu(self.dense1(x))
        
        x = torch.relu(self.dense2(x))
        return x

# Custom layer to find closest embeddings
class ClosestEmbeddingsLayer(nn.Module):
    def __init__(self, num_closest=500):
        super(ClosestEmbeddingsLayer, self).__init__()
        self.num_closest = num_closest

    def forward(self, generated_embeddings):
        closest_embeddings_batch = []
        for generated_embedding in generated_embeddings:
            # Compute cosine similarity between the generated embedding and all embeddings in embeddings_dict
            similarities = {}
            for key, value in embeddings_dict.items():
                similarity = torch.nn.functional.cosine_similarity(generated_embedding.unsqueeze(0), value.unsqueeze(0))
                similarities[key] = similarity.item()
            
            # Sort the similarities and get the top num_closest embeddings
            closest_embeddings = sorted(similarities, key=similarities.get, reverse=True)[:self.num_closest]
            closest_embeddings_batch.append(closest_embeddings)
        
        return closest_embeddings_batch

input_size = training_tensor.shape[1]  #

model = CustomModel(input_size).to(device)
closest_embeddings_layer = ClosestEmbeddingsLayer().to(device)

optimizer = optim.Adam(model.parameters())

# Training loop
num_epochs = 10
batch_size = 32
for epoch in range(num_epochs):
    model.train()
    for i in range(0, len(training_tensor), batch_size):
        optimizer.zero_grad()
        batch_input = training_tensor[i:i+batch_size].to(device)
        batch_target = remaining_songs[i:i+batch_size]
        output = model(batch_input)
        closest_embeddings = closest_embeddings_layer(output)
        #returns list of 500 closest embeddings
        loss = playlist_loss(batch_target, closest_embeddings)
        loss_tensor = torch.tensor(loss, requires_grad=True, device=device)  # Convert loss to a tensor and move to CUDA device
        loss_tensor.backward()  # Compute gradients
        optimizer.step()  # Update weights

    # Validation
    model.eval()
    val_loss_total = 0.0  # Variable to accumulate the total validation loss

    with torch.no_grad():
        for i in range(0, len(testing_tensor), batch_size):
            batch_input = testing_tensor[i:i+batch_size].to(device)
            batch_target = test_remaining_songs[i:i+batch_size]
            val_output = model(batch_input)
            val_closest_embeddings = closest_embeddings_layer(val_output)
            val_loss = playlist_loss(batch_target, val_closest_embeddings)
            val_loss_total += val_loss.item()

    # Calculate the average validation loss across all batches
    average_val_loss = val_loss_total / (len(testing_tensor) / batch_size)

    print(f"Epoch {epoch+1}, Validation Loss: {average_val_loss:.4f}")





98.91691336970675
99.59190991718191
99.27313675257705
99.66242501890993
99.49548982741388
99.29772279554473
99.54930429798225
99.58778247219328
99.57139517345118
99.37196950448819
99.68163635893302
99.25244587654528
99.55323889074134
99.2186097894653
99.27109486928202
99.59060170581077
99.6265203730407
99.57026510607616
99.87613998310515
98.96104610093887
99.72011744151735
99.72640458848416
99.36260859400232
99.29172555409691
99.76947721048268
99.24738634795335
99.32651368785888
99.63052012123391
99.08288310505375
99.00617950020833
99.41485131875534
99.73160505715475
99.66367380261347
99.34220186728433
99.4571485246989
99.50489378893583
96.5231257272718
99.72044604810968
99.4368614625037
99.66232106541764
99.22544491395571
99.57739310258759
99.5004287205154
99.55903121022385
99.84868336791413
99.84457211986242
99.39567554415176
99.6884963944971
99.29071898269778
99.32969375931856
99.28813139045675
99.3823929611189
99.59741470056427
99.61944395477445
99.48667476176124
98.67218265974383


Instead try having n neurons in final layer where each neuron is a song which has had an embedding generated for it??