In [1]:
import pickle

with open('../data_storage/CBOW_run_100k_min_100_withPreproc@2024-04-25-12-41-01_con5_pl100000_emb32_ep1-track2idx.pkl', 'rb') as f:
    data_uris = pickle.load(f)

In [2]:
import torch

# Load the model
model_data = torch.load('../data_storage/CBOW_run_100k_min_100_withPreproc@2024-04-25-12-41-01_con5_pl100000_emb32_ep1.pt', map_location=torch.device('cpu'))

# Access embeddings and layers
embeddings = model_data['embedding.weight']

In [3]:
# Creating a dictionary of song uri's and their corresponding embeddings
embeddings_dict = {}

for uri, idx in iter(data_uris.items()):
    embeddings_dict[uri] = embeddings[idx]

In [4]:
# Average embedding calculated to use as PAD embedding

average_embedding = torch.mean(embeddings, dim=0)
average_embedding.unsqueeze(0)

tensor([[ 0.0057, -0.0095,  0.0026,  0.0261,  0.0150, -0.0255, -0.0067,  0.0178,
         -0.0108,  0.0072,  0.0199,  0.0076,  0.0339,  0.0441,  0.0301,  0.0022,
          0.0079,  0.0048,  0.0351, -0.0245, -0.0259, -0.0175, -0.0053,  0.0220,
          0.0117, -0.0124, -0.0049, -0.0146, -0.0169,  0.0167,  0.0250, -0.0023]])

In [5]:
# creation of dataframe where each row holds a playlist id, name, songs and artists

import sqlite3
import pandas as pd
"""
# Connect to the SQLite database
conn = sqlite3.connect('../data_storage/spotify.db')

# Define the SQL query to retrieve playlist information
sql_query = '''
    SELECT playlist_track.pid, playlist.playlist_name, artist.artist_name, track.track_uri
    FROM playlist_track
    JOIN track ON playlist_track.track_uri = track.track_uri
    JOIN album ON track.album_uri = album.album_uri
    JOIN artist ON album.artist_uri = artist.artist_uri
    JOIN playlist ON playlist_track.pid = playlist.pid
'''

# Execute the SQL query and load results into a DataFrame
df = pd.read_sql_query(sql_query, conn)

# Group by playlist ID and aggregate artists and songs into lists
grouped = df.groupby(['pid', 'playlist_name']).agg({
    'artist_name': lambda x: list(x),
    'track_uri': lambda x: list(x)
}).reset_index()

# Rename columns for clarity
grouped.rename(columns={'artist_name': 'artists', 'track_uri': 'songs'}, inplace=True)

# Close the database connection
conn.close()"""
grouped = pd.read_csv('back_up.csv')

# Save time by saving dataframe to a CSV file.
#grouped.to_csv('back_up.csv', index=False)

In [6]:
from sklearn.model_selection import train_test_split
import random
import ast

#first clean dataset and remove playlists with < 15 songs??????
filtered_grouped = grouped[grouped['songs'].apply(len) >= 15]

train_df, test_df = train_test_split(filtered_grouped, test_size=0.2, random_state=42)

# prepare inputs for model
k = 10
training_embeddings = []     # Holds the embeddings of k songs in the playlist
remaining_songs = []         # Holds the remaining songs in the playlist that are to be predicted
i = 1
for _, row in train_df.iterrows():
    p_songs = row['songs']                            # WHEN READING FROM CSV LISTS BECOME STRINGS - THIS CONVERTS BACK TO LIST
    p_songs = ast.literal_eval(p_songs)
    if len(p_songs) >= k:
        selected_songs = random.sample(p_songs, k)
        p_embeddings = [embeddings_dict.get(song, average_embedding) for song in selected_songs]
        training_embeddings.append(p_embeddings)
        remaining_songs.append([song for song in p_songs if song not in selected_songs])

testing_embeddings = []     # Holds the embeddings of k songs in the playlist
test_remaining_songs = []         # Holds the remaining songs in the playlist that are to be predicted
i = 1
for _, row in test_df.iterrows():
    p_songs = row['songs']
    p_songs = ast.literal_eval(p_songs)              # WHEN READING FROM CSV LISTS BECOME STRINGS - THIS CONVERTS BACK TO LIST
    if len(p_songs) >= k:
        selected_songs = random.sample(p_songs, k)
        p_embeddings = [embeddings_dict.get(song, average_embedding) for song in selected_songs]
        testing_embeddings.append(p_embeddings)
        test_remaining_songs.append([song for song in p_songs if song not in selected_songs])


training_tensors = []
testing_tensors = []

for embedding_list in training_embeddings:
    # Check if embedding_list is empty
    if embedding_list:
        # Convert each embedding to a PyTorch tensor
        tensors = [torch.tensor(embedding) for embedding in embedding_list]
        training_tensors.append(torch.stack(tensors, dim=0))

for embedding_list in testing_embeddings:
    # Check if embedding_list is empty
    if embedding_list:
        # Convert each embedding to a PyTorch tensor
        tensors = [torch.tensor(embedding) for embedding in embedding_list]
        testing_tensors.append(torch.stack(tensors, dim=0))

# Stack PyTorch tensors along a new dimension to create a single tensor
training_tensor = torch.stack(training_tensors, dim=0)
testing_tensor = torch.stack(testing_tensors, dim=0)

  tensors = [torch.tensor(embedding) for embedding in embedding_list]
  tensors = [torch.tensor(embedding) for embedding in embedding_list]


In [7]:
import torch

def playlist_loss(y_true, y_pred, embeddings_dict):
    total_percentage_incorrect = 0.0
    
    top_500_songs = []  

    for y_predictions, y_batch_true in zip(y_pred, y_true):
        top_500_indices = sorted(range(len(y_predictions)), key=lambda i: y_predictions[i], reverse=True)[:500]
        
        top_500_songs_batch = [list(embeddings_dict.keys())[i] for i in top_500_indices]
        
        num_correctly_guessed = sum(song_id in top_500_songs_batch for song_id in y_batch_true)
        
        try:
            percentage_correct = (num_correctly_guessed / len(y_batch_true)) * 100.0
            total_percentage_incorrect += 100.0 - percentage_correct
        except ZeroDivisionError:
            total_percentage_incorrect += 100
        
        top_500_songs.append(top_500_songs_batch)
    
    average_percentage_incorrect = torch.tensor(total_percentage_incorrect / len(y_true))
    print(average_percentage_incorrect)
    
    top_500_songs_tensor = torch.tensor([song for batch_songs in top_500_songs for song in batch_songs])
    
    return average_percentage_incorrect, top_500_songs_tensor




In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

class MultiLabelClassifier(nn.Module):
    def __init__(self, input_size, num_classes):
        super(MultiLabelClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, 512)
        self.fc2 = nn.Linear(512, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = x.view(x.size(0), -1)  # Reshape to (batch_size, num_songs * song_embedding_length)
        
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        x = self.sigmoid(x)
        return x
input_size = training_tensor.shape[1] * training_tensor.shape[2]  
num_classes = len(embeddings_dict)
learning_rate = 0.001
batch_size = 64
epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultiLabelClassifier(input_size, num_classes).to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

training_tensor = training_tensor.to(device)
testing_tensor = testing_tensor.to(device)

for epoch in range(epochs):
    model.train()
    for i in range(0, len(training_tensor), batch_size):
        optimizer.zero_grad()
        batch_input = training_tensor[i:i+batch_size]
        batch_target = remaining_songs[i:i+batch_size]
        output = model(batch_input)
        loss = playlist_loss(batch_target, output, embeddings_dict)
        loss.backward()  # Compute gradients
        optimizer.step()  # Update weights

    # Validation
    model.eval()
    val_loss_total = 0.0  

    with torch.no_grad():
        for i in range(0, len(testing_tensor), batch_size):
            batch_input = testing_tensor[i:i+batch_size]
            batch_target = test_remaining_songs[i:i+batch_size]
            val_output = model(batch_input)
            val_loss = playlist_loss(batch_target, val_output, embeddings_dict)
            val_loss_total += val_loss.item()
    average_val_loss = val_loss_total / (len(testing_tensor) / batch_size)

    print(f"Epoch {epoch+1}, Validation Loss: {average_val_loss:.4f}" )

