In [1]:
import pickle

# Read the pickled mapping that is later iterated as (track URI, embedding row index) pairs.
# NOTE: pickle can execute arbitrary code while loading; only open files you trust.
with open('../data_storage/CBOW_run_1M_min_5_PP@2024-04-26-10-27-43_con5_pl1000000_emb64_ep1-track2idx.pkl', 'rb') as pickle_file:
    data_uris = pickle.load(pickle_file)

In [2]:
import torch

# Deserialize the saved checkpoint, remapping any stored tensors onto the CPU.
model_data = torch.load(
    '../data_storage/CBOW_run_1M_min_5_PP@2024-04-26-10-27-43_con5_pl1000000_emb64_ep1.pt',
    map_location=torch.device('cpu'),
)

# The checkpoint is indexed by parameter name; keep only the embedding table.
embeddings = model_data['embedding.weight']

In [3]:
# Map every song URI to its row of the embedding table.
embeddings_dict = {uri: embeddings[idx] for uri, idx in data_uris.items()}


In [4]:
# Mean over dim 0 of the embedding table, used as the PAD embedding.
average_embedding = embeddings.mean(dim=0)

# Displayed with an extra leading axis for inspection only: unsqueeze is not
# in-place, so average_embedding itself keeps its original shape.
average_embedding.unsqueeze(0)

tensor([[-0.2239, -0.1767, -0.1768,  0.0902,  0.1923,  0.0818, -0.1311, -0.2306,
         -0.1818, -0.2775,  0.1100,  0.2362,  0.1705, -0.1840,  0.2217,  0.1891,
         -0.1653, -0.2227, -0.2156, -0.2622,  0.1852, -0.2316, -0.2309, -0.2677,
         -0.2711,  0.1375,  0.2533, -0.2247, -0.2740, -0.2182, -0.1831, -0.0821,
         -0.2757, -0.0676, -0.1172,  0.1671, -0.1226, -0.2171, -0.0354,  0.2321,
         -0.2044,  0.0036,  0.2123,  0.2110, -0.1658, -0.2394,  0.2912,  0.1897,
         -0.2654, -0.1867, -0.1602,  0.1672, -0.2275, -0.1471, -0.2599,  0.1432,
         -0.2142,  0.2324,  0.2473, -0.0061,  0.1438,  0.1202,  0.2522,  0.0536]])

In [None]:
# creation of dataframe where each row holds a playlist id, name, songs and artists

import sqlite3
import pandas as pd

# Connect to the SQLite database
conn = sqlite3.connect('../data_storage/spotify.db')

# One result row per (playlist, track) pair, carrying the playlist name and the
# artist reached through the track's album.
sql_query = '''
    SELECT playlist_track.pid, playlist.playlist_name, artist.artist_name, track.track_uri
    FROM playlist_track
    JOIN track ON playlist_track.track_uri = track.track_uri
    JOIN album ON track.album_uri = album.album_uri
    JOIN artist ON album.artist_uri = artist.artist_uri
    JOIN playlist ON playlist_track.pid = playlist.pid
'''

try:
    # Execute the SQL query and load results into a DataFrame
    df = pd.read_sql_query(sql_query, conn)
finally:
    # Release the connection even if the query fails, so the database file is
    # never left open by a half-run cell.
    conn.close()

# Collapse to one row per playlist; named aggregation builds the list columns
# under their final names ('artists', 'songs') without an in-place rename.
grouped = (
    df.groupby(['pid', 'playlist_name'])
    .agg(artists=('artist_name', list), songs=('track_uri', list))
    .reset_index()
)

# import ast
# grouped = pd.read_csv('back_up.csv')
# grouped['artists'] = grouped['artists'].apply(ast.literal_eval)
# grouped['songs'] = grouped['songs'].apply(ast.literal_eval)



In [None]:
# Cache the grouped playlists on disk so the SQL query does not have to be re-run.
# The list-valued columns are written as text and must be parsed back
# (e.g. with ast.literal_eval) when the file is reloaded.
grouped.to_csv(path_or_buf='back_up.csv', index=False)
# read_from_csv = False

In [None]:
from sklearn.model_selection import train_test_split
import random
import ast

# Keep only playlists whose 'songs' entry has at least 15 elements.
# NOTE(review): this assumes 'songs' holds lists; if the frame was reloaded from
# CSV without parsing, len() would count characters instead - confirm.
has_enough_songs = grouped['songs'].map(len) >= 15
filtered_grouped = grouped.loc[has_enough_songs]

def createDatasets(filtered_grouped, k, read_from_csv, embeddings_dict, average_embedding):
    """Split playlists into train/test sets and build k-song inputs with held-out targets.

    Args:
        filtered_grouped: DataFrame with a 'songs' column. Each cell is either a
            list of track URIs or the string repr of such a list (as produced by
            a CSV round trip).
        k: Number of songs sampled from each playlist as model input. Must be >= 1.
        read_from_csv: When True, 'songs' cells that are still strings are parsed
            with ast.literal_eval. Cells that are already lists are used as-is.
        embeddings_dict: Mapping from track URI to its embedding.
        average_embedding: Embedding substituted for URIs missing from embeddings_dict.

    Returns:
        (x_train, y_train, x_test, y_test). Each x is the stack of the per-playlist
        stacks of k sampled embeddings (an empty tensor when no playlist
        qualifies). Each y is a list, aligned with x, of the songs of that
        playlist that were not sampled.

    Raises:
        ValueError: If k < 1.

    Note:
        The train/test split is seeded (random_state=42), but random.sample uses
        the global `random` state; call random.seed(...) first for repeatable
        samples.
    """
    if k < 1:
        # k == 0 used to yield targets with no matching input rows.
        raise ValueError(f'k must be at least 1, got {k}')

    # Split data into train and test sets
    train_df, test_df = train_test_split(filtered_grouped, test_size=0.2, random_state=42)

    def process_data(df, read_from_csv, k):
        """Return (input tensor, remaining-song lists) for one subset."""
        seed_tensors = []     # one stacked tensor of k embeddings per kept playlist
        remaining_songs = []  # songs left in each kept playlist, to be predicted

        for p_songs in df['songs']:
            # Only parse cells that are still text: ast.literal_eval raises on a
            # value that is already a list.
            if read_from_csv and isinstance(p_songs, str):
                p_songs = ast.literal_eval(p_songs)
            if len(p_songs) < k:
                continue

            selected_songs = random.sample(p_songs, k)
            selected_lookup = set(selected_songs)  # O(1) membership for the filter below

            # as_tensor reuses existing tensors instead of re-wrapping them
            # (torch.tensor(tensor) emits a copy-construct warning).
            seed_vectors = [
                torch.as_tensor(embeddings_dict.get(song, average_embedding)).detach()
                for song in selected_songs
            ]
            seed_tensors.append(torch.stack(seed_vectors, dim=0))
            remaining_songs.append([song for song in p_songs if song not in selected_lookup])

        # Stack tensors to create a single tensor for each subset
        tensor = torch.stack(seed_tensors, dim=0) if seed_tensors else torch.tensor([])

        return tensor, remaining_songs

    # Process training data
    x_train, y_train = process_data(train_df, read_from_csv, k)

    # Process testing data
    x_test, y_test = process_data(test_df, read_from_csv, k)

    return x_train, y_train, x_test, y_test

In [None]:
# # Definition of a simple loss function that measures what percentage of the remaining songs in the playlist the model predicted.
# def playlist_loss(y_true, y_pred):
#     total_percentage_incorrect = 0.0
    
#     for y_predictions, y_batch_true in zip(y_pred, y_true):
#         # Calculate the number of correctly guessed songs
#         num_correctly_guessed = sum(song_id in y_predictions for song_id in y_batch_true)
        
#         # Calculate the percentage of correctly guessed songs
#         try:
#             percentage_correct = (num_correctly_guessed / len(y_batch_true)) * 100.0
#             total_percentage_incorrect += 100.0 - percentage_correct
#         except:
#             total_percentage_incorrect += 100
        
#     # Calculate the average percentage incorrect across all batches
#     average_percentage_incorrect = total_percentage_incorrect / len(y_true)
#     print(average_percentage_incorrect)
#     return average_percentage_incorrect


In [None]:
import numpy as np

def r_precision(y_true, y_pred):
    """Fraction of the relevant songs found in the top-R predictions.

    R is the number of distinct relevant songs. Only the first R entries of
    ``y_pred`` are inspected, so a hit ranked below position R does not count.

    Args:
        y_true: Iterable of relevant song identifiers (hashable).
        y_pred: Predicted song identifiers, best first. Any iterable is accepted,
            but the result is only meaningful for an ordered one.

    Returns:
        Number of distinct relevant songs within the top R predictions divided
        by R, or 0 when there are no relevant songs.
    """
    y_true_set = set(y_true)
    r = len(y_true_set)
    if r == 0:
        return 0  # Avoid division by zero if there are no relevant items

    # list() lets callers pass any iterable; slicing enforces the top-R cutoff.
    top_r_predictions = list(y_pred)[:r]

    # Set intersection counts each relevant song once, even if predicted twice.
    relevant_count = len(y_true_set.intersection(top_r_predictions))
    return relevant_count / r

def recommended_songs_clicks(y_true, y_pred):
    """Number of 10-track pages that must be shown to reach the first relevant track.

    Args:
        y_true: Iterable of relevant track identifiers.
        y_pred: Predicted track identifiers, best first.

    Returns:
        1 when the first hit is among predictions 1-10, 2 for 11-20, and so on;
        51 when no prediction is relevant.
    """
    relevant = set(y_true)

    # Zero-based position of the first relevant prediction, or None if none exists.
    first_hit = next(
        (position for position, track in enumerate(y_pred) if track in relevant),
        None,
    )

    if first_hit is None:
        # Sentinel used when nothing relevant was recommended.
        return 51

    # Ten tracks per block; blocks are numbered from 1.
    return first_hit // 10 + 1


def dcg(relevances, rank):
    """Discounted cumulative gain of a relevance list given in ranked order.

    The first gain is counted in full; the gain at 1-based position i >= 2 is
    divided by log2(i).

    Args:
        relevances: Sequence of relevance scores, best-ranked first.
        rank: Accepted for call compatibility; the computation always uses the
            whole of ``relevances`` and does not read this value.

    Returns:
        The DCG value, or 0 for an empty input.
    """
    gains = np.asarray(relevances)
    count = gains.size
    if count == 0:
        return 0

    positions = np.arange(2, count + 1)          # 1-based ranks of gains[1:]
    discounted_tail = gains[1:] / np.log2(positions)
    return gains[0] + np.sum(discounted_tail)

def idcg(relevances):
    """Ideal DCG: the DCG of the same relevance scores arranged best-first.

    Args:
        relevances: Sequence of relevance scores in any order.

    Returns:
        The DCG obtained after sorting ``relevances`` in descending order.
    """
    ideal_order = list(relevances)
    ideal_order.sort(reverse=True)
    return dcg(ideal_order, len(ideal_order))

def ndcg(y_true, y_pred):
    """Normalised DCG of a ranked prediction list with binary relevance.

    A prediction scores 1 when it is contained in ``y_true`` and 0 otherwise.
    The ideal ordering is derived from those same scores, i.e. from the hits
    that actually occur in ``y_pred``.

    Args:
        y_true: Container of relevant songs (membership is tested with ``in``).
        y_pred: Predicted songs, best first.

    Returns:
        DCG divided by ideal DCG, or 0 when the ideal DCG is 0 (no hits).
    """
    hit_flags = []
    for song in y_pred:
        hit_flags.append(1 if song in y_true else 0)

    achieved = dcg(hit_flags, len(hit_flags))
    best_possible = idcg(hit_flags)

    return 0 if best_possible == 0 else achieved / best_possible


# def playlist_loss(y_true, y_pred):

#     def flatten_list(lst):
#         return [item for sublist in lst for item in sublist]
    
#     y_true = flatten_list(y_true)
#     y_pred = flatten_list(y_pred)


#     r_loss = 1 - r_precision(y_true, y_pred)
#     ndcg_loss = 1 - ndcg(y_true, y_pred)
#     clicks = recommended_songs_clicks(y_true, y_pred)

#     weight_r = 1
#     weight_ndcg = 1
#     weight_clicks = 1/50 * 1

#     # -1/50 to keep loss 0 to 3, as 1 click (1/50) is the minimum
#     return (r_loss * weight_r) + (ndcg_loss * weight_ndcg) + (clicks * weight_clicks - 1/50)


def playlist_loss(y_true, y_pred):
    """Average combined ranking loss over a batch of playlists.

    For each sample the loss is
        (1 - R-precision) + (1 - NDCG) + (clicks / 50 - 1 / 50),
    which lies between 0 and 3 because one click is the minimum.

    The result is a plain Python number computed from discrete lookups, so it
    carries no gradient information.

    Args:
        y_true: Sequence of per-sample collections of relevant songs.
        y_pred: Sequence of per-sample predictions, best first. The order is
            preserved because all three metrics depend on rank.

    Returns:
        Mean per-sample loss, or 0.0 for an empty batch.
    """
    num_samples = len(y_true)
    if num_samples == 0:
        return 0.0  # nothing to average; avoids division by zero

    # Metric weights are constant across samples.
    weight_r = 1
    weight_ndcg = 1
    weight_clicks = 1 / 50  # Adjusted weight for clicks

    total_loss = 0
    for i in range(num_samples):
        # Relevant songs only need membership tests, so a set is appropriate.
        y_true_set = set(y_true[i])
        # Predictions must stay ordered: converting them to a set would
        # scramble the ranking that every metric below relies on.
        ranked_predictions = list(y_pred[i])

        r_loss = 1 - r_precision(y_true_set, ranked_predictions)
        ndcg_loss = 1 - ndcg(y_true_set, ranked_predictions)
        clicks_value = recommended_songs_clicks(y_true_set, ranked_predictions)

        # Subtract 1/50 to keep loss within the range of 0 to 3, as 1 click (1/50) is the minimum
        clicks_loss = clicks_value * weight_clicks - 1 / 50

        total_loss += (r_loss * weight_r) + (ndcg_loss * weight_ndcg) + clicks_loss

    return total_loss / num_samples


# y_true = np.array(['song1', 'song2', 'song3', 'song4', 'song5', 'song6', 'song7', 'song8'])
# y_pred = np.array(['song19', 'song24', 'song32', 'song62', 'song1', 'song73', 'song99', 'song84', 'song14', 'song13', 'song122'])

# print("R-Precision:", r_precision(y_true, y_pred))
# print("NDCG:", ndcg(y_true, y_pred))
# print("Recommended Songs Clicks:", recommended_songs_clicks(y_true, y_pred))
# print("Playlist Loss:", playlist_loss(y_true, y_pred))


In [None]:
import torch
import torch.optim as optim
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence


## embedding length 64
## lstm taking k track embeddings

# A song is kept only if it occurs more than this many times across playlists.
# NOTE(review): the original comments said "more than 5" while the code compared
# against 50; the code's value is preserved here - confirm the intended threshold.
MIN_PLAYLIST_COUNT = 50

# Count occurrences of each song URI across the filtered playlists.
# A URI repeated inside one playlist is counted once per occurrence.
playlist_counts = {}
for songs_list in filtered_grouped['songs']:
    for song_uri in songs_list:
        playlist_counts[song_uri] = playlist_counts.get(song_uri, 0) + 1

# Report the number of distinct songs, not the whole dictionary.
print(f'Playlist counts size: {len(playlist_counts)}')

# Embeddings of the songs that pass the popularity threshold.
filtered_embeddings_dict = {
    song_uri: embedding
    for song_uri, embedding in embeddings_dict.items()
    if playlist_counts.get(song_uri, 0) > MIN_PLAYLIST_COUNT
}

print(len(filtered_embeddings_dict))

# Work on half of the playlists; seeded so that re-running the notebook selects
# the same subset (42 matches the seed used for the train/test split).
small_df = filtered_grouped.sample(frac=0.5, random_state=42)
# k = 10

# x_train, y_train, x_test, y_test = createDatasets(small_df, 10, True, embeddings_dict, average_embedding)

# def combine_datasets(k_values):
#     x_trains, y_trains, x_tests, y_tests = [], [], [], []
#     for k in k_values:
#         x_train, y_train, x_test, y_test = createDatasets(small_df, k, True, embeddings_dict, average_embedding)  # Assuming createDatasets returns the necessary datasets
#         x_trains.append(x_train)
#         y_trains.append(y_train)
#         x_tests.append(x_test)
#         y_tests.append(y_test)
    
#     # Concatenate datasets along the first dimension
#     x_train_combined = np.concatenate(x_trains, axis=0)
#     y_train_combined = np.concatenate(y_trains, axis=0)
#     x_test_combined = np.concatenate(x_tests, axis=0)
#     y_test_combined = np.concatenate(y_tests, axis=0)
    
#     return x_train_combined, y_train_combined, x_test_combined, y_test_combined

# # Example usage
# k_values = [5, 10, 20]  # Different sizes for 'k'
# x_train, y_train, x_test, y_test = combine_datasets(k_values)



In [None]:
#         x_tests.append(x_test)
#         y_tests.append(y_test)
    
#     # Concatenate datasets along the first dimension
#     x_train_combined = np.concatenate(x_trains, axis=0)
#     y_train_combined = np.concatenate(y_trains, axis=0)
#     x_test_combined = np.concatenate(x_tests, axis=0)
#     y_test_combined = np.concatenate(y_tests, axis=0)
    
#     return x_train_combined, y_train_combined, x_test_combined, y_test_combined

# # Example usage
# k_values = [5, 10, 20]  # Different sizes for 'k'
# x_train, y_train, x_test, y_test = combine_datasets(k_values)



In [None]:
class ClosestEmbeddingsLayer(nn.Module):
    """Look up the keys of the embeddings most cosine-similar to each query vector.

    The lookup table is stacked into one L2-normalised matrix the first time the
    layer runs, so every forward pass is a single matrix product followed by
    ``topk`` rather than a Python loop over the whole vocabulary.

    Args:
        num_closest: Maximum number of keys returned per query.
        embeddings: Optional mapping from key to 1-D embedding. When omitted the
            module-level ``embeddings_dict`` is read on first use.

    Note:
        The matrix is cached; changes made to the mapping after the first
        forward pass are not picked up. The output is a list of keys and is
        therefore not differentiable.
    """

    def __init__(self, num_closest=500, embeddings=None):
        super().__init__()
        self.num_closest = num_closest
        self._embeddings = embeddings
        self._keys = None          # keys, in the row order of _unit_matrix
        self._unit_matrix = None   # (num_keys, dim) rows scaled to unit length

    def _index_for(self, device, dtype):
        """Return (keys, unit-norm matrix), rebuilding when device or dtype changes."""
        stale = (
            self._unit_matrix is None
            or self._unit_matrix.device != device
            or self._unit_matrix.dtype != dtype
        )
        if stale:
            source = embeddings_dict if self._embeddings is None else self._embeddings
            self._keys = list(source.keys())
            if self._keys:
                matrix = torch.stack([torch.as_tensor(value) for value in source.values()])
                matrix = matrix.detach().to(device=device, dtype=dtype)
                # eps mirrors the default clamp of torch.nn.functional.cosine_similarity.
                self._unit_matrix = torch.nn.functional.normalize(matrix, dim=1, eps=1e-8)
            else:
                self._unit_matrix = torch.empty((0, 0), device=device, dtype=dtype)
        return self._keys, self._unit_matrix

    def forward(self, generated_embeddings):
        """Return, for each query row, the keys of its nearest embeddings.

        Args:
            generated_embeddings: 2-D tensor with one query per row, or an
                iterable of 1-D tensors.

        Returns:
            A list with one entry per query; each entry is a list of at most
            ``num_closest`` keys ordered from most to least similar.
        """
        if not torch.is_tensor(generated_embeddings):
            generated_embeddings = list(generated_embeddings)
            if not generated_embeddings:
                return []
            generated_embeddings = torch.stack(generated_embeddings)

        # Retrieval yields keys only, so no autograd graph is needed.
        with torch.no_grad():
            queries = generated_embeddings.detach()
            keys, unit_matrix = self._index_for(queries.device, queries.dtype)
            if not keys:
                return [[] for _ in range(queries.shape[0])]

            unit_queries = torch.nn.functional.normalize(queries, dim=1, eps=1e-8)
            # Dot products of unit vectors are cosine similarities.
            similarities = unit_queries @ unit_matrix.T

            # topk rejects k larger than the vocabulary, so clamp it.
            top_k = min(self.num_closest, len(keys))
            top_indices = similarities.topk(top_k, dim=1).indices.tolist()

        return [[keys[index] for index in row] for row in top_indices]

class DynamicRNN(nn.Module):
    """LSTM over variable-length embedding sequences, followed by nearest-key lookup.

    Args:
        input_size: Size of each input embedding.
        hidden_size: Size of the LSTM hidden state.
        output_size: Size of the vector produced by the linear head.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.closest_embeddings_layer = ClosestEmbeddingsLayer()

    def encode(self, x, lengths):
        """Return the linear projection of the LSTM state at each sequence's last step.

        This is the differentiable part of the model; use it when a gradient is
        required.

        Args:
            x: Padded batch, batch-first (batch_first=True is set on the LSTM).
            lengths: Valid length of every sequence, as a tensor or a plain
                sequence of ints.

        Returns:
            Tensor with one row of size ``output_size`` per sequence.
        """
        # pack_padded_sequence requires lengths as an int64 tensor on the CPU.
        lengths = torch.as_tensor(lengths, dtype=torch.int64).cpu()
        packed_input = nn.utils.rnn.pack_padded_sequence(
            x, lengths, batch_first=True, enforce_sorted=False
        )
        # For this single-layer, unidirectional LSTM the final hidden state is
        # the output at each sequence's last valid step, already restored to
        # the original batch order.
        _, (hidden, _) = self.lstm(packed_input)
        return self.fc(hidden[-1])

    def forward(self, x, lengths):
        """Return, per sequence, the result of the closest-embeddings lookup.

        The returned value comes straight from ``closest_embeddings_layer``.
        """
        return self.closest_embeddings_layer(self.encode(x, lengths))

In [None]:
def train_model_for_k(x_train, y_train, x_test, y_test, epochs, batch_size, k):
    """Train a DynamicRNN on k-song playlist inputs, then evaluate it on the test split.

    playlist_loss is computed from discrete song lookups and carries no
    gradient; wrapping its value in a new tensor and calling backward() never
    reaches the model parameters. Training therefore minimises a differentiable
    surrogate: one minus the cosine similarity between the model's output
    vector and the mean embedding of the held-out songs. The ranking-based
    playlist_loss is still what evaluate_model reports afterwards.

    Args:
        x_train: Training inputs, one stack of song embeddings per playlist.
        y_train: Per-playlist lists of held-out song URIs, aligned with x_train.
        x_test: Test inputs, same layout as x_train.
        y_test: Held-out song URIs for the test inputs.
        epochs: Number of passes over the training data.
        batch_size: Number of playlists per optimisation step.
        k: Number of input songs per playlist (not read here; kept for callers).

    Note:
        Reads the module-level ``embeddings_dict`` and ``average_embedding`` to
        build target vectors. The per-batch and per-epoch "Loss" lines report
        the surrogate loss.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = DynamicRNN(input_size=64, hidden_size=128, output_size=64).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    # as_tensor reuses tensors that are already tensors instead of re-wrapping
    # them (torch.tensor(tensor) emits a copy-construct warning).
    x_train = torch.as_tensor(x_train, dtype=torch.float).to(device)
    x_test = torch.as_tensor(x_test, dtype=torch.float).to(device)

    fallback_target = torch.as_tensor(average_embedding, dtype=torch.float).detach()

    def _target_embeddings(batch_target):
        """Mean embedding of each playlist's held-out songs (fallback: the average embedding)."""
        rows = []
        for songs in batch_target:
            known = [embeddings_dict[song] for song in songs if song in embeddings_dict]
            if known:
                rows.append(torch.stack(known).detach().float().mean(dim=0))
            else:
                rows.append(fallback_target)
        return torch.stack(rows).to(device)

    num_samples = len(x_train)

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for i in range(0, num_samples, batch_size):
            optimizer.zero_grad()
            batch_input = x_train[i:i+batch_size]
            batch_target = y_train[i:i+batch_size]

            lengths = torch.tensor([len(x) for x in batch_input], dtype=torch.int64)

            # Differentiable part of DynamicRNN only: LSTM + linear head, taking
            # the state at each sequence's last step. The nearest-song lookup
            # is skipped because it returns keys, not tensors.
            packed_input = nn.utils.rnn.pack_padded_sequence(
                batch_input, lengths, batch_first=True, enforce_sorted=False
            )
            _, (hidden, _) = model.lstm(packed_input)
            predicted = model.fc(hidden[-1])

            targets = _target_embeddings(batch_target)
            loss = (1 - torch.nn.functional.cosine_similarity(predicted, targets, dim=1)).mean()

            loss.backward()   # Compute gradients
            optimizer.step()  # Update weights

            batch_loss = loss.item()
            print(f'Loss: {batch_loss}')
            # Weight by batch size so a short final batch is averaged correctly.
            total_loss += batch_loss * batch_input.size(0)

        print(f'Epoch {epoch+1}, Loss: {total_loss / max(num_samples, 1)}')

    evaluate_model(model, x_test, y_test, batch_size)

def evaluate_model(model, x_test, y_test, batch_size):
    """Print and return the mean playlist_loss of ``model`` over the test set.

    Args:
        model: Module called as ``model(batch_input, lengths)``; its result is
            passed to playlist_loss as the predictions.
        x_test: Test inputs, indexable along the first axis and sliceable.
        y_test: Held-out songs per playlist, aligned with x_test.
        batch_size: Number of playlists evaluated per forward pass.

    Returns:
        The per-playlist average loss, or NaN when x_test is empty.
    """
    # Run on whatever device the model lives on; fall back to CPU for a model
    # without parameters.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    total_loss = 0.0
    num_samples = len(x_test)
    with torch.no_grad():
        for i in range(0, num_samples, batch_size):
            batch_input = x_test[i:i+batch_size].to(device)
            batch_target = y_test[i:i+batch_size]
            lengths = torch.tensor([len(x) for x in batch_input], dtype=torch.int64)
            val_output = model(batch_input, lengths)
            val_loss = playlist_loss(batch_target, val_output)
            # playlist_loss is a batch mean; re-weight it by the batch size so
            # a short final batch does not distort the overall average.
            total_loss += val_loss * len(batch_target)

    average_loss = total_loss / num_samples if num_samples else float('nan')
    print(f'Test Loss: {average_loss}')
    return average_loss






# with torch.no_grad():
#         for i in range(0, len(testing_tensor), batch_size):
#             batch_input = testing_tensor[i:i+batch_size].to(device)
#             batch_target = test_remaining_songs[i:i+batch_size]
#             val_output = model(batch_input)
#             val_closest_embeddings = closest_embeddings_layer(val_output)
#             val_loss = playlist_loss(batch_target, val_closest_embeddings)
#             val_loss_total += val_loss.item()

# Example usage for each k
# createDatasets below is given the full embeddings_dict, so report its size.
print(f'Size of embeddings dict: {len(embeddings_dict)}')

# ast.literal_eval only accepts text. Ask createDatasets to parse the 'songs'
# column only when it really holds strings (an unparsed CSV reload); frames
# built from the SQL query already contain lists.
songs_are_strings = len(small_df) > 0 and isinstance(small_df['songs'].iloc[0], str)

k_values = [5, 10, 20]
for k in k_values:
    x_train, y_train, x_test, y_test = createDatasets(small_df, k, songs_are_strings, embeddings_dict, average_embedding)
    train_model_for_k(x_train, y_train, x_test, y_test, epochs=10, batch_size=32, k=k)

In [None]:
# class PlaylistLSTMModel(nn.Module):
#     def __init__(self, input_size, hidden_size, num_layers):
#         super(PlaylistLSTMModel, self).__init__()
#         self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
#         self.fc = nn.Linear(hidden_size, input_size)  # Output size is input_size to predict embeddings

#     def forward(self, x, lengths):
#         # Pack the sequence
#         packed_input = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
#         packed_output, (hn, cn) = self.lstm(packed_input)
#         # Decode the hidden state of the last time step
#         decoded = self.fc(hn[-1])
#         return decoded
    
# # Custom layer to find closest embeddings
# class ClosestEmbeddingsLayer(nn.Module):
#     def __init__(self, num_closest=500):
#         super(ClosestEmbeddingsLayer, self).__init__()
#         self.num_closest = num_closest

#     def forward(self, generated_embeddings):
#         closest_embeddings_batch = []
#         for generated_embedding in generated_embeddings:
#             # Compute cosine similarity between the generated embedding and all embeddings in embeddings_dict
#             similarities = {}
#             for key, value in embeddings_dict.items():
#                 similarity = torch.nn.functional.cosine_similarity(generated_embedding.unsqueeze(0), value.unsqueeze(0))
#                 similarities[key] = similarity.item()
            
#             # Sort the similarities and get the top num_closest embeddings
#             closest_embeddings = sorted(similarities, key=similarities.get, reverse=True)[:self.num_closest]
#             closest_embeddings_batch.append(closest_embeddings)
        
#         return closest_embeddings_batch


# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# input_size = train_set.shape[1]  #

# model = PlaylistLSTMModel(input_size).to(device)
# closest_embeddings_layer = ClosestEmbeddingsLayer().to(device)

# optimizer = optim.Adam(model.parameters())

# # Training loop
# num_epochs = 10
# batch_size = 32
# for epoch in range(num_epochs):
#     model.train()
#     for i in range(0, len(train_set), batch_size):
#         optimizer.zero_grad()
#         batch_input = training_tensor[i:i+batch_size].to(device)
#         batch_target = remaining_songs[i:i+batch_size]
#         output = model(batch_input)
#         closest_embeddings = closest_embeddings_layer(output)
#         #returns list of 500 closest embeddings
#         loss = playlist_loss(batch_target, closest_embeddings)
#         loss_tensor = torch.tensor(loss, requires_grad=True, device=device)  # Convert loss to a tensor and move to CUDA device
#         loss_tensor.backward()  # Compute gradients
#         optimizer.step()  # Update weights

#     # Validation
#     model.eval()
#     val_loss_total = 0.0  # Variable to accumulate the total validation loss

#     with torch.no_grad():
#         for i in range(0, len(testing_tensor), batch_size):
#             batch_input = testing_tensor[i:i+batch_size].to(device)
#             batch_target = test_remaining_songs[i:i+batch_size]
#             val_output = model(batch_input)
#             val_closest_embeddings = closest_embeddings_layer(val_output)
#             val_loss = playlist_loss(batch_target, val_closest_embeddings)
#             val_loss_total += val_loss.item()

#     # Calculate the average validation loss across all batches
#     average_val_loss = val_loss_total / (len(testing_tensor) / batch_size)

#     print(f"Epoch {epoch+1}, Validation Loss: {average_val_loss:.4f}")
