In [None]:
import torch

# Sanity check: show the installed PyTorch version, then whether CUDA is usable
print(torch.__version__, torch.cuda.is_available(), sep="\n")

In [None]:
import pickle

# Mapping saved alongside the CBOW run; later cells iterate it as (track URI, row index) pairs.
# NOTE: pickle.load executes arbitrary code from the file - only load files you trust.
track2idx_path = '../data_storage/CBOW_run_1M_min_5_PP@2024-04-26-10-27-43_con5_pl1000000_emb64_ep1-track2idx.pkl'
with open(track2idx_path, 'rb') as track2idx_file:
    data_uris = pickle.load(track2idx_file)

In [None]:
import torch

# Load the saved CBOW run onto the CPU, whatever device it was written from
cbow_checkpoint_path = '../data_storage/CBOW_run_1M_min_5_PP@2024-04-26-10-27-43_con5_pl1000000_emb64_ep1.pt'
model_data = torch.load(cbow_checkpoint_path, map_location=torch.device('cpu'))

# The track embedding matrix is stored under the 'embedding.weight' key
embeddings = model_data['embedding.weight']

In [None]:
# Map every song URI to its row of the embedding matrix
embeddings_dict = {
    track_uri: embeddings[row_index]
    for track_uri, row_index in data_uris.items()
}



In [None]:
# Mean over the first axis of the embedding matrix; used later as the PAD / fallback embedding

average_embedding = embeddings.mean(dim=0)
# Display only: the unsqueezed view is not stored, so average_embedding keeps the shape computed above
average_embedding.unsqueeze(0)

In [None]:
# creation of dataframe where each row holds a playlist id, name, songs and artists

import sqlite3
import pandas as pd

# # Connect to the SQLite database
# conn = sqlite3.connect('../data_storage/spotify.db')

# # Define the SQL query to retrieve playlist information
# sql_query = '''
#     SELECT playlist_track.pid, playlist.playlist_name, artist.artist_name, track.track_uri
#     FROM playlist_track
#     JOIN track ON playlist_track.track_uri = track.track_uri
#     JOIN album ON track.album_uri = album.album_uri
#     JOIN artist ON album.artist_uri = artist.artist_uri
#     JOIN playlist ON playlist_track.pid = playlist.pid
# '''

# # Execute the SQL query and load results into a DataFrame
# df = pd.read_sql_query(sql_query, conn)

# # Group by playlist ID and aggregate artists and songs into lists
# grouped = df.groupby(['pid', 'playlist_name']).agg({
#     'artist_name': lambda x: list(x),
#     'track_uri': lambda x: list(x)
# }).reset_index()

# # Rename columns for clarity
# grouped.rename(columns={'artist_name': 'artists', 'track_uri': 'songs'}, inplace=True)

# # Close the database connection
# conn.close()

# grouped.to_csv('back_up.csv', index=False)

import ast

# Faster alternative to querying the database: read the cached playlist table
df = pd.read_csv('back_up.csv')

# The 'artists' and 'songs' columns come back from the CSV as text;
# evaluate each cell as a Python literal to restore the original objects
for list_column in ('artists', 'songs'):
    df[list_column] = df[list_column].map(ast.literal_eval)


In [None]:
# # Save time by saving dataframe to a CSV file.
# grouped.to_csv('back_up.csv', index=False)
# # read_from_csv = False

In [None]:
def _load_pickle(path):
    """Unpickle and return the object stored at `path`.

    NOTE: unpickling executes arbitrary code from the file - only use on trusted files.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Artist embeddings (looked up by artist URI further down)
artist_embeddings = _load_pickle("../data_storage/artist_embeddings.pkl")

# Playlist-name embeddings
playlist_embeddings = _load_pickle("../data_storage/playlist_name_embeddings.pkl")

In [None]:
conn = sqlite3.connect('../data_storage/spotify.db')

# SQL query to get artists and their Spotify URIs
sql_query = '''
    SELECT * FROM artist;
'''

# Execute the SQL query and load the results into the artists DataFrame.
# The connection is always closed afterwards, even if the query fails,
# so the database file handle is not leaked for the rest of the kernel session.
try:
    artist_df = pd.read_sql_query(sql_query, conn)
finally:
    conn.close()

# Create dictionary to map artist name to artist URI.
# Dictionary keys are unique, so if several artists share a name only one URI is kept.
artist_dict = artist_df.set_index('artist_name')['artist_uri'].to_dict()


In [None]:
from sklearn.model_selection import train_test_split
import random
import ast

# Clean the dataset first: keep only playlists that contain at least 20 songs
has_enough_songs = df['songs'].map(len) >= 20
df = df[has_enough_songs]

# Function to create train and test sets from the entire dataset
def createDatasets(filtered_grouped, k, read_from_csv, embeddings_dict, average_embedding, artist_dict, artist_embeddings, playlist_embeddings_dict, seed=None):
    """Split the playlists 80/20 and turn each split into model inputs and targets.

    Args:
        filtered_grouped: DataFrame with one playlist per row. The columns read are
            'songs', 'artists' and 'playlist_name' (k > 0) or 'songs' and 'pid' (k == 0).
        k: Number of seed songs sampled from each playlist. Playlists with fewer
            than k songs are skipped. With k == 0 only the playlist embedding is used.
        read_from_csv: When true, 'songs' / 'artists' cells that are still strings
            are parsed with ast.literal_eval.
        embeddings_dict: Song URI -> embedding tensor. Songs that are missing fall
            back to average_embedding.
        average_embedding: Tensor used as the fallback song embedding and as the
            template (shape, dtype) for all-zero artist / playlist embeddings.
        artist_dict: Artist name -> artist URI.
        artist_embeddings: Artist URI -> embedding.
        playlist_embeddings_dict: Playlist embeddings, looked up by playlist name
            when k > 0 and by pid when k == 0.
            NOTE(review): the two branches use different keys - confirm which one
            the stored dictionary actually uses.
        seed: Optional seed for the seed-song sampling. None (default) keeps using
            the global `random` state, as before.

    Returns:
        (x_train, y_train, x_test, y_test). Each x is the stack of per-playlist
        tensors (an empty tensor if no playlist qualified); each y is a list with,
        per playlist, the songs that were not selected as seeds.

    Raises:
        ValueError: If a song embedding taken from embeddings_dict is not a tensor.
    """
    # Split data into train and test sets
    train_df, test_df = train_test_split(filtered_grouped, test_size=0.2, random_state=42)

    # A private generator keeps seeded runs reproducible without touching global state
    rng = random.Random(seed) if seed is not None else random
    zero_embedding = torch.zeros_like(average_embedding)

    def to_vector(value):
        # as_tensor accepts tensors, arrays and lists alike and avoids the
        # copy/warning that torch.tensor() produces for existing tensors
        return torch.as_tensor(value, dtype=average_embedding.dtype)

    # Function to process each subset
    def process_data(df, read_from_csv, k):
        # Stores one input tensor per playlist
        tensors = []
        # Stores the remaining songs in the playlist to be predicted
        remaining_songs = []

        for _, row in df.iterrows():
            p_songs = row['songs']
            if read_from_csv and isinstance(p_songs, str):
                p_songs = ast.literal_eval(p_songs)

            if k == 0:
                playlist_embedding = playlist_embeddings_dict.get(row['pid'])
                # Skip input AND target together so x and y stay aligned
                if playlist_embedding is None:
                    continue
                tensors.append(torch.as_tensor(playlist_embedding, dtype=torch.float).unsqueeze(0))
                # Every song remains "unselected" since k == 0 (flat list, like the k > 0 case)
                remaining_songs.append(list(p_songs))
                continue

            if len(p_songs) < k:
                continue

            p_artists = row['artists']
            if read_from_csv and isinstance(p_artists, str):
                p_artists = ast.literal_eval(p_artists)

            # Sample positions rather than values so the matching artist is a direct lookup
            selected_positions = rng.sample(range(len(p_songs)), k)
            selected_songs = [p_songs[pos] for pos in selected_positions]
            selected_artists = [p_artists[pos] for pos in selected_positions]

            stored_playlist = playlist_embeddings_dict.get(row['playlist_name'])
            playlist_embedding = zero_embedding if stored_playlist is None else to_vector(stored_playlist)

            # Concatenate song, artist and playlist embeddings for every seed song
            p_embeddings = []
            for song, artist in zip(selected_songs, selected_artists):
                song_embedding = embeddings_dict.get(song, average_embedding)
                if not isinstance(song_embedding, torch.Tensor):
                    raise ValueError("Embedding is not a tensor.")

                artist_uri = artist_dict.get(artist, None)
                stored_artist = artist_embeddings.get(artist_uri) if artist_uri else None
                artist_embedding = zero_embedding if stored_artist is None else to_vector(stored_artist)

                combined_embedding = torch.cat((song_embedding, artist_embedding, playlist_embedding), dim=0)
                p_embeddings.append(combined_embedding)

            tensors.append(torch.stack(p_embeddings, dim=0))
            selected_lookup = set(selected_songs)
            remaining_songs.append([song for song in p_songs if song not in selected_lookup])

        # Convert the per-playlist tensors into a single batch tensor
        tensor = torch.stack(tensors, dim=0) if tensors else torch.tensor([])

        return tensor, remaining_songs

    # Process training data
    x_train, y_train = process_data(train_df, read_from_csv, k)

    # Process testing data
    x_test, y_test = process_data(test_df, read_from_csv, k)

    return x_train, y_train, x_test, y_test


In [None]:
import numpy as np

def r_precision(y_true, y_pred):
    """R-precision: share of the relevant songs found among the top-R predictions.

    R is the number of distinct relevant songs. Only the first R entries of
    `y_pred` are inspected, so relevant songs ranked below position R do not
    count (the previous version counted hits anywhere in `y_pred`).

    Args:
        y_true: Iterable of relevant song identifiers.
        y_pred: Iterable of predicted song identifiers, best first. An unordered
            collection is accepted, but its iteration order then decides the top R.

    Returns:
        A value between 0 and 1; 0 when there are no relevant songs.
    """
    # Convert to set for O(1) membership tests
    y_true_set = set(y_true)

    # Determine the number of relevant items
    r = len(y_true_set)

    # Avoid division by zero if no relevant items
    if r == 0:
        return 0

    # Keep only the R highest-ranked predictions
    top_r_predictions = list(y_pred)[:r]

    # Distinct relevant songs inside the top R (a repeated prediction counts once)
    relevant_count = len(y_true_set.intersection(top_r_predictions))

    return relevant_count / r

def recommended_songs_clicks(y_true, y_pred):
    """Return which block of 10 predictions contains the first relevant track.

    Blocks are numbered from 1 (predictions 1-10 -> 1, 11-20 -> 2, ...).
    51 is returned when no prediction is relevant.
    """
    relevant_tracks = set(y_true)

    # Zero-based position of the first hit, or None if there is no hit at all
    first_hit = next(
        (position for position, track in enumerate(y_pred) if track in relevant_tracks),
        None,
    )

    if first_hit is None:
        return 51
    return first_hit // 10 + 1

# Discounted cumulative gain
def dcg(relevances, rank):
    """Compute the DCG of `relevances`, given in ranked order.

    The first relevance counts in full; the relevance at 1-based position i >= 2
    is divided by log2(i). `rank` is accepted but not used. Returns 0 for empty input.
    """
    gains = np.array(relevances)
    if not gains.size:
        return 0

    # Discounts for positions 2..n
    discounts = np.log2(np.arange(2, gains.size + 1))
    return gains[0] + np.sum(gains[1:] / discounts)

# Ideal DCG
def idcg(relevances):
    """Return the DCG obtained when `relevances` are ordered from highest to lowest."""
    best_order = list(relevances)
    best_order.sort(reverse=True)
    return dcg(best_order, len(best_order))

# Normalised DCG
def ndcg(y_true, y_pred):
    """Return DCG of the predictions divided by their ideal DCG.

    Each prediction gets relevance 1 if it is in `y_true`, else 0. The ideal DCG
    is computed from those same relevances sorted in descending order.
    Returns 0 when the ideal DCG is 0 (no prediction is relevant).
    """
    relevances = []
    for song in y_pred:
        relevances.append(1 if song in y_true else 0)

    best_possible = idcg(relevances)
    if best_possible == 0:
        return 0

    achieved = dcg(relevances, len(relevances))
    return achieved / best_possible

# Custom loss function for training the model based on the above metrics
def playlist_loss(y_true, y_pred):
    """Average a combined ranking loss over a batch of playlists.

    Per playlist the loss is
        (1 - R-precision) + (1 - NDCG) + (clicks / 50 - 1 / 50),
    so it lies between 0 and 3.

    Args:
        y_true: Sequence with, per playlist, the collection of relevant songs.
        y_pred: Sequence with, per playlist, the predicted songs, best first.

    Returns:
        The mean loss over the batch as a plain number (0.0 for an empty batch).
        The value is computed outside of autograd, so it carries no gradient.
    """
    num_samples = len(y_true)

    # Nothing to average over: avoid dividing by zero
    if num_samples == 0:
        return 0.0

    # Weights of the individual terms
    weight_r = 1
    weight_ndcg = 1
    # Adjusted weight for clicks
    weight_clicks = 1/50

    total_loss = 0

    for i in range(num_samples):
        # The true songs only need membership tests, so a set is fine
        y_true_set = set(y_true[i])
        # Predictions must stay in ranked order: NDCG and clicks both depend on
        # position, which would be lost if they were turned into a set
        ranked_predictions = list(y_pred[i])

        # Calculate R-Precision
        r_precision_value = r_precision(y_true_set, ranked_predictions)

        # Calculate NDCG
        ndcg_value = ndcg(y_true_set, ranked_predictions)

        # Calculate clicks
        clicks_value = recommended_songs_clicks(y_true_set, ranked_predictions)

        # Calculate individual losses
        r_loss = 1 - r_precision_value
        ndcg_loss = 1 - ndcg_value

        # Subtract 1/50 to keep loss within the range of 0 to 3, as 1 click (1/50) is the minimum
        clicks_loss = clicks_value * weight_clicks - 1/50

        # Combine individual losses (out of a maximum of 3) and accumulate
        total_loss += (r_loss * weight_r) + (ndcg_loss * weight_ndcg) + clicks_loss

    # Calculate average loss
    return total_loss / num_samples


In [None]:
import torch
import torch.nn as nn

# Count how often each song URI occurs across all playlist song lists
playlist_counts = {}
for playlist_songs in df['songs']:
    for uri in playlist_songs:
        playlist_counts.setdefault(uri, 0)
        playlist_counts[uri] += 1

# Keep only the embeddings of songs counted more than 300 times
filtered_embeddings_dict = {
    uri: vector
    for uri, vector in embeddings_dict.items()
    if playlist_counts.get(uri, 0) > 300
}



In [None]:
class ClosestEmbeddingsLayer(nn.Module):
    """Look up the song URIs whose embeddings score highest against each generated embedding.

    Similarity is the dot product between a generated embedding and every
    candidate embedding. The layer has no parameters and returns Python lists
    of URIs, so no gradient flows through it.

    Args:
        num_closest: How many URIs to return per generated embedding. Fewer are
            returned when there are fewer candidates.
        embeddings_dict: Optional URI -> embedding mapping to search in. When
            None (default), the module-level `filtered_embeddings_dict` is used
            at call time.
    """
    def __init__(self, num_closest=500, embeddings_dict=None):
        super(ClosestEmbeddingsLayer, self).__init__()
        self.num_closest = num_closest
        self.embeddings_dict = embeddings_dict

    def forward(self, generated_embeddings):
        """Return, for each row of `generated_embeddings`, its closest URIs, best first."""
        candidates = self.embeddings_dict if self.embeddings_dict is not None else filtered_embeddings_dict

        # Ordered list of URIs; same order as the rows of the stacked matrix below
        uris = list(candidates.keys())
        queries = generated_embeddings.detach().to('cpu').numpy()

        # argpartition needs kth to be inside the array, so never ask for more than exist
        num_closest = min(self.num_closest, len(uris))
        if num_closest <= 0:
            return [[] for _ in queries]

        # Stack the candidates once for the whole batch instead of once per sample
        embedding_arrays = np.stack(list(candidates.values()))

        closest_embeddings_batch = []
        for query in queries:
            # Dot product between all candidate embeddings and the generated embedding
            scores = np.dot(embedding_arrays, query)
            # Indices of the num_closest highest scores (unordered)
            top_indices = np.argpartition(scores, -num_closest)[-num_closest:]
            # Sort those indices by similarity, highest first
            top_indices = top_indices[np.argsort(-scores[top_indices])]
            # URIs corresponding to these indices
            closest_embeddings_batch.append([uris[index] for index in top_indices])

        return closest_embeddings_batch

class DynamicRNN(nn.Module):
    """LSTM followed by a linear projection and a closest-embedding lookup.

    The forward pass returns whatever ClosestEmbeddingsLayer returns for the
    projected output of the last time step.
    """
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.closest_embeddings_layer = ClosestEmbeddingsLayer()

    def forward(self, x):
        # Only the per-step outputs are needed; the final (hidden, cell) state is ignored
        sequence_output, _ = self.lstm(x)
        # batch_first=True, so index 1 is the time axis: take the last step
        last_step = sequence_output[:, -1, :]
        projected = self.fc(last_step)
        # Hand the projection to the lookup layer to find the closest embeddings
        return self.closest_embeddings_layer(projected)



In [None]:
def train_model_for_k(model, x_train, y_train, epochs, batch_size, lr, k):
    """Run the training loop for one value of k and checkpoint after every epoch.

    Args:
        model: Module whose forward pass returns, per sample, a ranked list of song URIs.
        x_train: Tensor of inputs, sliced along its first axis into batches.
        y_train: Per-sample targets, sliced with the same indices as x_train.
        epochs: Number of passes over the training data.
        batch_size: Number of samples per batch.
        lr: Learning rate for the Adam optimizer.
        k: Only used in the checkpoint file name, 'model_checkpoint{k}.pth'.

    NOTE(review): playlist_loss returns a plain number, so the tensor built from
    it below is a fresh leaf that is not connected to the model's parameters.
    backward() therefore produces no parameter gradients and optimizer.step()
    leaves the weights unchanged. Training the model for real needs a
    differentiable loss computed from the model's tensor output.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Batches are moved to the device one at a time inside the loop;
    # a bare x_train.to(device) would discard its result.

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        num_batches = 0
        for i in range(0, len(x_train), batch_size):
            optimizer.zero_grad()
            batch_input = x_train[i:i+batch_size].to(device)
            batch_target = y_train[i:i+batch_size]
            print(f'Trained: {i} / {len(x_train)}')
            output = model(batch_input)
            # Calculate loss using custom loss function
            loss = playlist_loss(batch_target, output)
            total_loss += loss
            num_batches += 1
            # Print loss of each batch
            print(f'Loss: {loss}')
            # Convert loss to a tensor (see NOTE in the docstring: detached from the model)
            loss_tensor = torch.tensor(loss, requires_grad=True, device=device)
            # Compute gradients
            loss_tensor.backward()
            # Update weights
            optimizer.step()

        # Average over the batches actually processed (the last one may be
        # smaller); NaN when there was no data at all
        mean_loss = total_loss / num_batches if num_batches else float('nan')
        print(f'Epoch {epoch+1}, Loss: {mean_loss}')

        torch.save({
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            }, f'model_checkpoint{k}.pth')
    

def evaluate_model(model, x_test, y_test, batch_size):
    """Compute and print the mean playlist_loss of `model` over the test data.

    Args:
        model: Module whose forward pass returns, per sample, a ranked list of song URIs.
        x_test: Tensor of inputs, sliced along its first axis into batches.
        y_test: Per-sample targets, sliced with the same indices as x_test.
        batch_size: Number of samples per batch.

    Returns:
        The mean of the per-batch losses (NaN when x_test is empty).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    total_loss = 0
    num_batches = 0
    with torch.no_grad():
        for i in range(0, len(x_test), batch_size):
            batch_input = x_test[i:i+batch_size].to(device)
            batch_target = y_test[i:i+batch_size]
            val_output = model(batch_input)
            total_loss += playlist_loss(batch_target, val_output)
            num_batches += 1

    # Divide by the number of batches actually evaluated; len / batch_size is
    # fractional when the last batch is partial and zero when there is no data
    mean_loss = total_loss / num_batches if num_batches else float('nan')
    print(f'Test Loss: {mean_loss}')
    return mean_loss


# Hyperparameters shared by the training and evaluation cells
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
epochs, batch_size, lr = 1, 32, 0.01

# input_size: length-192 combined track, artist and playlist embedding
# hidden_size: width of the LSTM hidden state
# output_size: length-64 track embedding produced by the model
input_size, hidden_size, output_size = 192, 128, 64



In [None]:
# Reduce dataset size to speed up training if necessary.
# random_state makes the 5% subsample reproducible across kernel restarts.
# Note that df is overwritten, so re-running this cell subsamples again.
df = df.sample(frac=0.05, random_state=42)

k_values = [1, 5, 10, 25, 100]
# k_values = [10]

# Train and evaluate model for each value of k
for k in k_values:
    print(f'Training model for k = {k}')
    # k == 0 uses a 64-wide input; every other k uses the configured input_size.
    # A per-iteration variable is used so the global input_size is never
    # overwritten for the k values that follow.
    model_input_size = 64 if k == 0 else input_size
    # Create datasets using defined function
    x_train, y_train, x_test, y_test = createDatasets(df, k, True, filtered_embeddings_dict, average_embedding, artist_dict, artist_embeddings, playlist_embeddings)
    # Create instance of the defined RNN model
    model = DynamicRNN(input_size=model_input_size, hidden_size=hidden_size, output_size=output_size).to(device)
    # Train the model using defined training loop
    train_model_for_k(model, x_train, y_train, epochs=epochs, batch_size=batch_size, lr=lr, k=k)
    # Save model as pickle file
    with open(f'model{k}.pkl', 'wb') as f:
        pickle.dump(model, f)

    # Evaluate the trained model
    evaluate_model(model, x_test, y_test, batch_size)

In [None]:
## FOR LOADING MODEL FROM CHECKPOINT

# Checkpoints are written per k as f'model_checkpoint{k}.pth' by train_model_for_k,
# so choose which trained k to restore (must be one of the k values that was trained)
checkpoint_k = 10

model = DynamicRNN(input_size=192, hidden_size=128, output_size=64).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine (and vice versa)
checkpoint = torch.load(f'model_checkpoint{checkpoint_k}.pth', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])