In [1]:
import json
import pandas as pd
import numpy as np
import os
import seaborn as sns
import re
from collections import *
from tqdm import tqdm
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix, vstack, csr_array
import time
import random
import webbrowser
import pickle, gzip, joblib, shelve
import tkinter as tk
from tkinter import ttk, font
import threading, time
from itertools import islice, combinations
from datetime import datetime
from scipy.sparse.linalg import eigsh

In [2]:
# Locate the dataset slice files. The playlist total printed below assumes
# every file in the directory contributes 1000 playlists.
directory_path = 'data'
filenames = os.listdir(directory_path)
filenames.sort()
print(f"{len(filenames) * 1000} playlists")

1000000 playlists


In [3]:
# Restrict processing to the first slice file only (filenames[0]).
fullpaths = [f"{directory_path}/{name}" for name in filenames[:1]]
# When True, the cells below pickle their results into the 'song_data' folder.
save = False

In [4]:
# Build the song-to-song co-occurrence counts:
#   song_relationships[a][b] == number of playlists containing both a and b.
# The mapping is kept symmetric (both [a][b] and [b][a] are updated).
song_relationships = {}
iteration_times = []

# Not used by this cell; kept because it is notebook-level state.
song_uris_set = set()

for idx, path in enumerate(fullpaths):
    start_time = time.time()

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(path, encoding='utf-8') as f:
        mpd_slice = json.load(f)

    playlists_data = mpd_slice['playlists']

    for playlist in playlists_data:
        # De-duplicate the playlist's tracks while keeping playlist order.
        # A plain set would also de-duplicate, but its iteration order for
        # strings changes between interpreter runs, which made the insertion
        # order of song_relationships (and the indices later derived from its
        # key order) differ from run to run.
        playlist_songs = list(dict.fromkeys(
            track['track_uri'].split(':')[-1] for track in playlist['tracks']
        ))

        # Every unordered pair of distinct songs co-occurs exactly once in
        # this playlist, so each pair contributes 1 in both directions.
        for song1, song2 in combinations(playlist_songs, 2):
            related_to_song1 = song_relationships.setdefault(song1, {})
            related_to_song1[song2] = related_to_song1.get(song2, 0) + 1
            related_to_song2 = song_relationships.setdefault(song2, {})
            related_to_song2[song1] = related_to_song2.get(song1, 0) + 1

    end_time = time.time()
    iteration_time = end_time - start_time
    iteration_times.append(iteration_time)

    # Periodic progress report: total time of the last 10 files.
    if idx % 10 == 0 and idx > 0:
        t = np.sum(iteration_times[-10:])
        print(f"processing {idx - 10}-{idx} - time taken {t:.2f}")

print(f'{len(song_relationships)} songs processed')

if save:
    # File name prefix is hour_day_month_year of the save time.
    formatted_time = datetime.now().strftime('%H_%d_%m_%Y')

    save_path = os.path.join('song_data', f'{formatted_time}_song_relationships.gz')

    with gzip.open(save_path, 'wb') as f:
        pickle.dump(song_relationships, f, protocol = pickle.HIGHEST_PROTOCOL)
    print(f"'song_relationships' saved to {save_path}")

34443 songs processed


In [None]:
# Alternative to rebuilding: restore a previously pickled `song_relationships`.
# NOTE: pickle.load can execute code embedded in the file, so only open
# archives that were produced by this notebook.
save_path = os.path.join('song_data', '08_13_08_2023_song_relationships.gz')

with gzip.open(save_path, mode='rb') as archive:
    song_relationships = pickle.load(archive)

In [5]:
# Map each song URI that appears in song_relationships to its display
# metadata (song, album and artist names). If a song occurs several times the
# last occurrence read wins.
song_data_map = {}
for idx, path in enumerate(fullpaths):
    if idx % 10 == 0 and idx > 0:
        print(f"Processed {idx-10}-{idx}")
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(path, encoding='utf-8') as f:
        mpd_slice = json.load(f)
    playlists_data = mpd_slice['playlists']
    for playlist in playlists_data:
        for track in playlist['tracks']:
            song_uri = track['track_uri'].split(':')[-1]
            # Songs without any co-occurrence entry are never indexed, so
            # skip them before building the metadata record.
            if song_uri not in song_relationships:
                continue
            song_data_map[song_uri] = {
                'song_name': track['track_name'],
                'album_name': track['album_name'],
                'artist_name': track['artist_name'],
            }

print(f'{len(song_data_map)} songs processed')

if save:
    formatted_time = datetime.now().strftime('%H_%d_%m_%Y')
    save_path = os.path.join('song_data', f'{formatted_time}_song_data_map.gz')

    with gzip.open(save_path, 'wb') as f:
        # Same pickle protocol as the other artefacts saved by this notebook.
        pickle.dump(song_data_map, f, protocol = pickle.HIGHEST_PROTOCOL)
    print(f"'song_data_map' saved to {save_path}")

34443 songs processed


In [None]:
# Alternative to rebuilding: restore a previously pickled `song_data_map`.
# NOTE: pickle.load can execute code embedded in the file, so only open
# archives that were produced by this notebook.
save_path = os.path.join('song_data', '08_13_08_2023_song_data_map.gz')

with gzip.open(save_path, mode='rb') as archive:
    song_data_map = pickle.load(archive)

In [6]:
# Give every song a dense integer id (0, 1, 2, ...) following the key order of
# song_relationships; these ids are the row/column positions of the matrix.
song_indices = dict(zip(song_relationships, range(len(song_relationships))))

if save:
    formatted_time = datetime.now().strftime('%H_%d_%m_%Y')
    save_path = os.path.join('song_data', f'{formatted_time}_song_indices.gz')
    with gzip.open(save_path, 'wb') as archive:
        pickle.dump(song_indices, archive, protocol = pickle.HIGHEST_PROTOCOL)
    print(f"'song_indices' saved to {save_path}")

num_songs = len(song_indices)

print(num_songs)

34443


In [None]:
# Alternative to rebuilding: restore a previously pickled `song_indices` and
# recompute `num_songs` from it.
# NOTE: pickle.load can execute code embedded in the file, so only open
# archives that were produced by this notebook.
save_path = os.path.join('song_data', '08_13_08_2023_song_indices.gz')

with gzip.open(save_path, mode='rb') as archive:
    song_indices = pickle.load(archive)

num_songs = len(song_indices)
print(num_songs)

In [7]:
# Find the song that co-occurs with the largest number of distinct songs.
# On ties the first song in song_relationships order is reported; if no song
# has any connection the result stays (None, 0).
max_connections = 0
song_with_most_connections = None

if song_relationships:
    candidate = max(song_relationships, key=lambda uri: len(song_relationships[uri]))
    if len(song_relationships[candidate]) > 0:
        song_with_most_connections = candidate
        max_connections = len(song_relationships[candidate])

print("Song with the most connections:", song_data_map[song_with_most_connections])
print("Number of connections:", max_connections)

Song with the most connections: {'song_name': 'One Dance', 'album_name': 'Views', 'artist_name': 'Drake'}
Number of connections: 3520


In [8]:
# Empty num_songs x num_songs int32 sparse accumulator; update_matrix() adds
# one chunk of co-occurrence counts to it at a time.
full_matrix = csr_matrix((num_songs, num_songs), dtype=np.int32)

def slice_dict(d, start, end):
    """Return a new dict holding the items of `d` at positions [start, end).

    Positions follow the iteration order of `d`; a range that runs past the
    end of `d` simply yields fewer (possibly zero) items.
    """
    window = islice(d.items(), start, end)
    return {key: value for key, value in window}

def update_matrix(chunk):
    """Add one chunk of song relationships to the global `full_matrix`.

    `chunk` maps a song URI to a dict of {related song URI: count}. URIs are
    translated to matrix positions through the global `song_indices`. Only
    entries whose row index is <= their column index are read; each of them
    is written at (row, col) and, when off the diagonal, mirrored at
    (col, row). Entries below the diagonal in `chunk` are therefore ignored
    and the lower triangle comes solely from mirroring.
    """
    global full_matrix

    # (row, col, value) triples for the sparse constructor.
    entries = []
    for song_uri, relationships in chunk.items():
        row_idx = song_indices[song_uri]
        for related_song_uri, count in relationships.items():
            col_idx = song_indices[related_song_uri]

            # Lower-triangle inputs are skipped; see docstring.
            if row_idx > col_idx:
                continue

            entries.append((row_idx, col_idx, count))
            if row_idx != col_idx:
                # Mirror the off-diagonal entry to keep the matrix symmetric.
                entries.append((col_idx, row_idx, count))

    if entries:
        row_indices, col_indices, data = (list(column) for column in zip(*entries))
    else:
        row_indices, col_indices, data = [], [], []

    # Sparse matrix holding just this chunk's contribution.
    temp_matrix = csr_matrix((data, (row_indices, col_indices)), shape=(num_songs, num_songs), dtype=np.int32)

    # Accumulate into the running total.
    full_matrix += temp_matrix

# Feed song_relationships to update_matrix() in chunks of `chunk_size` songs.
chunk_size = 1000

# Ceiling division: a trailing partial chunk still counts as one chunk.
num_chunks = (num_songs + chunk_size - 1) // chunk_size

for chunk_number, start_index in enumerate(range(0, num_songs, chunk_size), start=1):
    end_index = min(start_index + chunk_size, num_songs)
    current_relationships = slice_dict(song_relationships, start_index, end_index)
    update_matrix(current_relationships)
    print(f"processed chunk {chunk_number}/{num_chunks}")

# The accumulated full_matrix is the final co-occurrence matrix.
cooccurrence_matrix = full_matrix

print(f'finished processing matrix size: {cooccurrence_matrix.shape}')

if save:
    formatted_time = datetime.now().strftime('%H_%d_%m_%Y')
    save_path = os.path.join('song_data', f'{formatted_time}_cooccurrence_matrix.gz')
    with gzip.open(save_path, 'wb') as archive:
        pickle.dump(cooccurrence_matrix, archive, protocol = pickle.HIGHEST_PROTOCOL)
    print(f"'cooccurrence_matrix' saved to {save_path}")

processed chunk 1/35
processed chunk 2/35
processed chunk 3/35
processed chunk 4/35
processed chunk 5/35
processed chunk 6/35
processed chunk 7/35
processed chunk 8/35
processed chunk 9/35
processed chunk 10/35
processed chunk 11/35
processed chunk 12/35
processed chunk 13/35
processed chunk 14/35
processed chunk 15/35
processed chunk 16/35
processed chunk 17/35
processed chunk 18/35
processed chunk 19/35
processed chunk 20/35
processed chunk 21/35
processed chunk 22/35
processed chunk 23/35
processed chunk 24/35
processed chunk 25/35
processed chunk 26/35
processed chunk 27/35
processed chunk 28/35
processed chunk 29/35
processed chunk 30/35
processed chunk 31/35
processed chunk 32/35
processed chunk 33/35
processed chunk 34/35
processed chunk 35/35
finished processing matrix size: (34443, 34443)


In [None]:
# Alternative to rebuilding: restore a previously pickled `cooccurrence_matrix`.
# NOTE: pickle.load can execute code embedded in the file, so only open
# archives that were produced by this notebook.
save_path = os.path.join('song_data', '15_13_08_2023_cooccurrence_matrix.gz')

with gzip.open(save_path, mode='rb') as archive:
    cooccurrence_matrix = pickle.load(archive)

In [9]:
# transform co-occurrence matrix into a column stochastic matrix (transition matrix)
#
# The normalisation is done on the sparse representation. Converting to a
# dense array first needs num_songs**2 float64 values (about 9.5 GB for the
# 34443-song matrix built above) and a Python-level loop over every column.
cooccurrence_matrix = csr_matrix(cooccurrence_matrix, dtype=np.float64)

column_sums = np.asarray(cooccurrence_matrix.sum(axis=0)).ravel()

# Reciprocal of each column sum. Columns that sum to zero keep a factor of 0
# so they stay all-zero instead of turning into NaN through 0/0.
inverse_column_sums = np.zeros_like(column_sums)
nonzero_columns = column_sums != 0
inverse_column_sums[nonzero_columns] = 1.0 / column_sums[nonzero_columns]

# Multiplying by a dense row vector scales column j by inverse_column_sums[j];
# the result is converted back to CSR for the matrix products done later.
cooccurrence_matrix = csr_matrix(
    cooccurrence_matrix.multiply(inverse_column_sums.reshape(1, -1))
)

In [19]:
def user_playlist_vector(playlist_songs, song_indices, num_songs):
    """Build the start vector for a playlist.

    Returns a float array of length `num_songs` that is zero everywhere except
    at the positions of the playlist's songs, which are set to
    1 / len(playlist_songs). An empty playlist yields an all-zero vector.

    Raises:
        KeyError: if a song in `playlist_songs` is missing from `song_indices`.
    """
    positions = [song_indices[uri] for uri in playlist_songs]
    vector = np.zeros(num_songs)
    if positions:
        vector[positions] = 1 / len(playlist_songs)
    return vector

def get_recommendation_vector(user_vector, transition_matrix, steps, damping = True, damping_factor = 0.85):
    """Propagate `user_vector` through `transition_matrix` for `steps` steps.

    Each step computes ``p <- p @ transition_matrix`` with `p` treated as a
    row vector. With damping the update is
    ``p <- damping_factor * (p @ transition_matrix) + (1 - damping_factor) / n``
    where ``n`` is the length of `user_vector`.

    Args:
        user_vector: 1-D start vector (dense array-like, or a 1 x n sparse row).
        transition_matrix: n x n matrix (scipy sparse or dense ndarray).
        steps: number of propagation steps; 0 returns the start vector.
        damping: whether to mix in the uniform jump term at every step.
        damping_factor: weight of the propagated part when `damping` is True.

    Returns:
        1-D float ndarray of length n with the final scores.

    NOTE(review): the notebook normalises the *columns* of the matrix, yet the
    step multiplies by a row vector from the left, so the total mass of `p` is
    generally not preserved. The orientation is kept as it was so results do
    not change; confirm whether ``transition_matrix @ p`` was intended.
    """
    if hasattr(user_vector, "toarray"):
        # Accept a sparse row vector as well.
        user_vector = user_vector.toarray()
    p = np.array(user_vector, dtype=float).ravel()

    # The vector length defines n; no dependence on notebook-level globals.
    n = p.shape[0]
    if n == 0:
        return p

    # For a row vector, p @ T equals T.T @ p. Using the sparse-matrix @ dense
    # vector form keeps p a dense 1-D array and avoids relying on np.dot,
    # which is not aware of scipy sparse objects.
    step_matrix = transition_matrix.T
    jump = (1 - damping_factor) / n

    for _ in range(steps):
        p = np.asarray(step_matrix @ p, dtype=float).ravel()
        if damping:
            p = damping_factor * p + jump
    return p

def get_top_recommendations(scores, song_data_map, song_indices, n=10):
    """Return the metadata of the `n` highest-scoring songs, best first.

    Args:
        scores: 1-D array of scores, one per matrix index.
        song_data_map: mapping of song URI -> metadata record.
        song_indices: mapping of song URI -> matrix index.
        n: number of songs wanted. Values larger than len(scores) are clamped
            and values <= 0 give an empty list.

    Returns:
        List of `song_data_map` records ordered by descending score.
    """
    scores = np.asarray(scores)

    # Clamp n: argpartition rejects a kth outside the array, and a slice of
    # [-0:] would select the whole array rather than nothing.
    n = min(int(n), scores.shape[0])
    if n <= 0:
        return []

    # Get the top n indices without sorting the entire array...
    top_indices = np.argpartition(scores, -n)[-n:]
    # ...then sort only those n, highest score first.
    top_indices_sorted = top_indices[np.argsort(scores[top_indices])][::-1]

    # Invert the URI -> index mapping explicitly rather than assuming that the
    # dict's iteration order matches the index values.
    uri_by_index = {index: uri for uri, index in song_indices.items()}
    return [song_data_map[uri_by_index[int(i)]] for i in top_indices_sorted]

def recommend_songs(user_playlist, song_indices, matrix, song_data_map, steps, n=10):
    """Recommend `n` songs for `user_playlist`.

    Chains the three helpers: build the playlist's start vector, propagate it
    through `matrix` for `steps` steps (damping on, factor 0.85), then return
    the metadata of the `n` best-scoring songs.
    """
    start_vector = user_playlist_vector(user_playlist, song_indices, len(song_indices))
    scores = get_recommendation_vector(
        start_vector, matrix, steps, damping = True, damping_factor = 0.85
    )
    return get_top_recommendations(scores, song_data_map, song_indices, n)

In [21]:
class SongRecommendationApp(tk.Tk):
    """Tk window for searching songs, building a playlist and showing recommendations.

    Workflow: filter songs by song/artist/album, click a result to add it to
    the playlist, then press "Generate Recommendations". Double-clicking a
    recommendation opens the track on open.spotify.com.

    Args:
        song_data_map: mapping of song URI -> dict with 'song_name',
            'artist_name' and 'album_name'.
    """

    # Maximum number of rows shown in the search-results listbox.
    MAX_SEARCH_RESULTS = 300
    # Used when the "Number of Recommendations" entry is blank or not a
    # positive integer.
    DEFAULT_NUM_RECOMMENDATIONS = 10
    # Number of propagation steps passed to recommend_songs().
    RANDOM_WALK_STEPS = 500

    def __init__(self, song_data_map):
        super().__init__()

        default_font = font.nametofont("TkDefaultFont")
        default_font.configure(family="Courier")

        self.song_data_map = song_data_map
        # Display string -> URI. Songs whose three names are identical share a
        # display string; the URI seen last wins for such collisions.
        self.uri_map = {self.format_song_display(song_info): uri for uri, song_info in song_data_map.items()}
        self.playlist_data = []  # Store song data for sorting

        # Filter Frame
        self.filter_frame = ttk.Frame(self)
        self.filter_frame.pack(pady=10)

        # Label and Entry for Song
        self.song_label = ttk.Label(self.filter_frame, text="Song")
        self.song_label.grid(row=0, column=0, padx=5)
        self.song_entry = ttk.Entry(self.filter_frame)
        self.song_entry.grid(row=1, column=0, padx=5)

        # Label and Entry for Artist
        self.artist_label = ttk.Label(self.filter_frame, text="Artist")
        self.artist_label.grid(row=0, column=1, padx=5)
        self.artist_entry = ttk.Entry(self.filter_frame)
        self.artist_entry.grid(row=1, column=1, padx=5)

        # Label and Entry for Album
        self.album_label = ttk.Label(self.filter_frame, text="Album")
        self.album_label.grid(row=0, column=2, padx=5)
        self.album_entry = ttk.Entry(self.filter_frame)
        self.album_entry.grid(row=1, column=2, padx=5)

        # Timestamp reserved for debounce logic; nothing in this class reads it.
        self.last_time = time.time()

        self.search_button = ttk.Button(self.filter_frame, text="Search", command=self.display_search_results)
        self.search_button.grid(row=2, columnspan=3, pady=10)

        width = 200
        # Songs Listbox
        self.songs_listbox = tk.Listbox(self, selectmode=tk.SINGLE, width=width, font=("Courier", 10))
        self.songs_listbox.pack(pady=10)

        # Selecting a search result adds it to the playlist.
        self.songs_listbox.bind('<<ListboxSelect>>', self.add_to_playlist)

        # Playlist Listbox
        self.playlist_listbox = tk.Listbox(self, bg="lightblue", selectmode=tk.SINGLE, width=width, font=("Courier", 10))
        self.playlist_listbox.pack(pady=10)

        # Number of recommendations
        self.n_label = ttk.Label(self, text="Number of Recommendations:")
        self.n_label.pack(pady=5)
        self.n_entry = ttk.Entry(self)
        self.n_entry.pack(pady=5)

        # Button to generate recommendations
        self.btn_recommend = ttk.Button(self, text="Generate Recommendations", command=self.generate_recommendations)
        self.btn_recommend.pack(pady=10)

        # Recommendations Listbox
        self.recommendations_listbox = tk.Listbox(self, bg="lightgreen", selectmode=tk.SINGLE, width=width, font=("Courier", 10))
        self.recommendations_listbox.pack(pady=10)

        self.recommendations_listbox.bind('<Double-Button-1>', self.open_in_spotify)

        self.btn_refresh = ttk.Button(self, text="Refresh", command=self.refresh)
        self.btn_refresh.pack(pady=10)

    def refresh(self):
        """Reset the window: clear every entry field and every listbox."""
        # Clear all fields
        self.song_entry.delete(0, tk.END)
        self.artist_entry.delete(0, tk.END)
        self.album_entry.delete(0, tk.END)
        self.n_entry.delete(0, tk.END)

        # Clear listboxes
        self.songs_listbox.delete(0, tk.END)
        self.playlist_listbox.delete(0, tk.END)
        self.recommendations_listbox.delete(0, tk.END)

    def open_in_spotify(self, event):
        """Open the selected recommendation in the web browser."""
        selected_index = self.recommendations_listbox.curselection()
        if selected_index:
            selected_song = self.recommendations_listbox.get(selected_index[0])
            song_uri = self.uri_map[selected_song]
            webbrowser.open(f"https://open.spotify.com/track/{song_uri}")

    def _search(self, song_query, artist_query, album_query):
        """Return display rows for the songs matching all three queries.

        Each query must already be lower-cased and is matched as a substring
        of the corresponding (lower-cased) field; an empty query matches
        everything. Rows are sorted by album name and capped at
        MAX_SEARCH_RESULTS.
        """
        matches = [
            song_info
            for song_info in self.song_data_map.values()
            if song_query in song_info['song_name'].lower()
            and artist_query in song_info['artist_name'].lower()
            and album_query in song_info['album_name'].lower()
        ]
        # Stable sort: songs of the same album keep their song_data_map order.
        matches.sort(key=lambda song_info: song_info['album_name'])
        return [self.format_song_display(song_info) for song_info in matches[:self.MAX_SEARCH_RESULTS]]

    def display_search_results(self):
        """Fill the search listbox with the songs matching the filter entries."""
        results = self._search(
            self.song_entry.get().lower(),
            self.artist_entry.get().lower(),
            self.album_entry.get().lower(),
        )

        # The cap is applied before inserting, so at most MAX_SEARCH_RESULTS
        # rows are ever pushed into the widget.
        self.songs_listbox.delete(0, tk.END)
        for display_name in results:
            self.songs_listbox.insert(tk.END, display_name)

    def format_song_display(self, song_info):
        """Format a song as fixed-width columns: song (65), artist (35), album (35)."""
        formatted_str = "{:<65}{:<35}{:<35}"
        f_string = formatted_str.format(song_info['song_name'], song_info['artist_name'], song_info['album_name'])
        return f_string

    def add_to_playlist(self, event):
        """Append the selected search result to the playlist unless already present."""
        selected_index = self.songs_listbox.curselection()
        if selected_index:  # This checks if there's any selection at all
            selected_song = self.songs_listbox.get(selected_index[0])
            if selected_song not in self.playlist_listbox.get(0, tk.END):  # Prevent duplicates
                self.playlist_listbox.insert(tk.END, selected_song)

    def _parse_recommendation_count(self, text):
        """Parse the requested number of recommendations.

        Returns DEFAULT_NUM_RECOMMENDATIONS when `text` is blank, not an
        integer, or not positive, so a bad entry never raises inside the
        button callback.
        """
        try:
            count = int(text.strip())
        except ValueError:
            return self.DEFAULT_NUM_RECOMMENDATIONS
        return count if count > 0 else self.DEFAULT_NUM_RECOMMENDATIONS

    def generate_recommendations(self):
        """Compute recommendations for the current playlist and display them.

        Prints the elapsed wall-clock time in seconds when done.
        """
        s = time.time()
        playlist_display_names = list(self.playlist_listbox.get(0, tk.END))
        playlist_uris = [self.uri_map[display_name] for display_name in playlist_display_names]  # Extract URIs

        n = self._parse_recommendation_count(self.n_entry.get())
        # NOTE: song_indices, cooccurrence_matrix and recommend_songs are
        # notebook-level names; they must be defined before the button is used.
        recommended_songs = recommend_songs(
            playlist_uris, song_indices, cooccurrence_matrix, self.song_data_map, self.RANDOM_WALK_STEPS, n
        )

        self.recommendations_listbox.delete(0, tk.END)
        for song in recommended_songs:
            formatted_song = self.format_song_display(song)
            self.recommendations_listbox.insert(tk.END, formatted_song)
        e = time.time()
        print(e - s)

# Launch the GUI with the song metadata built above. mainloop() runs the Tk
# event loop, so this cell keeps executing until the window is closed.
if __name__ == "__main__":
    app = SongRecommendationApp(song_data_map)
    app.mainloop()

8.902404069900513
9.07368803024292
8.903209924697876
