In [9]:
import json
import pandas as pd
import numpy as np
import os
import seaborn as sns
import re
from collections import *
from tqdm import tqdm
import matplotlib.pyplot as plt
import plotly.express as px
from scipy.sparse import csr_matrix, vstack, csr_array
import time
import random
import webbrowser
import pickle, gzip, joblib, shelve
import tkinter as tk
from tkinter import ttk, font
import threading, time
from itertools import islice, combinations
from datetime import datetime
import h5py
from multiprocessing import Pool, cpu_count

In [10]:
# Folder holding the raw slice files. The listing is sorted so that the file
# order (and therefore any "first N files" subset) is deterministic.
directory_path = 'data/raw'
filenames = os.listdir(directory_path)
filenames.sort()
# The headline figure assumes 1000 playlists per file.
print(f"{len(filenames) * 1000} playlists")

1000000 playlists


In [11]:
# Restrict the run to the first NUM_SLICE_FILES files of the sorted listing.
# At the 1000 playlists per file used for the total printed earlier, 985 files
# is roughly 985,000 playlists (not 30,000).
NUM_SLICE_FILES = 985
fullpaths = [directory_path + '/' + f for f in filenames[:NUM_SLICE_FILES]]
# When True, the build cells also pickle their results to disk.
save = True

In [None]:
# Build the song-to-song co-occurrence counts: for every playlist, each pair of
# distinct tracks in it gets its count increased by one, in both directions.
song_relationships = {}
iteration_times = []

song_uris_set = set()  # never filled in this cell; kept in case another cell reads it

for idx, path in enumerate(fullpaths):
    start_time = time.time()

    # Read as UTF-8 explicitly rather than relying on the platform default encoding.
    with open(path, encoding='utf-8') as f:
        mpd_slice = json.load(f)

    for playlist in mpd_slice['playlists']:
        # Keep only the part after the last ':' of each track URI; using a set
        # means a track repeated inside one playlist is counted once.
        songs_set = {track['track_uri'].split(':')[-1] for track in playlist['tracks']}

        # combinations() over a set yields every unordered pair exactly once,
        # so each pair contributes exactly 1 for this playlist.
        for song1, song2 in combinations(songs_set, 2):
            forward = song_relationships.setdefault(song1, {})
            forward[song2] = forward.get(song2, 0) + 1
            backward = song_relationships.setdefault(song2, {})
            backward[song1] = backward.get(song1, 0) + 1

    iteration_times.append(time.time() - start_time)

    # Report once ten more files are done, so the label matches the ten files
    # whose timings are summed.
    if (idx + 1) % 10 == 0:
        t = np.sum(iteration_times[-10:])
        print(f"processing {idx - 9}-{idx} - time taken {t:.2f}")

print(f'{len(song_relationships)} songs processed')

if save:
    formatted_time = datetime.now().strftime('%H_%d_%m_%Y')

    # gzip.open does not create missing folders.
    os.makedirs('song_data', exist_ok=True)
    save_path = os.path.join('song_data', f'{formatted_time}_song_relationships.gz')

    with gzip.open(save_path, 'wb') as f:
        pickle.dump(song_relationships, f, protocol = pickle.HIGHEST_PROTOCOL)
    print(f"'song_relationships' saved to {save_path}")

In [4]:
# Restore a previously pickled song_relationships dict instead of rebuilding it.
# NOTE: unpickling can execute arbitrary code; only open archives you created.

save_path = os.path.join('song_data', '18_14_08_2023_song_relationships.gz')

with gzip.open(save_path, mode='rb') as f:
    song_relationships = pickle.load(f)

In [None]:
# Collect display metadata (names and artist/album URIs) for every song that
# has an entry in song_relationships.
song_data_map = {}
for idx, path in enumerate(fullpaths):
    if idx % 100 == 0 and idx > 0:
        print(f"Processed {idx-100}-{idx}")
    # Read as UTF-8 explicitly rather than relying on the platform default encoding.
    with open(path, encoding='utf-8') as f:
        mpd_slice = json.load(f)
    playlists_data = mpd_slice['playlists']
    for playlist in playlists_data:
        for track in playlist['tracks']:
            song_uri = track['track_uri'].split(':')[-1]
            # Test membership first so no record is built for songs we skip.
            if song_uri not in song_relationships:
                continue
            # A later occurrence of the same song overwrites the earlier record.
            song_data_map[song_uri] = {
                'song_name': track['track_name'],
                'album_name': track['album_name'],
                'artist_name': track['artist_name'],
                'artist_uri': track['artist_uri'],
                'album_uri': track['album_uri'],
            }

print(f'{len(song_data_map)} songs processed')

if save:
    formatted_time = datetime.now().strftime('%H_%d_%m_%Y')
    # gzip.open does not create missing folders.
    os.makedirs('song_data', exist_ok=True)
    save_path = os.path.join('song_data', f'{formatted_time}_song_data_map.gz')

    with gzip.open(save_path, 'wb') as f:
        # Same pickle protocol as the other save cells in this notebook.
        pickle.dump(song_data_map, f, protocol = pickle.HIGHEST_PROTOCOL)
    print(f"'song_data_map' saved to {save_path}")

In [6]:
# to load song_data_map from storage
# NOTE: unpickling can execute arbitrary code; only open archives you created.
# (The file name is a plain literal, so no f-string prefix is needed.)
save_path = os.path.join('song_data', '20_14_08_2023_song_data_map.gz')

with gzip.open(save_path, 'rb') as f:
    song_data_map = pickle.load(f)

print(len(song_data_map))

2244869


In [None]:
# Give every song a dense integer id: its position in song_relationships' key
# order. These ids are used as matrix row/column numbers further down.
song_indices = dict(zip(song_relationships, range(len(song_relationships))))

if save:
    formatted_time = datetime.now().strftime('%H_%d_%m_%Y')
    save_path = os.path.join('song_data', f'{formatted_time}_song_indices.gz')
    with gzip.open(save_path, mode='wb') as f:
        pickle.dump(song_indices, f, protocol=pickle.HIGHEST_PROTOCOL)
    print(f"'song_indices' saved to {save_path}")

# Number of distinct songs, i.e. the side length of the square matrices.
num_songs = len(song_indices)

print(num_songs)

In [5]:
# to load song_indices from storage
# NOTE: unpickling can execute arbitrary code; only open archives you created.
# (The file name is a plain literal, so no f-string prefix is needed.)
save_path = os.path.join('song_data', '20_14_08_2023_song_indices.gz')

with gzip.open(save_path, 'rb') as f:
    song_indices = pickle.load(f)

# Number of distinct songs, i.e. the side length of the square matrices.
num_songs = len(song_indices)
print(num_songs)

2244869


In [9]:
# Find the song whose neighbour dict is the largest. Only a strictly larger
# count replaces the current best, so the first song wins on ties.
max_connections = 0
song_with_most_connections = None

for song in song_relationships:
    num_connections = len(song_relationships[song])
    if num_connections <= max_connections:
        continue
    max_connections, song_with_most_connections = num_connections, song

print("Song with the most connections:", song_data_map[song_with_most_connections])
print("Number of connections:", max_connections)


Song with the most connections: {'song_name': 'Closer', 'album_name': 'Closer', 'artist_name': 'The Chainsmokers', 'artist_uri': 'spotify:artist:69GGBxA162lTqCwzJG5jLp', 'album_uri': 'spotify:album:0rSLgV8p5FzfnqlEk4GzxE'}
Number of connections: 203549


In [17]:
def slice_dict(d, start, end):
    """Return a new dict with the items of ``d`` at positions ``start`` to ``end - 1``.

    Positions follow the dict's iteration order. The items are reached by
    iterating from the beginning, so the cost grows with ``end``.
    """
    window = islice(d.items(), start, end)
    return {key: value for key, value in window}

def update_matrix(matrix, chunk):
    """Return ``matrix`` plus the co-occurrence counts contained in ``chunk``.

    ``chunk`` maps a song id to a dict of ``{related song id: count}``. Ids are
    translated to row/column numbers through the module-level ``song_indices``
    and the result is ``num_songs`` x ``num_songs`` (also module-level).

    Only entries whose row number is <= the column number are used; each such
    off-diagonal entry is written at (row, col) and mirrored at (col, row).
    Entries with row > col are ignored, which presumably relies on the input
    holding every pair in both directions -- verify against the producer.
    """
    values, rows, cols = [], [], []

    for uri, neighbours in chunk.items():
        r = song_indices[uri]
        for other_uri, count in neighbours.items():
            c = song_indices[other_uri]

            # Lower-triangle entries are skipped; their mirror image covers them.
            if r > c:
                continue

            values.append(count)
            rows.append(r)
            cols.append(c)

            # Mirror everything except the diagonal.
            if r != c:
                values.append(count)
                rows.append(c)
                cols.append(r)

    # Sparse matrix holding just this chunk's contribution.
    addition = csr_matrix((values, (rows, cols)), shape=(num_songs, num_songs), dtype=np.int32)

    return matrix + addition

# Assemble the song-by-song co-occurrence matrix by feeding song_relationships
# to update_matrix in chunks of chunk_size songs.
cooccurrence_matrix = csr_matrix((num_songs, num_songs), dtype=np.int32)

chunk_size = 10000

num_chunks = (num_songs + chunk_size - 1) // chunk_size

# Walk the dict with ONE shared iterator. Slicing from the start of the dict
# for every chunk would re-read all earlier items each time.
relationship_items = iter(song_relationships.items())

for start_index in range(0, num_songs, chunk_size):
    end_index = min(start_index + chunk_size, num_songs)
    # Take the next (end_index - start_index) items, i.e. positions start_index..end_index-1.
    current_relationships = dict(islice(relationship_items, end_index - start_index))
    cooccurrence_matrix = update_matrix(cooccurrence_matrix, current_relationships)
    print(f"processed chunk {start_index//chunk_size + 1}/{num_chunks}")

# cooccurrence_matrix now holds the contribution of every chunk.

print(f'finished processing matrix size: {cooccurrence_matrix.shape}')

if save:
    formatted_time = datetime.now().strftime('%H_%d_%m_%Y')
    # gzip.open does not create missing folders.
    os.makedirs('song_data', exist_ok=True)
    save_path = os.path.join('song_data', f'{formatted_time}_cooccurrence_matrix.gz')
    with gzip.open(save_path, 'wb') as f:
        pickle.dump(cooccurrence_matrix, f, protocol = pickle.HIGHEST_PROTOCOL)
    print(f"'cooccurrence_matrix' saved to {save_path}")

processed chunk 1/225
processed chunk 2/225
processed chunk 3/225
processed chunk 4/225
processed chunk 5/225
processed chunk 6/225
processed chunk 7/225
processed chunk 8/225
processed chunk 9/225
processed chunk 10/225
processed chunk 11/225
processed chunk 12/225
processed chunk 13/225
processed chunk 14/225
processed chunk 15/225
processed chunk 16/225
processed chunk 17/225
processed chunk 18/225
processed chunk 19/225
processed chunk 20/225
processed chunk 21/225
processed chunk 22/225
processed chunk 23/225
processed chunk 24/225
processed chunk 25/225
processed chunk 26/225
processed chunk 27/225
processed chunk 28/225
processed chunk 29/225
processed chunk 30/225
processed chunk 31/225
processed chunk 32/225
processed chunk 33/225
processed chunk 34/225
processed chunk 35/225
processed chunk 36/225
processed chunk 37/225
processed chunk 38/225
processed chunk 39/225
processed chunk 40/225
processed chunk 41/225
processed chunk 42/225
processed chunk 43/225
processed chunk 44/2

In [None]:
def sparsity(csr_matrix):
    """Return the fraction of cells of a 2-D sparse matrix that have no stored entry.

    Computed as (rows * cols - nnz) / (rows * cols) from the matrix's ``shape``
    and ``nnz`` attributes. A matrix with zero cells raises ZeroDivisionError.
    """
    n_rows, n_cols = csr_matrix.shape[0], csr_matrix.shape[1]
    cell_count = n_rows * n_cols
    unstored = cell_count - csr_matrix.nnz
    return unstored / cell_count

In [4]:
# to load cooccurrence_matrix from storage
# NOTE: unpickling can execute arbitrary code; only open archives you created.
# (The file name is a plain literal, so no f-string prefix is needed.)
save_path = os.path.join('song_data', '02_15_08_2023_cooccurrence_matrix.gz')

with gzip.open(save_path, 'rb') as f:
    cooccurrence_matrix = pickle.load(f)

print(cooccurrence_matrix.shape)

(2244869, 2244869)


In [15]:
# Exponent used by the PMI scoring function defined after this setup.
k = 1.1
# Sum of every stored count in the co-occurrence matrix.
total_occurrences = cooccurrence_matrix.sum()
# Per-song share of the total: row sum / grand total, flattened to a 1-D array.
p_i = np.asarray(cooccurrence_matrix.sum(axis=1) / total_occurrences).flatten()
# Per-pair share of the total (same sparsity pattern as the counts).
p_ij = cooccurrence_matrix / total_occurrences

def compute_pmi_for_chunk(start_row, end_row):
    """Score the stored entries of rows ``start_row`` .. ``end_row - 1`` of ``p_ij``.

    Reads the module-level ``p_ij`` (CSR matrix), ``p_i`` (1-D array) and ``k``.
    For each stored entry with a positive value the score is

        log2(p_ij / (p_i[i] * p_i[j])) + (k - 1) * log2(p_ij)

    and only scores below zero are kept.

    Returns:
        (scores, rows, cols): three parallel lists. ``rows`` is relative to
        ``start_row``; ``cols`` is the absolute column number.
    """
    # NOTE(review): only negative scores are retained. Confirm this is intended,
    # since the recommender later picks the LARGEST scores and absent entries
    # count as 0 there.
    scores, rows, cols = [], [], []
    indptr, indices, values = p_ij.indptr, p_ij.indices, p_ij.data

    for i in range(start_row, end_row):
        # indptr[i]:indptr[i + 1] is the span of row i inside indices/values.
        for pos in range(indptr[i], indptr[i + 1]):
            joint = values[pos]
            if not joint > 0:  # log2 is undefined for zero
                continue

            j = indices[pos]
            original_pmi = np.log2(joint / (p_i[i] * p_i[j]))
            pmi_score = original_pmi - (-(k - 1) * np.log2(joint))
            if pmi_score < 0:
                scores.append(pmi_score)
                rows.append(i - start_row)  # row number relative to this chunk
                cols.append(j)

    return scores, rows, cols

# Compute the PMI matrix in blocks of chunk_size rows and stack the blocks.
# One loop serves both modes: with save enabled each block is additionally
# pickled to disk. The blocks are stacked from memory, so they are not read
# back from the files that were just written.
chunk_size = 5000
chunks = []
saved_chunk_files = []

if save:
    formatted_time = datetime.now().strftime('%H_%d_%m_%Y')
    pmi_dir = 'song_data/pmi/12_16_08_2023_1.1'
    # gzip.open does not create missing folders.
    os.makedirs(pmi_dir, exist_ok=True)

for start in range(0, cooccurrence_matrix.shape[0], chunk_size):
    end = min(start + chunk_size, cooccurrence_matrix.shape[0])
    print(f"Processing rows {start} to {end}")

    # Row numbers returned here are relative to `start`, matching the block's shape.
    data, row, col = compute_pmi_for_chunk(start, end)
    chunk_matrix = csr_matrix((data, (row, col)), shape=(end-start, cooccurrence_matrix.shape[1]), dtype=np.float64)
    chunks.append(chunk_matrix)

    if save:
        save_path = os.path.join(pmi_dir, f'{formatted_time}_{start}_{end}_{k}_pmi_matrix.gz')
        with gzip.open(save_path, 'wb') as f:
            pickle.dump(chunk_matrix, f, protocol = pickle.HIGHEST_PROTOCOL)

        saved_chunk_files.append(save_path)

pmi_matrix = vstack(chunks, format='csr')
print(f'Finished processing matrix size: {pmi_matrix.shape}')

Processing rows 0 to 5000
Processing rows 5000 to 10000
Processing rows 10000 to 15000
Processing rows 15000 to 20000
Processing rows 20000 to 25000
Processing rows 25000 to 30000
Processing rows 30000 to 35000
Processing rows 35000 to 40000
Processing rows 40000 to 45000
Processing rows 45000 to 50000
Processing rows 50000 to 55000
Processing rows 55000 to 60000
Processing rows 60000 to 65000
Processing rows 65000 to 70000
Processing rows 70000 to 75000
Processing rows 75000 to 80000
Processing rows 80000 to 85000
Processing rows 85000 to 90000
Processing rows 90000 to 95000
Processing rows 95000 to 100000
Processing rows 100000 to 105000
Processing rows 105000 to 110000
Processing rows 110000 to 115000
Processing rows 115000 to 120000
Processing rows 120000 to 125000
Processing rows 125000 to 130000
Processing rows 130000 to 135000
Processing rows 135000 to 140000
Processing rows 140000 to 145000
Processing rows 145000 to 150000
Processing rows 150000 to 155000
Processing rows 155000

In [14]:
# to load pmi_matrix from storage
# NOTE: unpickling can execute arbitrary code; only open archives you created.

folder_path = 'song_data/pmi'  # Replace with the path to your folder
saved_chunk_files = [folder_path + '/' + f for f in os.listdir(folder_path) if "15_08_2023" in f]
if not saved_chunk_files:
    raise FileNotFoundError(f"no PMI chunk files containing '15_08_2023' found in {folder_path}")
# Chunk files are named '<HH>_<dd>_<mm>_<YYYY>_<start>_<end>_<k>_pmi_matrix.gz'.
# Order them by <start>, read from the file name only so that underscores in
# the folder path cannot shift the field position.
saved_chunk_files = sorted(saved_chunk_files, key = lambda x: int(os.path.basename(x).split('_')[4]))
chunks = []
for chunk_file in saved_chunk_files:
    with gzip.open(chunk_file, 'rb') as f:
        chunks.append(pickle.load(f))

pmi_matrix = vstack(chunks, format='csr')
print(f'Finished processing matrix size: {pmi_matrix.shape}')

Finished processing matrix size: (2244869, 2244869)


In [None]:
def user_playlist_vector(playlist_songs, song_indices, num_songs):
    """Encode a playlist as a 1 x ``num_songs`` sparse row vector.

    Args:
        playlist_songs: song ids in the playlist.
        song_indices: mapping from song id to column number.
        num_songs: width of the returned vector.

    Returns:
        A CSR matrix with a 1.0 stored at the column of every playlist song.
        For an empty playlist, every column is 1.0.

    Raises:
        KeyError: if a song id is missing from ``song_indices``.
    """
    if playlist_songs:
        columns = []
        for song in playlist_songs:
            columns.append(song_indices[song])
        stored = len(columns)
        # Single-row CSR layout: the row's entries span positions 0..stored.
        row_bounds = np.array([0, stored])
        return csr_matrix((np.ones(stored), columns, row_bounds), shape=(1, num_songs))

    # Empty playlist: fall back to a vector of all ones.
    return csr_matrix(np.ones((1, num_songs)))

def compute_scores(user_vector, pmi_matrix):
    """Multiply the playlist vector by the PMI matrix and return the scores.

    Prints the seconds spent in the multiplication, then returns the first
    (only) row of the product as a dense 1-D array.
    """
    started = time.time()
    product = user_vector @ pmi_matrix
    finished = time.time()
    print(finished - started)
    dense = product.toarray()
    return dense[0]

# def get_top_recommendations(scores, song_data_map, song_indices, n=10):
#     s = time.time()
#     top_indices = np.argsort(scores)[-n:][::-1]
#     top_songs = [song_data_map[list(song_indices)[i]] for i in top_indices]
#     e = time.time()
#     print(e - s)
#     return top_songs

def get_top_recommendations(scores, song_data_map, song_indices, n=10):
    """Return the metadata of the ``n`` highest-scoring songs, best first.

    Args:
        scores: 1-D numpy array; position i holds the score of the song whose
            index in ``song_indices`` is i.
        song_data_map: mapping from song id to its metadata record.
        song_indices: mapping from song id to its position in ``scores``.
        n: number of songs wanted. Values larger than ``len(scores)`` are
            clamped; zero or negative values give an empty list.

    Returns:
        A list of ``song_data_map`` records ordered by descending score.

    Prints the elapsed seconds before returning.
    """
    s = time.time()

    # Clamp n: argpartition rejects n > len(scores), and a slice of [-0:]
    # would select the whole array instead of nothing.
    n = min(int(n), len(scores))
    if n <= 0:
        e = time.time()
        print(e - s)
        return []

    # Get the top n indices without sorting the entire array
    top_indices = np.argpartition(scores, -n)[-n:]
    # Now, sort only the top n indices
    top_indices_sorted = top_indices[np.argsort(scores[top_indices])][::-1]
    ranked = top_indices_sorted.tolist()

    # Invert song_indices for the winners only, using the stored index values
    # rather than assuming the dict's iteration order matches them.
    wanted = set(ranked)
    uri_by_index = {idx: uri for uri, idx in song_indices.items() if idx in wanted}
    top_songs = [song_data_map[uri_by_index[i]] for i in ranked]

    e = time.time()
    print(e - s)
    return top_songs

def recommend_songs_pmi(user_playlist, song_indices, pmi_matrix, song_data_map, n=10):
    """Recommend ``n`` songs for ``user_playlist`` from the PMI matrix.

    Args:
        user_playlist: song ids already in the playlist.
        song_indices: mapping from song id to its row/column number.
        pmi_matrix: square score matrix indexed by those numbers.
        song_data_map: mapping from song id to its metadata record.
        n: number of recommendations wanted.

    Returns:
        Whatever ``get_top_recommendations`` returns for the computed scores.
    """
    # Take the vector width from the index that was passed in instead of a
    # module-level variable, so the function only depends on its arguments.
    vector_width = len(song_indices)
    user_vector = user_playlist_vector(user_playlist, song_indices, vector_width)
    scores = compute_scores(user_vector, pmi_matrix)
    return get_top_recommendations(scores, song_data_map, song_indices, n)

In [None]:
class SongRecommendationApp(tk.Tk):
    """Tk window for searching songs, building a playlist and listing recommendations.

    Layout, top to bottom: three filter boxes with a Search button, the search
    results list, the playlist list, a box for the number of recommendations,
    a button that generates them, the recommendations list and a Refresh
    button. Recommendations are produced by the module-level
    ``recommend_songs_pmi`` using the module-level ``song_indices`` and
    ``pmi_matrix``.
    """

    # Maximum number of rows shown in the search-results list.
    MAX_SEARCH_RESULTS = 300
    # Used when the recommendations box is blank or not a positive integer.
    DEFAULT_RECOMMENDATION_COUNT = 10

    def __init__(self, song_data_map):
        """Create the window and all of its widgets.

        Args:
            song_data_map: mapping from song id to a record with at least the
                keys 'song_name', 'artist_name' and 'album_name'.
        """
        super().__init__()

        # Switch Tk's default font to Courier (presumably so the padded
        # columns produced by format_song_display line up).
        default_font = font.nametofont("TkDefaultFont")
        default_font.configure(family="Courier")

        self.song_data_map = song_data_map
        # Reverse lookup from a displayed row back to its song id. Songs that
        # render to the same row text share one entry (the last one wins).
        self.uri_map = {self.format_song_display(song_info): uri for uri, song_info in song_data_map.items()}
        self.playlist_data = []  # Store song data for sorting

        # Filter Frame
        self.filter_frame = ttk.Frame(self)
        self.filter_frame.pack(pady=10)

        # Label and Entry for Song
        self.song_label = ttk.Label(self.filter_frame, text="Song")
        self.song_label.grid(row=0, column=0, padx=5)
        self.song_entry = ttk.Entry(self.filter_frame)
        self.song_entry.grid(row=1, column=0, padx=5)

        # Label and Entry for Artist
        self.artist_label = ttk.Label(self.filter_frame, text="Artist")
        self.artist_label.grid(row=0, column=1, padx=5)
        self.artist_entry = ttk.Entry(self.filter_frame)
        self.artist_entry.grid(row=1, column=1, padx=5)

        # Label and Entry for Album
        self.album_label = ttk.Label(self.filter_frame, text="Album")
        self.album_label.grid(row=0, column=2, padx=5)
        self.album_entry = ttk.Entry(self.filter_frame)
        self.album_entry.grid(row=1, column=2, padx=5)

        # Creation timestamp; no method of this class reads it (no debouncing
        # is implemented here).
        self.last_time = time.time()

        self.search_button = ttk.Button(self.filter_frame, text="Search", command=self.display_search_results)
        self.search_button.grid(row=2, columnspan=3, pady=10)

        width = 200
        # Songs Listbox
        self.songs_listbox = tk.Listbox(self, selectmode=tk.SINGLE, width=width, font=("Courier", 10))
        self.songs_listbox.pack(pady=10)

        # Selecting a search result adds it to the playlist.
        self.songs_listbox.bind('<<ListboxSelect>>', self.add_to_playlist)

        # Playlist Listbox
        self.playlist_listbox = tk.Listbox(self, bg="lightblue", selectmode=tk.SINGLE, width=width, font=("Courier", 10))
        self.playlist_listbox.pack(pady=10)

        # Number of recommendations
        self.n_label = ttk.Label(self, text="Number of Recommendations:")
        self.n_label.pack(pady=5)
        self.n_entry = ttk.Entry(self)
        self.n_entry.pack(pady=5)

        # Button to generate recommendations
        self.btn_recommend = ttk.Button(self, text="Generate Recommendations", command=self.generate_recommendations)
        self.btn_recommend.pack(pady=10)

        # Recommendations Listbox
        self.recommendations_listbox = tk.Listbox(self, bg="lightgreen", selectmode=tk.SINGLE, width=width, font=("Courier", 10))
        self.recommendations_listbox.pack(pady=10)

        # Double-clicking a recommendation opens it in the browser.
        self.recommendations_listbox.bind('<Double-Button-1>', self.open_in_spotify)

        self.btn_refresh = ttk.Button(self, text="Refresh", command=self.refresh)
        self.btn_refresh.pack(pady=10)

    def refresh(self):
        """Empty every entry box and every listbox."""
        # Clear all fields
        self.song_entry.delete(0, tk.END)
        self.artist_entry.delete(0, tk.END)
        self.album_entry.delete(0, tk.END)
        self.n_entry.delete(0, tk.END)

        # Clear listboxes
        self.songs_listbox.delete(0, tk.END)
        self.playlist_listbox.delete(0, tk.END)
        self.recommendations_listbox.delete(0, tk.END)

    def open_in_spotify(self, event):
        """Open the selected recommendation's track page in the web browser."""
        selected_index = self.recommendations_listbox.curselection()
        if selected_index:
            selected_song = self.recommendations_listbox.get(selected_index)
            song_uri = self.uri_map[selected_song]
            webbrowser.open(f"https://open.spotify.com/track/{song_uri}")

    def display_search_results(self):
        """Fill the results list with songs matching all three filter boxes.

        Matching is a case-insensitive substring test on song, artist and album
        name (an empty box matches everything). Results are ordered by album
        name and at most MAX_SEARCH_RESULTS rows are shown.
        """
        song_query = self.song_entry.get().lower()
        artist_query = self.artist_entry.get().lower()
        album_query = self.album_entry.get().lower()

        self.songs_listbox.delete(0, tk.END)

        # (album name, display row) pairs, so sorting needs no reverse lookup.
        matches = []
        for song_info in self.song_data_map.values():
            if (song_query in song_info['song_name'].lower()
                    and artist_query in song_info['artist_name'].lower()
                    and album_query in song_info['album_name'].lower()):
                matches.append((song_info['album_name'], self.format_song_display(song_info)))

        # Sort by album name
        matches.sort(key=lambda pair: pair[0])

        # Trim BEFORE inserting so a broad query never pushes every match into
        # the widget, and so exactly MAX_SEARCH_RESULTS rows remain at most.
        for _, display_name in matches[:self.MAX_SEARCH_RESULTS]:
            self.songs_listbox.insert(tk.END, display_name)

    def format_song_display(self, song_info):
        """Return one row of text: song, artist and album name in left-aligned
        columns padded to 65, 35 and 35 characters (longer values are not cut)."""
        formatted_str = "{:<65}{:<35}{:<35}"
        f_string = formatted_str.format(song_info['song_name'], song_info['artist_name'], song_info['album_name'])
        return f_string

    def add_to_playlist(self, event):
        """Append the selected search result to the playlist unless already there."""
        selected_index = self.songs_listbox.curselection()
        if selected_index:  # This checks if there's any selection at all
            selected_song = self.songs_listbox.get(selected_index)
            if selected_song not in self.playlist_listbox.get(0, tk.END):  # Prevent duplicates
                self.playlist_listbox.insert(tk.END, selected_song)

    def _requested_count(self):
        """Read the recommendations box; fall back to the default when it is
        blank, not an integer, or not positive."""
        try:
            requested = int(self.n_entry.get())
        except ValueError:
            return self.DEFAULT_RECOMMENDATION_COUNT
        if requested <= 0:
            return self.DEFAULT_RECOMMENDATION_COUNT
        return requested

    def generate_recommendations(self):
        """Recommend songs for the current playlist and show them.

        Prints the elapsed seconds when done.
        """
        s = time.time()
        playlist_display_names = list(self.playlist_listbox.get(0, tk.END))
        playlist_uris = [self.uri_map[display_name] for display_name in playlist_display_names]  # Extract URIs

        n = self._requested_count()
        # Use the map this window was built with rather than a module global.
        recommended_songs = recommend_songs_pmi(playlist_uris, song_indices, pmi_matrix, self.song_data_map, n)

        self.recommendations_listbox.delete(0, tk.END)
        for song in recommended_songs:
            formatted_song = self.format_song_display(song)
            self.recommendations_listbox.insert(tk.END, formatted_song)
        e = time.time()
        print(e - s)

# Launch the GUI when this code runs as the main module. mainloop() blocks
# until the window is closed. song_data_map must already be defined, and
# generating recommendations additionally needs the module-level
# song_indices, pmi_matrix and recommend_songs_pmi.
if __name__ == "__main__":
    app = SongRecommendationApp(song_data_map)
    app.mainloop()
