In [7]:
import json
import pandas as pd
import numpy as np
import os
import seaborn as sns
import re
from collections import *
from tqdm import tqdm
import matplotlib.pyplot as plt
import plotly.express as px
from scipy.sparse import lil_matrix, csr_matrix, eye
import time
import random
from numpy.linalg import matrix_power

In [8]:
# Directory that holds the raw playlist slice files.
directory_path = 'data/raw'

# List the slice files in lexicographic order so later cells see a stable ordering.
filenames = os.listdir(directory_path)
filenames.sort()

# The total is an estimate: it assumes every file contributes exactly 1000 playlists.
print(f"{len(filenames) * 1000} playlists")

1000000 playlists


In [9]:
# Restrict the run to the first 30 slice files (in sorted-filename order),
# i.e. roughly 30,000 playlists if each file holds 1000 of them.
fullpaths = [f"{directory_path}/{name}" for name in filenames[:30]]

In [4]:
# ratios = []
# for playlist in playlists_data:
#     r = playlist['num_albums']/playlist['num_tracks']
#     ratios.append(r)
# sns.histplot(ratios)

def normalize_name(name):
    """Return a canonical form of ``name`` for loose text matching.

    The string is lower-cased, each character from a fixed punctuation set is
    replaced by a space, runs of whitespace are collapsed to a single space,
    and leading/trailing whitespace is removed.
    """
    lowered = name.lower()
    # Punctuation becomes a space (not removed) so adjacent words stay separated.
    without_punctuation = re.sub(r"[.,\/#!$%\^\*;:{}=\_`~()@]", " ", lowered)
    return re.sub(r"\s+", " ", without_punctuation).strip()

In [11]:
# Playlist filter thresholds. The filtering cell compares with >= / <=, so all
# bounds are exclusive: a playlist is kept only if it has strictly more than
# the minimum and strictly fewer than the maximum.
min_tracks_per_playlist, max_tracks_per_playlist = 5, 150
min_albums_per_playlist = min_artists_per_playlist = 2

In [12]:
# Build the song co-occurrence graph:
#   song_relationships[song_uri][other_uri] = number of kept playlists that
#   contain both songs (counted over unique tracks per playlist).
song_relationships = {}

# Progress/timing is reported once per `batch_size` files.
batch_size = 5
t = 0                 # seconds accumulated for the batch currently in progress
iteration_times = []  # one entry per reported batch, including the last one

for file_idx, path in enumerate(fullpaths):
    start_time = time.time()
    with open(path) as f:
        mpd_slice = json.load(f)
    playlists_data = mpd_slice['playlists']

    for playlist in playlists_data:
        tracks = playlist['tracks']
        # Use sets so a track repeated inside one playlist is counted once.
        songs = set(track['track_uri'] for track in tracks)
        t_per_p = len(songs)
        albums = set(track['album_uri'] for track in tracks)
        alb_per_p = len(albums)
        artists = set(track['artist_uri'] for track in tracks)
        art_per_p = len(artists)

        # Exclusive bounds: skip playlists that are too small, too large, or
        # not diverse enough in albums/artists.
        if not (min_tracks_per_playlist < t_per_p < max_tracks_per_playlist):
            continue
        if alb_per_p <= min_albums_per_playlist or art_per_p <= min_artists_per_playlist:
            continue

        for song in songs:
            related_counts = song_relationships.setdefault(song, {})
            for related_song in songs:
                if related_song != song:  # avoid self-relationship
                    related_counts[related_song] = related_counts.get(related_song, 0) + 1

    t += time.time() - start_time

    # Report after the batch's last file (or the final file overall) so the
    # time shown belongs to the batch named in the message and the last batch
    # is not silently dropped.
    files_done = file_idx + 1
    if files_done % batch_size == 0 or files_done == len(fullpaths):
        batch_start = (file_idx // batch_size) * batch_size
        print(f"processed {batch_start}-{files_done} - time taken {t:.2f}")
        iteration_times.append(t)
        t = 0

print(f'Number of songs processed: {len(song_relationships)}')


processed 0-5 - time taken 6.13
processed 5-10 - time taken 6.36
processed 10-15 - time taken 6.98
processed 15-20 - time taken 6.95
processed 20-25 - time taken 7.43
Number of songs processed: 273051


In [13]:
# Second pass over the same slice files: collect display metadata
# (song/album/artist names) for every track URI that made it into
# song_relationships. Later occurrences of a URI overwrite earlier ones.
song_data_map = {}
for idx, path in enumerate(fullpaths):
    if idx > 0 and idx % 5 == 0:
        print(f"Processed {idx-5}-{idx}")
    with open(path) as f:
        mpd_slice = json.load(f)
    playlists_data = mpd_slice['playlists']
    for playlist in playlists_data:
        for track in playlist['tracks']:
            song_uri = track['track_uri']
            metadata = {
                'song_name': track['track_name'],
                'album_name': track['album_name'],
                'artist_name': track['artist_name'],
            }
            # Only keep songs that survived the playlist filter.
            if song_uri in song_relationships:
                song_data_map[song_uri] = metadata


Processed 0-5
Processed 5-10
Processed 10-15
Processed 15-20
Processed 20-25


In [None]:
# song_relationships_sorted = {song: sorted(relations.items(), key=lambda x: x[1], reverse=True)
#                              for song, relations in tqdm(song_relationships.items())}


In [14]:
# Convert each neighbour dict into a list of (related_uri, count) pairs.
# The isinstance guard makes the cell safe to re-run: values that are already
# lists are kept as they are instead of failing on the missing `.items()`.
song_relationships = {
    key: list(value.items()) if isinstance(value, dict) else value
    for key, value in tqdm(song_relationships.items())
}

100%|██████████| 273051/273051 [00:33<00:00, 8224.51it/s] 


In [15]:
# Find the song with the largest number of distinct related songs.
# max() returns the first maximal key in iteration order, and `default`
# covers an empty song_relationships.
song_with_most_connections = max(
    song_relationships,
    key=lambda uri: len(song_relationships[uri]),
    default=None,
)
max_connections = (
    len(song_relationships[song_with_most_connections])
    if song_with_most_connections is not None
    else 0
)
if max_connections == 0:
    # A song only qualifies with at least one connection; otherwise there is no winner.
    song_with_most_connections = None

print("Song with the most connections:", song_data_map[song_with_most_connections])
print("Number of connections:", max_connections)


Song with the most connections: {'song_name': 'Closer', 'album_name': 'Closer', 'artist_name': 'The Chainsmokers'}
Number of connections: 18765


In [16]:
# Row/column index of every song: position in song_relationships' iteration order.
song_indices = {uri: position for position, uri in enumerate(song_relationships)}
num_songs = len(song_relationships)

# COO-style triplets shared by both matrices.
row_indices, col_indices = [], []
data_sum, data = [], []  # row-normalised weights / raw co-occurrence counts

for idx, (song_uri, relationships) in enumerate(song_relationships.items()):
    row_idx = song_indices[song_uri]
    counts = [count for _, count in relationships]
    sum_connections = np.sum(counts)

    row_indices.extend([row_idx] * len(relationships))
    col_indices.extend(song_indices[related_uri] for related_uri, _ in relationships)
    data.extend(counts)
    # Each count divided by the row total, so a non-empty row sums to 1.
    data_sum.extend(count / sum_connections for count in counts)

    if idx > 0 and idx % 30000 == 0:
        print(f"Processed {idx} rows.....")

coordinates = (row_indices, col_indices)
matrix_shape = (num_songs, num_songs)
# transition_matrix_sum: row-normalised weights; transition_matrix: raw counts.
transition_matrix_sum = csr_matrix((data_sum, coordinates), shape=matrix_shape, dtype=np.float64)
transition_matrix = csr_matrix((data, coordinates), shape=matrix_shape, dtype=np.float64)


Processed 30000 rows.....
Processed 60000 rows.....
Processed 90000 rows.....
Processed 120000 rows.....
Processed 150000 rows.....
Processed 180000 rows.....
Processed 210000 rows.....
Processed 240000 rows.....
Processed 270000 rows.....


In [40]:
n = 10

# Column sums of the row-normalised matrix = total incoming weight per song;
# the n largest are treated as the most popular songs.
popularity = np.sum(transition_matrix_sum, axis=0)
top_n_indices = np.argsort(popularity.A1)[-n:]

# Random start distribution over all songs. A locally seeded generator keeps
# the plot reproducible across runs without touching NumPy's global state.
rng = np.random.default_rng(0)
init = rng.random((1, num_songs))
init = init / np.sum(init)
probs = [init]
p = csr_matrix(init)

# Power iteration. `@` is used instead of np.dot because np.dot is not
# sparse-aware; `@` is the supported matrix product for scipy.sparse operands.
damping = True
if damping:
    damping_factor = 0.85
    # Uniform distribution mixed in at every step with weight (1 - damping_factor).
    random_jump_vector = csr_matrix(np.ones(num_songs)/num_songs)
    for i in tqdm(range(30)):
        p = damping_factor * (p @ transition_matrix_sum) + (1 - damping_factor) * random_jump_vector
        probs.append(p)
else:
    for i in tqdm(range(30)):
        p = p @ transition_matrix_sum
        probs.append(p)


# Materialise the URI list once; its order matches the matrix row/column order.
song_uris = list(song_relationships)

# Long-format records: one row per (song, iteration) for the line plot.
plot_data = []
for i in top_n_indices:
    song_name = song_data_map[song_uris[i]]["song_name"]
    for step_num, step in enumerate(probs):
        plot_data.append({'Iteration': step_num, 'Probability': step[0, i], 'Song': song_name})

df = pd.DataFrame(plot_data)
fig = px.line(df, x='Iteration', y='Probability', color='Song', title='Convergence of Most Popular Songs')
fig.show()

100%|██████████| 30/30 [00:07<00:00,  3.96it/s]


In [22]:
# Independent working copy of the co-occurrence matrix used by the recommender.
# NOTE(review): this copies `transition_matrix` (raw counts), not the
# row-normalised `transition_matrix_sum` — confirm that is intended for the
# random-walk recommendation.
A = csr_matrix(transition_matrix, copy=True)

In [20]:
def user_playlist_vector(playlist_songs, song_indices, num_songs):
    """Build the start distribution for a user's playlist.

    Args:
        playlist_songs: sized collection of song keys present in ``song_indices``.
        song_indices: mapping from song key to its position in the vector.
        num_songs: length of the returned vector.

    Returns:
        A 1-D float array of length ``num_songs`` in which every playlist
        entry contributes ``1 / len(playlist_songs)``. Repeated songs
        accumulate their weight, so the vector sums to 1 for any non-empty
        playlist. An empty playlist yields an all-zero vector.

    Raises:
        KeyError: if a song is not a key of ``song_indices``.
    """
    user_vector = np.zeros(num_songs)
    playlist_length = len(playlist_songs)
    if playlist_length == 0:
        return user_vector

    weight = 1 / playlist_length
    for song in playlist_songs:
        # += (not =) so a song listed twice keeps both shares of the mass.
        user_vector[song_indices[song]] += weight
    return user_vector

def get_recommendation_vector(user_vector, transition_matrix, steps, damping = True, damping_factor = 0.85):
    """Propagate ``user_vector`` through ``transition_matrix`` for ``steps`` steps.

    Args:
        user_vector: 1-D array with one entry per song (the start vector).
        transition_matrix: square scipy sparse matrix; it should be
            row-normalised if the result is meant to stay a probability
            distribution.
        steps: number of propagation steps; 0 returns the start vector.
        damping: if True, mix a uniform vector into every step.
        damping_factor: weight of the propagated vector when ``damping`` is
            True; the uniform vector gets ``1 - damping_factor``.

    Returns:
        1-D dense array with the score of every song after ``steps`` steps.
    """
    p = csr_matrix(user_vector)
    if damping:
        # Size the uniform jump from the matrix itself instead of relying on
        # a notebook-level `num_songs` global.
        size = transition_matrix.shape[0]
        jump_term = (1 - damping_factor) * csr_matrix(np.ones(size) / size)
        for _ in range(steps):
            # `@` is the sparse-aware matrix product; np.dot is not.
            p = damping_factor * (p @ transition_matrix) + jump_term
    else:
        for _ in range(steps):
            p = p @ transition_matrix
    return p.toarray()[0]

def top_n_recommendations(recommendation_vector, song_data_map, n, song_uris=None):
    """Return metadata for the ``n`` highest-scoring songs.

    Args:
        recommendation_vector: 1-D array of scores, one per song.
        song_data_map: mapping from song URI to its metadata.
        n: number of songs to return; values <= 0 give an empty list.
        song_uris: optional sequence mapping vector position to song URI.
            Defaults to the iteration order of the notebook-level
            ``song_relationships``.

    Returns:
        List of ``song_data_map`` entries in ascending score order, so the
        highest-scoring song is last.
    """
    if n <= 0:
        # Without this guard, [-0:] would select the whole array.
        return []
    if song_uris is None:
        # Build the position -> URI list once rather than once per index.
        song_uris = list(song_relationships)
    top_indices = np.argsort(recommendation_vector)[-n:]
    return [song_data_map[song_uris[i]] for i in top_indices]

def recommend_songs(user_playlist, song_indices, transition_matrix, song_data_map, steps, n=10):
    """Recommend ``n`` songs for ``user_playlist``.

    Turns the playlist into a start vector, propagates it through
    ``transition_matrix`` for ``steps`` damped steps (damping factor 0.85),
    and returns the metadata of the top-scoring songs.
    """
    start_vector = user_playlist_vector(user_playlist, song_indices, len(song_indices))
    scores = get_recommendation_vector(
        start_vector,
        transition_matrix,
        steps,
        damping=True,
        damping_factor=0.85,
    )
    return top_n_recommendations(scores, song_data_map, n)


In [43]:
# Materialise the URI list once instead of once per index; its order matches
# the matrix row/column order because song_indices enumerates the same dict.
song_uris = list(song_relationships)
top_n_songs = [song_uris[i] for i in top_n_indices]

In [44]:
# Show the track URIs of the top-n songs picked by the popularity ranking.
top_n_songs

['spotify:track:5CtI0qwDJkDQGwXD1H1cLb',
 'spotify:track:3kxfsdsCpFgN412fpnW85Y',
 'spotify:track:152lZdxL1OR0ZMW6KquMif',
 'spotify:track:4WjH9Bzt3kx7z8kl0awxh4',
 'spotify:track:6O6M7pJLABmfBRoGZMu76Y',
 'spotify:track:1Slwb6dOYkBlWal1PGtnNg',
 'spotify:track:1xznGGDReH1oQq0xzbwXa3',
 'spotify:track:7KXjTSCq5nL1LoYtL7XAwS',
 'spotify:track:7BKLCZ1jbUBVqRi2FVlTVw',
 'spotify:track:7yq4Qj7cqayVTp3FF9CWbm']

In [45]:
# Demo playlist: one randomly drawn song followed by the top-n popular songs.
playlist = random.sample(list(song_relationships), 1) + top_n_songs

# Echo every seed song with its metadata.
playlist_metadata = [song_data_map[uri] for uri in playlist]
for uri, metadata in zip(playlist, playlist_metadata):
    print(uri, metadata)


# 10 propagation steps over matrix A, keeping the 3 best-scoring songs.
recommended_songs = recommend_songs(playlist, song_indices, A, song_data_map, 10, n=3)
for song in recommended_songs:
    print(f"Song: {song['song_name']}\nAlbum: {song['album_name']}\nArtist: {song['artist_name']}\n{'-'*40}")

spotify:track:3pEY4jsSSK9XMvMITzqv9X {'song_name': 'Mood', 'album_name': 'Love Songs for the Streets', 'artist_name': 'Lil Durk'}
spotify:track:5CtI0qwDJkDQGwXD1H1cLb {'song_name': 'Despacito - Remix', 'album_name': 'Despacito Feat. Justin Bieber', 'artist_name': 'Luis Fonsi'}
spotify:track:3kxfsdsCpFgN412fpnW85Y {'song_name': 'Redbone', 'album_name': '"Awaken, My Love!"', 'artist_name': 'Childish Gambino'}
spotify:track:152lZdxL1OR0ZMW6KquMif {'song_name': 'Location', 'album_name': 'American Teen', 'artist_name': 'Khalid'}
spotify:track:4WjH9Bzt3kx7z8kl0awxh4 {'song_name': 'Lean On (feat. MØ & DJ Snake)', 'album_name': 'Peace Is The Mission', 'artist_name': 'Major Lazer'}
spotify:track:6O6M7pJLABmfBRoGZMu76Y {'song_name': 'Roses', 'album_name': 'The Chainsmokers- Japan Special Edition', 'artist_name': 'The Chainsmokers'}
spotify:track:1Slwb6dOYkBlWal1PGtnNg {'song_name': 'Thinking Out Loud', 'album_name': 'x', 'artist_name': 'Ed Sheeran'}
spotify:track:1xznGGDReH1oQq0xzbwXa3 {'song_na

In [None]:
import tkinter as tk
from tkinter import ttk

# Number of propagation steps for GUI recommendations (same value as the demo call).
RECOMMENDATION_STEPS = 10
# Maximum number of search hits shown in the listbox.
MAX_SEARCH_RESULTS = 10

# Track URIs backing the listbox rows, in row order. The listbox shows only
# song names, which are not unique and are not keys of song_indices, so the
# URIs are tracked here. Empty while the listbox shows recommendations.
displayed_uris = []

def search_songs(query):
    """Return up to MAX_SEARCH_RESULTS track URIs whose song name contains ``query`` (case-insensitive)."""
    needle = query.lower()
    matches = []
    for uri, metadata in song_data_map.items():
        if needle in metadata['song_name'].lower():
            matches.append(uri)
            # Stop at the limit: this runs on every key release.
            if len(matches) >= MAX_SEARCH_RESULTS:
                break
    return matches

def on_search(event):
    """Key-release handler: refresh the listbox with the current search hits."""
    results = search_songs(search_entry.get())

    # Keep the URI list in step with the rows shown.
    displayed_uris[:] = results
    listbox.delete(0, tk.END)
    for uri in results:
        listbox.insert(tk.END, song_data_map[uri]['song_name'])

def get_recommendations():
    """Button handler: replace the listbox contents with recommendations for the selected rows."""
    # Map selected row positions back to track URIs; rows without a backing
    # URI (e.g. previously shown recommendations) are ignored.
    selected_uris = [
        displayed_uris[idx]
        for idx in listbox.curselection()
        if idx < len(displayed_uris)
    ]
    if not selected_uris:
        return

    recommended_songs = recommend_songs(
        selected_uris, song_indices, transition_matrix, song_data_map,
        RECOMMENDATION_STEPS, n=10,
    )

    # The rows now hold recommendations (metadata only), so no URIs back them.
    displayed_uris.clear()
    listbox.delete(0, tk.END)
    for song in recommended_songs:
        listbox.insert(tk.END, f"{song['song_name']} - {song['artist_name']}")

# Set up the main window
root = tk.Tk()
root.title("Song Recommendation System")

# Search entry
search_entry = ttk.Entry(root)
search_entry.pack(pady=20)
search_entry.bind('<KeyRelease>', on_search)

# Listbox to display songs
listbox = tk.Listbox(root, selectmode=tk.MULTIPLE, width=50, height=20)
listbox.pack(pady=20)

# Recommendation button
recommend_button = ttk.Button(root, text="Get Recommendations", command=get_recommendations)
recommend_button.pack(pady=20)

root.mainloop()
