In [1]:
import json
import pandas as pd
import numpy as np
import os
import seaborn as sns
import re
from collections import *
from tqdm import tqdm
import matplotlib.pyplot as plt
import plotly.express as px
from scipy.sparse import lil_matrix




In [2]:
directory_path = 'data/raw'
filenames = sorted(os.listdir(directory_path))

In [3]:
# looking at only the first 10,000 playlists
fullpaths = [directory_path + '/' + f for f in filenames][0:20]

In [4]:
fullpaths

['data/raw/mpd.slice.0-999.json',
 'data/raw/mpd.slice.1000-1999.json',
 'data/raw/mpd.slice.10000-10999.json',
 'data/raw/mpd.slice.100000-100999.json',
 'data/raw/mpd.slice.101000-101999.json',
 'data/raw/mpd.slice.102000-102999.json',
 'data/raw/mpd.slice.103000-103999.json',
 'data/raw/mpd.slice.104000-104999.json',
 'data/raw/mpd.slice.105000-105999.json',
 'data/raw/mpd.slice.106000-106999.json',
 'data/raw/mpd.slice.107000-107999.json',
 'data/raw/mpd.slice.108000-108999.json',
 'data/raw/mpd.slice.109000-109999.json',
 'data/raw/mpd.slice.11000-11999.json',
 'data/raw/mpd.slice.110000-110999.json',
 'data/raw/mpd.slice.111000-111999.json',
 'data/raw/mpd.slice.112000-112999.json',
 'data/raw/mpd.slice.113000-113999.json',
 'data/raw/mpd.slice.114000-114999.json',
 'data/raw/mpd.slice.115000-115999.json']

In [6]:
# ratios = []
# for playlist in playlists_data:
#     r = playlist['num_albums']/playlist['num_tracks']
#     ratios.append(r)
# sns.histplot(ratios)

In [7]:
def normalize_name(name):
    name = name.lower()
    name = re.sub(r"[.,\/#!$%\^\*;:{}=\_`~()@]", " ", name)
    name = re.sub(r"\s+", " ", name).strip()
    return name

In [8]:
min_tracks_per_playlist = 5
max_tracks_per_playlist = 150
min_albums_per_playlist = 5
min_artists_per_playlist = 5

In [9]:
song_relationships = {}
uri_map = {}

for idx, path in enumerate(fullpaths):
    if idx % 5 == 0 and idx > 0:
        print(f"Processed {idx-5}-{idx}")  # Print the range every 5 chunks
    f = open(path)
    js = f.read()
    f.close()
    mpd_slice = json.loads(js)
    playlists_data = mpd_slice['playlists']

    for idx, playlist in enumerate(playlists_data):
        if idx % 1000 == 0 and idx > 0:
            print(f"Processed {idx - 1000}-{idx}")
        r = playlist['num_albums'] / playlist['num_tracks']
        if r < 0.40:
            continue
        songs = set(track['track_uri'] for track in playlist['tracks'])
        t_per_p = len(songs)
        albums = set(track['album_uri'] for track in playlist['tracks'])
        alb_per_p = len(albums)
        artists = set(track['artist_uri'] for track in playlist['tracks'])
        art_per_p = len(artists)

        if (min_tracks_per_playlist >= t_per_p) or \
             (t_per_p >= max_tracks_per_playlist) or \
                (min_albums_per_playlist >= alb_per_p) or \
                    (min_artists_per_playlist >= art_per_p):
            continue

        for song in songs:
            if song not in song_relationships:
                song_relationships[song] = {}
                
            for s in songs:
                if s != song:  # avoid self-relationship
                    if s in song_relationships[song]:
                        song_relationships[song][s] += 1
                    else:
                        song_relationships[song][s] = 1



Processed 0-5
Processed 5-10
Processed 10-15


In [165]:
song_data_map = {}
for idx, path in enumerate(fullpaths):
    if idx % 5 == 0 and idx > 0:
        print(f"Processed {idx-5}-{idx}")  # Print the range every 5 chunks
    f = open(path)
    js = f.read()
    f.close()
    mpd_slice = json.loads(js)
    playlists_data = mpd_slice['playlists']
    for playlist in playlists_data:
        for track in playlist['tracks']:
            song_uri = track['track_uri']
            song_name = track['track_name']
            album_name = track['album_name']
            song_data_map[song_uri] = {'song_name': song_name, 'album_name': album_name}


Processed 0-5
Processed 5-10
Processed 10-15


In [167]:
print(len(song_relationships))

186631


In [168]:
song_relationships_sorted = {song: sorted(relations.items(), key=lambda x: x[1], reverse=True)
                             for song, relations in tqdm(song_relationships.items())}


100%|██████████| 186631/186631 [00:44<00:00, 4150.12it/s] 


In [169]:
len(song_relationships_sorted)

186631

In [170]:
track_uri = list(song_relationships_sorted.keys())[5000]
print(song_data_map[track_uri], track_uri)
song_relationships_sorted[track_uri][0:5]

{'song_name': "How'm I Gonna Get Back Home", 'album_name': "He's My Brother She's My Sister EP"} spotify:track:0EsNP48ZxnuQx303EcckwI


[('spotify:track:4CL3xg1ahV85g8hsZqZyga', 1),
 ('spotify:track:62A1DpyagGOZ56048rg2aV', 1),
 ('spotify:track:1bDA2b2eFqTKcPAivi3vxT', 1),
 ('spotify:track:7aM71oOz09ccJGzsKeIfpx', 1),
 ('spotify:track:79Iq6kn5nniUKseQgc29pI', 1)]

In [172]:
max_connections = 0
song_with_most_connections = None

for song, connections in song_relationships.items():
    num_connections = len(connections)
    if num_connections > max_connections:
        max_connections = num_connections
        song_with_most_connections = song

print("Song with the most connections:", song_data_map[song_with_most_connections])
print("Number of connections:", max_connections)


Song with the most connections: {'song_name': 'Closer', 'album_name': 'Closer'}
Number of connections: 14686


In [173]:
song_indices = {song_uri: idx for idx, song_uri in enumerate(song_relationships_sorted.keys())}


In [177]:
num_songs = len(song_relationships_sorted)
transition_matrix = np.zeros((num_songs, num_songs))

for song_uri, relationships in tqdm(song_relationships_sorted.items()):
    row_idx = song_indices[song_uri]
    sum_connections = np.sum([x[1] for x in relationships])
    for related_song_uri, count in relationships:
        col_idx = song_indices[related_song_uri]
        transition_matrix[row_idx, col_idx] = count/sum_connections

 73%|███████▎  | 135796/186631 [01:02<00:09, 5123.49it/s]

: 

: 

In [138]:
popularity = np.sum(transition_matrix, axis=0)
top_n_indices = np.argsort(popularity)[-n:]


In [151]:
list(song_relationships)[1643]

'spotify:track:7BKLCZ1jbUBVqRi2FVlTVw'

In [152]:
song_data_map[list(song_relationships)[1643]]["song_name"]

'Closer'

In [154]:
import numpy as np
import plotly.express as px
import pandas as pd

# number of most popular songs to plot
n = 30

popularity = np.sum(transition_matrix, axis=0)
top_n_indices = np.argsort(popularity)[-n:]

init = np.random.rand(1, num_songs)
init = init / np.sum(init)
probs = [init]
p = init
for i in range(20):
    p = np.dot(p, transition_matrix)
    probs.append(p)

plot_data = []
for i in top_n_indices:
    song_name = song_data_map[list(song_relationships)[i]]["song_name"]
    for step_num, step in enumerate(probs):
        plot_data.append({'Iteration': step_num, 'Probability': step[0, i], 'Song': song_name})

df = pd.DataFrame(plot_data)
fig = px.line(df, x='Iteration', y='Probability', color='Song', title='Convergence of Most Popular Songs')
fig.show()
