In [38]:
import json
import pandas as pd
import numpy as np
import os
import seaborn as sns
import re
from collections import *
from tqdm import tqdm
import matplotlib.pyplot as plt
import plotly.express as px
from scipy.sparse import lil_matrix

In [54]:
# Directory holding the raw playlist slice files.
directory_path = 'data/raw'
# Keep only JSON files: os.listdir returns every directory entry, including
# hidden files and sub-directories, which would fail later when they are
# opened and parsed as JSON. Note the sort is lexicographic on the file name,
# not numeric on the slice id range.
filenames = sorted(
    name for name in os.listdir(directory_path) if name.endswith('.json')
)

In [55]:
# Restrict the run to the first `num_slices` slice files (in sorted file-name
# order). Each slice file appears to hold 1,000 playlists, judging by the id
# range in its name (TODO confirm), so num_slices = 1 covers 1,000 playlists;
# raise it to 10 to look at 10,000.
num_slices = 1
fullpaths = [os.path.join(directory_path, f) for f in filenames[:num_slices]]

In [56]:
# Display the selected slice path(s) as a sanity check.
fullpaths

['data/raw/mpd.slice.100000-100999.json']

In [57]:
# ratios = []
# for playlist in playlists_data:
#     r = playlist['num_albums']/playlist['num_tracks']
#     ratios.append(r)
# sns.histplot(ratios)

def normalize_name(name):
    """Return a canonical form of ``name`` for comparing names.

    The text is lower-cased, each character in the punctuation class below is
    replaced by a space, runs of whitespace are collapsed to a single space,
    and leading/trailing whitespace is removed.
    """
    without_punctuation = re.sub(r"[.,\/#!$%\^\*;:{}=\_`~()@]", " ", name.lower())
    single_spaced = re.sub(r"\s+", " ", without_punctuation)
    return single_spaced.strip()

In [58]:
# Playlist filter thresholds, compared against the de-duplicated track, album
# and artist counts of each playlist. The filter treats every bound as
# exclusive: a playlist is skipped when a count is <= a minimum or >= the
# maximum.
min_tracks_per_playlist, max_tracks_per_playlist = 5, 150
min_albums_per_playlist = 5
min_artists_per_playlist = 5

In [59]:
# Co-occurrence counts between tracks: song_relationships[a][b] is the number
# of retained playlists that contain both track URI `a` and track URI `b`
# (a != b). Every retained playlist contributes one count per ordered pair.
song_relationships = {}
uri_map = {}  # not populated in this cell

for slice_idx, path in enumerate(fullpaths):
    if slice_idx % 5 == 0 and slice_idx > 0:
        print(f"Processed {slice_idx-5}-{slice_idx}")  # Print the range every 5 chunks
    # The context manager closes the file even if reading or parsing raises.
    # The encoding is explicit so the result does not depend on the platform
    # default (JSON text is UTF-8).
    with open(path, encoding="utf-8") as f:
        mpd_slice = json.load(f)
    playlists_data = mpd_slice['playlists']

    # A dedicated counter name keeps this loop from overwriting the slice
    # counter of the enclosing loop.
    for playlist_idx, playlist in enumerate(playlists_data):
        if playlist_idx % 1000 == 0 and playlist_idx > 0:
            print(f"Processed {playlist_idx - 1000}-{playlist_idx}")

        num_tracks = playlist['num_tracks']
        if num_tracks == 0:
            # Nothing to relate, and the ratio below would divide by zero.
            continue
        # Skip playlists whose album count is below 40% of their track count.
        r = playlist['num_albums'] / num_tracks
        if r < 0.40:
            continue

        # De-duplicated URIs, so a repeated track counts once per playlist.
        songs = set(track['track_uri'] for track in playlist['tracks'])
        t_per_p = len(songs)
        albums = set(track['album_uri'] for track in playlist['tracks'])
        alb_per_p = len(albums)
        artists = set(track['artist_uri'] for track in playlist['tracks'])
        art_per_p = len(artists)

        # All bounds are exclusive: a count equal to a threshold is rejected.
        if (
            t_per_p <= min_tracks_per_playlist
            or t_per_p >= max_tracks_per_playlist
            or alb_per_p <= min_albums_per_playlist
            or art_per_p <= min_artists_per_playlist
        ):
            continue

        for song in songs:
            related = song_relationships.setdefault(song, {})
            for s in songs:
                if s != song:  # avoid self-relationship
                    related[s] = related.get(s, 0) + 1

In [60]:
# Display metadata for every track URI seen in the selected slice files.
# All playlists are scanned here, with no filtering; when a URI occurs more
# than once, the last occurrence wins.
song_data_map = {}
for idx, path in enumerate(fullpaths):
    if idx % 5 == 0 and idx > 0:
        print(f"Processed {idx-5}-{idx}")  # Print the range every 5 chunks
    # The context manager closes the file even if reading or parsing raises.
    # The encoding is explicit so track and album names do not depend on the
    # platform default (JSON text is UTF-8).
    with open(path, encoding="utf-8") as f:
        mpd_slice = json.load(f)
    playlists_data = mpd_slice['playlists']
    for playlist in playlists_data:
        for track in playlist['tracks']:
            song_data_map[track['track_uri']] = {
                'song_name': track['track_name'],
                'album_name': track['album_name'],
            }


In [61]:
# Number of distinct track URIs that received a relationship entry.
print(len(song_relationships))

23834


In [62]:
# For every track, list its co-occurring tracks as (uri, count) pairs ordered
# from the highest count to the lowest; ties keep their original order.
song_relationships_sorted = {}
for song, relations in tqdm(song_relationships.items()):
    ranked = list(relations.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    song_relationships_sorted[song] = ranked


100%|██████████| 23834/23834 [00:00<00:00, 39134.45it/s]


In [63]:
# Sorting is done per track, so the key count should match song_relationships.
len(song_relationships_sorted)

23834

In [64]:
# Spot check: take the track at an arbitrary position in key order, print its
# metadata and URI, then display its five highest-count co-occurring tracks.
sample_position = 5000
track_uri = list(song_relationships_sorted)[sample_position]
print(song_data_map[track_uri], track_uri)
song_relationships_sorted[track_uri][:5]

{'song_name': 'Battle Cry (feat. Just Blaze)', 'album_name': 'Free Agent'} spotify:track:6vysopGjRMUsSneiVZJZUw


[('spotify:track:5C82dqoMqAibC8UBAi9Arf', 1),
 ('spotify:track:6mqmYfy7i8rBqgs3e2J0d4', 1),
 ('spotify:track:2bPmZjE90uoQgA9GkuwAQ1', 1),
 ('spotify:track:5tZmLjU4vhPlJb12IweZuB', 1),
 ('spotify:track:2K6FYgz38Kzih6BGJzfD0H', 1)]

In [65]:
# Find the track with the largest number of distinct co-occurring tracks.
# Ties resolve to the track encountered first; if no track has any
# connection the result stays (None, 0).
connection_counts = {
    uri: len(neighbours) for uri, neighbours in song_relationships.items()
}
song_with_most_connections = max(
    connection_counts, key=connection_counts.get, default=None
)
max_connections = connection_counts.get(song_with_most_connections, 0)
if max_connections == 0:
    song_with_most_connections = None

print("Song with the most connections:", song_data_map[song_with_most_connections])
print("Number of connections:", max_connections)


Song with the most connections: {'song_name': 'Mr. Brightside', 'album_name': 'Hot Fuss'}
Number of connections: 1923


In [66]:
# Assign each track URI a consecutive integer position (0, 1, 2, ...) in the
# key order of song_relationships_sorted.
song_indices = dict(
    zip(song_relationships_sorted, range(len(song_relationships_sorted)))
)

In [None]:
# Row-normalised co-occurrence matrix: entry (i, j) is track j's share of the
# total co-occurrence count of track i, so every filled row sums to 1.
num_songs = len(song_relationships_sorted)
transition_matrix = lil_matrix((num_songs, num_songs), dtype=np.float64)

for source_uri, ranked_neighbours in tqdm(song_relationships_sorted.items()):
    total_count = sum(weight for _, weight in ranked_neighbours)
    source_row = song_indices[source_uri]
    for neighbour_uri, weight in ranked_neighbours:
        transition_matrix[source_row, song_indices[neighbour_uri]] = weight / total_count

 67%|██████▋   | 16041/23834 [00:06<00:01, 3928.31it/s]

In [41]:
n = 30          # number of highest-popularity tracks to plot
seed = 0        # fixes the random start vector so re-runs give the same plot
num_steps = 20  # number of transition steps to apply

# NumPy functions such as np.dot are not aware of scipy sparse matrices, so
# the arithmetic below uses the sparse matrix's own methods. The LIL matrix is
# converted to CSR first, the format intended for arithmetic.
transition_csr = transition_matrix.tocsr()

# Column sums as a flat array. A sparse .sum(axis=0) returns a 1 x N
# np.matrix; without flattening, argsort(...)[-n:] would slice rows (there is
# only one) and keep every column instead of the top n.
popularity = np.asarray(transition_csr.sum(axis=0)).ravel()
top_n_indices = np.argsort(popularity)[-n:]

# Random start distribution: a 1 x num_songs row vector that sums to 1.
rng = np.random.default_rng(seed)
init = rng.random((1, num_songs))
init = init / np.sum(init)
probs = [init]
p = init
transition_T = transition_csr.T.tocsr()
for i in range(num_steps):
    # Row-vector step p @ P, computed as (P^T @ p^T)^T so that the sparse
    # matrix is the left operand of its own .dot method.
    p = transition_T.dot(p.T).T
    probs.append(p)

# Invert the URI -> index mapping that was used to fill the matrix, once,
# instead of rebuilding a list of all URIs for every plotted track.
uri_by_index = {index: uri for uri, index in song_indices.items()}

plot_data = []
for i in top_n_indices:
    song_name = song_data_map[uri_by_index[int(i)]]["song_name"]
    for step_num, step in enumerate(probs):
        plot_data.append({'Iteration': step_num, 'Probability': step[0, i], 'Song': song_name})

df = pd.DataFrame(plot_data)
fig = px.line(df, x='Iteration', y='Probability', color='Song', title='Convergence of Most Popular Songs')
fig.show()


KeyboardInterrupt

