In [16]:
import pandas as pd
import sqlite3
import numpy as np
import spacy
from tqdm import tqdm
import pickle
tqdm.pandas()

In [2]:
conn = sqlite3.connect('C:/spotify.db')

In [22]:
sql_query = '''
    SELECT 
        artist.artist_name,
        track.track_uri, 
        track.track_name
    FROM track
    JOIN album ON track.album_uri = album.album_uri
    JOIN artist ON album.artist_uri = artist.artist_uri
'''
track_df = pd.read_sql_query(sql_query, conn)
track_df

Unnamed: 0,artist_name,track_uri,track_name
0,Nina Simone,spotify:track:2HDwGSgOBJD9lE3BF5JEec,Sinnerman - Live In New York/1965
1,Nina Simone,spotify:track:61S0mEfdRtJ62Fc7fPqx0U,Feeling Good
2,Ray Charles,spotify:track:2xar08Fq5xra2KKZs5Bw9j,I've Got A Woman
3,Frank Sinatra,spotify:track:2y8Eez5cFFf2JzD546LThM,Fly Me To The Moon
4,Frank Sinatra,spotify:track:70oCQbSwh1a2iY08n3WfoD,That's Life
...,...,...,...
2262287,Ludwig van Beethoven,spotify:track:0j7J1eAab3ZxGssDi6zDS6,"Piano Sonata No.8 In C Minor, Op.13 - ""Patheti..."
2262288,Johann Sebastian Bach,spotify:track:27Lw0hWfvxWhVrC5dkGI7o,"French Suite No. 4 in E-flat Major, BWV815: i...."
2262289,London Conchord Ensemble,spotify:track:2w6qSS5stCN9CXZpqVJ2AH,"Bach Concerto in D Minor for Two Violins, BWV ..."
2262290,Maurice Ravel,spotify:track:2yE3vqnLHCvf5kxg0DliUT,Ma mere l'oye (Mother Goose) (version for orch...


In [20]:
with open(r"C:\Users\Sam\spotify\song_embeddings.pkl", 'rb') as f:
    embeddings_dict = pickle.load(f)

In [26]:
embeddings_dict["spotify:track:1vrd6UOGamcKNGnSHJQlSt"]

array([-0.36927438,  0.9229998 ,  0.78838503,  0.50456417, -0.5490132 ,
       -1.0426029 ,  0.35952175, -0.608865  ,  0.10243092,  0.2649767 ,
        0.03310614,  0.31028834, -0.7059179 ,  1.1078572 , -0.16028902,
       -0.931207  , -0.08627288, -1.2794883 ,  0.36061698, -0.15389757,
        1.3357977 , -1.2118173 ,  0.49112654, -0.86701137, -0.55779487,
        0.8887201 , -1.0586736 ,  0.7145269 ,  0.30209488, -0.96451336,
        0.10157688, -0.826951  ,  1.2119412 , -0.24279037, -0.44545886,
        0.6405409 , -1.0130908 , -0.6756321 , -1.1218574 , -0.7544207 ,
        0.21352383,  1.1315243 ,  0.87437296,  0.08632185,  0.19946931,
        0.5584908 , -0.32062554, -0.99118674, -0.45849773,  0.6986598 ,
       -0.71683717,  0.1918522 ,  0.46195802, -0.64970446, -0.6656138 ,
       -0.11017503, -0.39030185, -0.32123494,  0.31253478,  0.9219208 ,
       -0.72206265, -0.54723114, -0.00992155,  0.14516163], dtype=float32)

In [28]:
track_uris, embeddings = zip(*embeddings_dict.items())
track_uris = np.array(track_uris)
embeddings_matrix = np.stack([value for value in embeddings])
global_average_embedding = np.mean(embeddings_matrix, axis=0)

In [34]:
def get_closest(artist: str, track: str, n: int):
    track_emb = track_df[(track_df["artist_name"] == artist) & (track_df["track_name"] == track)]["track_uri"].values[0]
    scores = np.dot(embeddings_matrix, track_emb)
    top_n_indices = np.argpartition(scores, n)[:n]
    top_n_track_uris = track_uris[top_n_indices].tolist()
    return track_df[track_df["track_uri"].isin(top_n_track_uris)]

In [35]:
get_closest("Taylor Swift", "Love Story", 50)

MemoryError: Unable to allocate 17.3 GiB for an array with shape (2262292, 64) and data type <U32

In [3]:
sql_query = '''
    SELECT 
        playlist.pid, 
        playlist.playlist_name,
        playlist.num_tracks 
    FROM playlist
'''
df = pd.read_sql_query(sql_query, conn)
df

Unnamed: 0,pid,playlist_name,num_tracks
0,0,Throwbacks,52
1,1,Awesome Playlist,39
2,2,korean,64
3,3,mat,126
4,4,90s,17
...,...,...,...
999995,999995,Praise,32
999996,999996,Worship,122
999997,999997,Sex,24
999998,999998,✝️,11


In [4]:
nlp = spacy.load('en_core_web_lg')

def word2vec(word: str):
    return nlp(word).vector

In [5]:
# convert the name of the playlist into an embedding
def name_to_vec(name: str):
    return np.mean([word2vec(word) for word in name.split()], axis=0)

In [6]:
df = df[df["num_tracks"] > 5]
len(df)

994587

In [7]:
df = df.sample(10_000)

In [8]:
df["emb"] = df["playlist_name"].progress_apply(name_to_vec)

100%|██████████| 10000/10000 [01:15<00:00, 133.01it/s]


In [9]:
df

Unnamed: 0,pid,playlist_name,num_tracks,emb
995463,995463,Classic Rock,174,"[-2.41615, -2.206575, 3.4168, -0.4584, 4.15445..."
566390,566390,Sweat like the Pros Playlist,31,"[-0.92302406, -0.7314321, -0.06482003, 0.73863..."
584832,584832,Freedom,16,"[0.23445, -1.5379, -0.79468, 2.3486, 4.5527, -..."
367739,367739,alexa,57,"[1.5555, 1.3354, -0.8374, 0.63024, -2.3189, 0...."
40116,40116,Te Amo,114,"[-0.177225, 3.1457, 4.102245, -5.8094997, -3.2..."
...,...,...,...,...
201185,201185,background,25,"[-0.65939, -1.4118, -2.5557, 0.61374, 1.7389, ..."
833628,833628,My Songs,204,"[2.31425, -0.36640006, -5.976115, -4.87505, 3...."
783545,783545,Love,33,"[-0.83787, -1.9779, 1.3557, -1.231, 3.4842, -3..."
596550,596550,My Country,19,"[3.691945, 1.078375, -4.4241, -4.7962503, 4.36..."


In [11]:
embeddings = df["playlist_name"].progress_apply(name_to_vec)

100%|██████████| 10000/10000 [01:14<00:00, 134.63it/s]


In [None]:
embeddings.numpy()

In [10]:
df.iloc[100]

pid                                                         479230
playlist_name                                                greys
num_tracks                                                      12
emb              [-1.7195, 1.4451, -2.6019, 0.23826, -1.6756, -...
Name: 479230, dtype: object