In [2]:
import pandas as pd
import sqlite3
import numpy as np
from tqdm import tqdm
import pickle
tqdm.pandas()

In [3]:
conn = sqlite3.connect(r'C:\Users\liamb\Downloads\spotify.db')

In [4]:
sql_query = '''
    SELECT 
        artist.artist_name,
        track.track_uri, 
        track.track_name
    FROM track
    JOIN album ON track.album_uri = album.album_uri
    JOIN artist ON album.artist_uri = artist.artist_uri
'''
track_df = pd.read_sql_query(sql_query, conn)
track_df

Unnamed: 0,artist_name,track_uri,track_name
0,Nina Simone,spotify:track:2HDwGSgOBJD9lE3BF5JEec,Sinnerman - Live In New York/1965
1,Nina Simone,spotify:track:61S0mEfdRtJ62Fc7fPqx0U,Feeling Good
2,Ray Charles,spotify:track:2xar08Fq5xra2KKZs5Bw9j,I've Got A Woman
3,Frank Sinatra,spotify:track:2y8Eez5cFFf2JzD546LThM,Fly Me To The Moon
4,Frank Sinatra,spotify:track:70oCQbSwh1a2iY08n3WfoD,That's Life
...,...,...,...
2262287,Ludwig van Beethoven,spotify:track:0j7J1eAab3ZxGssDi6zDS6,"Piano Sonata No.8 In C Minor, Op.13 - ""Patheti..."
2262288,Johann Sebastian Bach,spotify:track:27Lw0hWfvxWhVrC5dkGI7o,"French Suite No. 4 in E-flat Major, BWV815: i...."
2262289,London Conchord Ensemble,spotify:track:2w6qSS5stCN9CXZpqVJ2AH,"Bach Concerto in D Minor for Two Violins, BWV ..."
2262290,Maurice Ravel,spotify:track:2yE3vqnLHCvf5kxg0DliUT,Ma mere l'oye (Mother Goose) (version for orch...


In [5]:
with open(r"C:\Users\liamb\Documents\Spotify-Million-Playlist-Challenge\song_embeddings.pkl", 'rb') as f:
    embeddings_dict = pickle.load(f)

In [6]:
track_uris, embeddings = zip(*embeddings_dict.items())
track_uris = np.array(track_uris)
embeddings_matrix = np.stack([value for value in embeddings])
global_average_embedding = np.mean(embeddings_matrix, axis=0)

In [11]:
def get_closest(artist: str, track: str, n: int):
    print("Finding track uri")
    track_uri = track_df[(track_df["artist_name"] == artist) & (track_df["track_name"] == track)]["track_uri"].values[0]
    print(f"Track URI: {track_uri}")
    print("Getting track embedding")
    track_emb = embeddings_dict[track_uri]
    print("Calculating scores")
    scores = np.dot(embeddings_matrix, track_emb)
    print("Getting top n indices")
    top_n_indices = np.argpartition(scores, n)[:n]
    print("Getting top n track uris")
    top_n_track_uris = track_uris[top_n_indices].tolist()
    print("Getting top n track data using URIs")
    return track_df[track_df["track_uri"].isin(top_n_track_uris)]

In [12]:
get_closest("Taylor Swift", "Love Story", 50)

Finding track uri
Track URI: spotify:track:1vrd6UOGamcKNGnSHJQlSt
Getting track embedding
Calculating scores
Getting top n indices
Getting top n track uris
Getting top n track data using URIs


Unnamed: 0,artist_name,track_uri,track_name
28045,Ali Farka Touré,spotify:track:7cavYwUJmzqjWozykoPQFg,Ruby
137559,Jorge Ben Jor,spotify:track:5rjSKhSHGsKP2RJtbRWQkl,Take It Easy My Brother Charles
320925,The Budos Band,spotify:track:5zH2THXQhCQJhwXBIxkPCA,Black Venom
342718,Windhand,spotify:track:3bZG5LMoqBiXDwSSi2p2VN,Orchard
364702,Truckfighters,spotify:track:4RB0T9OOjyDnDhyuXKUtpH,Desert Cruiser
416755,CrossFit Junkies,spotify:track:7yAVdA9ELDHP3PNvGMxlka,Ice Ice Baby (Crossfit Mix)
578974,Tesla,spotify:track:7vYqk8gXAspGFgKXPcAytg,Make It Last
621983,Daryl Hall,spotify:track:7i0Cll8Rn5at9L6eeM7LTS,Why Was It So Easy
766818,Kevin Gordon,spotify:track:7mkln8h6YmUoqQSgw0zqat,Walking on the Levee
814086,Neal Claed,spotify:track:7mgzaZ6B86CWyQtAEzxGwN,Jack It - Radio Edit


In [13]:
import sqlite3
import pandas as pd
from gensim.models import Word2Vec

query = '''
SELECT PT.pid, PT.track_uri, PT.pos
FROM playlist_track PT
INNER JOIN (
SELECT track_uri
FROM playlist_track
GROUP BY track_uri
HAVING COUNT(pid) > 1000
) AS PT2 ON PT.track_uri = PT2.track_uri
WHERE pid < 1000000
'''
playlist_tracks = pd.read_sql_query(query, conn)
playlists = playlist_tracks.groupby('pid')['track_uri'].apply(list).tolist()

KeyboardInterrupt: 

In [None]:
model = Word2Vec(sentences=playlists, vector_size=100, window=5, min_count=1, sg=1)
track_embeddings = {track_uri: model.wv[track_uri] for track_uri in playlist_tracks['track_uri'].unique()}

In [None]:
track_uris, embeddings = zip(*track_embeddings.items())
track_uris = np.array(track_uris)
embeddings_matrix = np.stack([value for value in embeddings])
global_average_embedding = np.mean(embeddings_matrix, axis=0)

In [None]:
def get_closest(artist: str, track: str, n: int):
    print("Finding track uri")
    track_uri = track_df[(track_df["artist_name"] == artist) & (track_df["track_name"] == track)]["track_uri"].values[0]
    print(f"Track URI: {track_uri}")
    print("Getting track embedding")
    track_emb = track_embeddings[track_uri]
    print("Calculating scores")
    scores = np.dot(embeddings_matrix, track_emb)
    print("Getting top n indices")
    top_n_indices = np.argpartition(scores, n)[:n]
    print("Getting top n track uris")
    top_n_track_uris = track_uris[top_n_indices].tolist()
    print("Getting top n track data using URIs")
    return track_df[track_df["track_uri"].isin(top_n_track_uris)]

get_closest("Taylor Swift", "Love Story", 50)

In [3]:
sql_query = '''
    SELECT 
        playlist.pid, 
        playlist.playlist_name,
        playlist.num_tracks 
    FROM playlist
'''
df = pd.read_sql_query(sql_query, conn)
df

Unnamed: 0,pid,playlist_name,num_tracks
0,0,Throwbacks,52
1,1,Awesome Playlist,39
2,2,korean,64
3,3,mat,126
4,4,90s,17
...,...,...,...
999995,999995,Praise,32
999996,999996,Worship,122
999997,999997,Sex,24
999998,999998,✝️,11


In [4]:
nlp = spacy.load('en_core_web_lg')

def word2vec(word: str):
    return nlp(word).vector

In [5]:
# convert the name of the playlist into an embedding
def name_to_vec(name: str):
    return np.mean([word2vec(word) for word in name.split()], axis=0)

In [6]:
df = df[df["num_tracks"] > 5]
len(df)

994587

In [7]:
df = df.sample(10_000)

In [8]:
df["emb"] = df["playlist_name"].progress_apply(name_to_vec)

100%|██████████| 10000/10000 [01:15<00:00, 133.01it/s]


In [9]:
df

Unnamed: 0,pid,playlist_name,num_tracks,emb
995463,995463,Classic Rock,174,"[-2.41615, -2.206575, 3.4168, -0.4584, 4.15445..."
566390,566390,Sweat like the Pros Playlist,31,"[-0.92302406, -0.7314321, -0.06482003, 0.73863..."
584832,584832,Freedom,16,"[0.23445, -1.5379, -0.79468, 2.3486, 4.5527, -..."
367739,367739,alexa,57,"[1.5555, 1.3354, -0.8374, 0.63024, -2.3189, 0...."
40116,40116,Te Amo,114,"[-0.177225, 3.1457, 4.102245, -5.8094997, -3.2..."
...,...,...,...,...
201185,201185,background,25,"[-0.65939, -1.4118, -2.5557, 0.61374, 1.7389, ..."
833628,833628,My Songs,204,"[2.31425, -0.36640006, -5.976115, -4.87505, 3...."
783545,783545,Love,33,"[-0.83787, -1.9779, 1.3557, -1.231, 3.4842, -3..."
596550,596550,My Country,19,"[3.691945, 1.078375, -4.4241, -4.7962503, 4.36..."


In [11]:
embeddings = df["playlist_name"].progress_apply(name_to_vec)

100%|██████████| 10000/10000 [01:14<00:00, 134.63it/s]


In [None]:
embeddings.numpy()

In [10]:
df.iloc[100]

pid                                                         479230
playlist_name                                                greys
num_tracks                                                      12
emb              [-1.7195, 1.4451, -2.6019, 0.23826, -1.6756, -...
Name: 479230, dtype: object