In [1]:
import sqlite3
import pandas as pd
from sklearn.decomposition import TruncatedSVD

# Connect to the SQLite database
conn = sqlite3.connect('../data_storage/spotify.db')

# SQL query to retrieve playlist tracks with track_uri, track_name, and pid
query = '''
    SELECT pt.pid, pt.track_uri, t.track_name
    FROM playlist_track pt
    JOIN track t ON pt.track_uri = t.track_uri
    WHERE pt.pid IN (SELECT pid FROM playlist LIMIT 2500)
'''

playlist_tracks = pd.read_sql_query(query, conn)
print(playlist_tracks)

conn.close()

co_occurrence_matrix = pd.crosstab(playlist_tracks['track_uri'], playlist_tracks['pid'])
print(co_occurrence_matrix)

n_components = min(co_occurrence_matrix.shape)  
print(n_components)
svd = TruncatedSVD(n_components=n_components, random_state=42)
song_embeddings = svd.fit_transform(co_occurrence_matrix)
print(song_embeddings)
# Display the shape of the resulting song embeddings
print("Shape of Song Embeddings:", song_embeddings.shape)



         pid                             track_uri  \
0       2093  spotify:track:000VZqvXwT0YNqKk7iG2GS   
1        371  spotify:track:000mA0etY38nKdvf1N04af   
2        182  spotify:track:000xQL6tZNLJzIrtIgxqSl   
3        813  spotify:track:000xQL6tZNLJzIrtIgxqSl   
4       1011  spotify:track:000xQL6tZNLJzIrtIgxqSl   
...      ...                                   ...   
165575  2355  spotify:track:7zxRMhXxJMQCeDDg0rKAVo   
165576  2444  spotify:track:7zxRMhXxJMQCeDDg0rKAVo   
165577   467  spotify:track:7zz1drChhd4hQBiGSnLRBZ   
165578  1006  spotify:track:7zzAWJ8aD50WH1EjGC2j45   
165579   454  spotify:track:7zzBEZBTJejWeL6EqWmCD9   

                              track_name  
0                                  Mercy  
1       If I Gave Myself To Someone Else  
2                         Still Got Time  
3                         Still Got Time  
4                         Still Got Time  
...                                  ...  
165575                          Some Way  
165576 

In [2]:
from sklearn.cluster import KMeans

num_clusters = 50  
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(song_embeddings)

valid_pids = co_occurrence_matrix.index.unique()
playlist_tracks_subset = playlist_tracks.drop_duplicates(subset='track_uri')

playlist_tracks_subset.reset_index(drop=True, inplace=True)
print("\nOriginal playlist_tracks DataFrame:")
print(playlist_tracks)

print("\nFiltered playlist_tracks_subset DataFrame:")
print(playlist_tracks_subset)

playlist_tracks_subset['cluster_label'] = cluster_labels

print(playlist_tracks_subset)

for cluster_id in range(num_clusters):
    print(f"\nCluster {cluster_id}:")
    tracks_in_cluster = playlist_tracks_subset[playlist_tracks_subset['cluster_label'] == cluster_id]['track_name']
    for i, track_name in enumerate(tracks_in_cluster):
        if i < 10:
            print(f"- {track_name}")
        else:
            break


Original playlist_tracks DataFrame:
         pid                             track_uri  \
0       2093  spotify:track:000VZqvXwT0YNqKk7iG2GS   
1        371  spotify:track:000mA0etY38nKdvf1N04af   
2        182  spotify:track:000xQL6tZNLJzIrtIgxqSl   
3        813  spotify:track:000xQL6tZNLJzIrtIgxqSl   
4       1011  spotify:track:000xQL6tZNLJzIrtIgxqSl   
...      ...                                   ...   
165575  2355  spotify:track:7zxRMhXxJMQCeDDg0rKAVo   
165576  2444  spotify:track:7zxRMhXxJMQCeDDg0rKAVo   
165577   467  spotify:track:7zz1drChhd4hQBiGSnLRBZ   
165578  1006  spotify:track:7zzAWJ8aD50WH1EjGC2j45   
165579   454  spotify:track:7zzBEZBTJejWeL6EqWmCD9   

                              track_name  
0                                  Mercy  
1       If I Gave Myself To Someone Else  
2                         Still Got Time  
3                         Still Got Time  
4                         Still Got Time  
...                                  ...  
165575       

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  playlist_tracks_subset['cluster_label'] = cluster_labels


In [3]:
for cluster_id in range(num_clusters):
    print(f"\nCluster {cluster_id}:")
    tracks_in_cluster = playlist_tracks_subset[playlist_tracks_subset['cluster_label'] == cluster_id]['track_name']
    for i, track_name in enumerate(tracks_in_cluster):
        if i < 25:
            print(f"- {track_name}")
        else:
            break


Cluster 0:
- Satisfy You
- One More
- Don't Let Go (Love)
- Making Love (Into the Night)
- Unpretty
- Runnin' (Lose It All)
- Like I Do
- Again
- Stormy Monday
- A Change Is Gonna Come
- JAY Z Blue
- Love Is My Disease
- StreetLove
- Ta en bild / Lotus
- Fall For Your Type
- I Go Sailing
- Sky's The Limit (feat. 112) - 2014 Remastered Version
- Insight XX
- Bloodstream
- Babyfather
- Teach U a Lesson
- Addicted
- Strip For You
- Hey Lover
- I Love You

Cluster 1:
- U With Me?
- Bloodstream
- Interlude
- Behind the Light
- The Heathen
- How Great (feat. Jay Electronica & My cousin Nicole)
- Get Thy Bearings
- Crew Love
- Feel Good Inc
- Transits
- Amsterdam
- Everything
- Fantasy
- Shutdown
- Gang Related
- Take Me Home - Radio Edit
- Shades Of Blue
- Ten Crack Commandments - 2014 Remastered Version
- Favorite Color
- By Any Means
- Black Lip Bastard (Rmx) (feat. Black Hippy)
- Flex
- Frank's Track
- Hedron
- I Told You/ Another One

Cluster 2:
- Apologies Are For The Weak
- Blank Spac

In [4]:
import sqlite3
import pandas as pd
from gensim.models import Word2Vec

conn = sqlite3.connect('../data_storage/spotify.db')

query = '''
    SELECT pt.pid, pt.track_uri, t.track_name
    FROM playlist_track pt
    JOIN track t ON pt.track_uri = t.track_uri
    WHERE pt.pid IN (SELECT pid FROM playlist LIMIT 500000)
'''

playlist_tracks = pd.read_sql_query(query, conn)
conn.close()

playlists = playlist_tracks.groupby('pid')['track_uri'].apply(list).tolist()

model = Word2Vec(sentences=playlists, vector_size=100, window=5, min_count=1, sg=1)

track_embeddings = {track_uri: model.wv[track_uri] for track_uri in playlist_tracks['track_uri'].unique()}

example_track_uri = 'example_track_uri'
if example_track_uri in track_embeddings:
    print(f"Embedding for '{example_track_uri}':")
    print(track_embeddings[example_track_uri])

print("Shape of Song Embeddings:", len(track_embeddings), model.vector_size)


Shape of Song Embeddings: 1610661 100


In [5]:

import numpy as np
embeddings_array = np.array([track_embeddings[track_uri] for track_uri in playlist_tracks_subset['track_uri']])

num_clusters = 50

kmeans = KMeans(n_clusters=num_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(embeddings_array)

playlist_tracks_subset['cluster_label'] = cluster_labels

for cluster_id in range(num_clusters):
    print(f"\nCluster {cluster_id}:")
    tracks_in_cluster = playlist_tracks_subset[playlist_tracks_subset['cluster_label'] == cluster_id]['track_name']
    for i, track_name in enumerate(tracks_in_cluster):
        if i < 10:
            print(f"- {track_name}")
        else:
            break



Cluster 0:
- If I Gave Myself To Someone Else
- LLorándote
- Crumble the Satellite
- Memory Lane
- Magic Carpet/Parting The Seas
- Nightcore This - DJ Edit
- Not so Little
- Agua Loca
- Open Fields of Grace
- El Morro

Cluster 1:
- Karate
- Turn up the Speakers - Original Mix
- The Spook Returns
- Paris
- Somebody To Love - Radio Edit
- Emergency
- Paralyzed
- Pillz
- Crossroad (feat. Danyka Nadeau)
- The Hum - Dimitri Vegas & Like Mike Vs. Ummet Ozcan / Extended Mix

Cluster 2:
- No Effort
- Seven Million (feat. Future)
- Romeo & Juliet
- 420 Vibe
- Blow My High (Members Only)
- Take It Or Leave It
- Landslide
- No Lies (feat. Wiz Khalifa)
- No Complaints
- Faneto

Cluster 3:
- Nimble Girl
- Further Out
- Can't Do Without You
- Fireworks
- You Say I'm in Love
- Birds of a Feather, We Rock Together (feat. Antwaun Stanley)
- I'm Getting Ready
- I Get Ideas
- Naomi
- Pachuca Sunrise

Cluster 4:
- Psychic City
- Let Me Touch Your Fire
- U Don't Know - Slander Remix
- Rude
- Frontlines
- 