In [1]:
import sqlite3
import pandas as pd
from sklearn.decomposition import TruncatedSVD

# Connect to the SQLite database
conn = sqlite3.connect('../data_storage/spotify.db')

# SQL query to retrieve playlist tracks with track_uri, track_name, and pid
query = '''
    SELECT pt.pid, pt.track_uri, t.track_name
    FROM playlist_track pt
    JOIN track t ON pt.track_uri = t.track_uri
    WHERE pt.pid IN (SELECT pid FROM playlist LIMIT 2500)
'''

playlist_tracks = pd.read_sql_query(query, conn)
print(playlist_tracks)

conn.close()

co_occurrence_matrix = pd.crosstab(playlist_tracks['track_uri'], playlist_tracks['pid'])
print(co_occurrence_matrix)

n_components = min(co_occurrence_matrix.shape)  
print(n_components)
svd = TruncatedSVD(n_components=n_components, random_state=42)
song_embeddings = svd.fit_transform(co_occurrence_matrix)
print(song_embeddings)
# Display the shape of the resulting song embeddings
print("Shape of Song Embeddings:", song_embeddings.shape)



         pid                             track_uri  \
0       2093  spotify:track:000VZqvXwT0YNqKk7iG2GS   
1        371  spotify:track:000mA0etY38nKdvf1N04af   
2        182  spotify:track:000xQL6tZNLJzIrtIgxqSl   
3        813  spotify:track:000xQL6tZNLJzIrtIgxqSl   
4       1011  spotify:track:000xQL6tZNLJzIrtIgxqSl   
...      ...                                   ...   
165575  2355  spotify:track:7zxRMhXxJMQCeDDg0rKAVo   
165576  2444  spotify:track:7zxRMhXxJMQCeDDg0rKAVo   
165577   467  spotify:track:7zz1drChhd4hQBiGSnLRBZ   
165578  1006  spotify:track:7zzAWJ8aD50WH1EjGC2j45   
165579   454  spotify:track:7zzBEZBTJejWeL6EqWmCD9   

                              track_name  
0                                  Mercy  
1       If I Gave Myself To Someone Else  
2                         Still Got Time  
3                         Still Got Time  
4                         Still Got Time  
...                                  ...  
165575                          Some Way  
165576 

In [2]:
from sklearn.cluster import KMeans

num_clusters = 25  
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(song_embeddings)

valid_pids = co_occurrence_matrix.index.unique()
playlist_tracks_subset = playlist_tracks.drop_duplicates(subset='track_uri')

playlist_tracks_subset.reset_index(drop=True, inplace=True)
print("\nOriginal playlist_tracks DataFrame:")
print(playlist_tracks)

print("\nFiltered playlist_tracks_subset DataFrame:")
print(playlist_tracks_subset)

playlist_tracks_subset['cluster_label'] = cluster_labels

print(playlist_tracks_subset)

for cluster_id in range(num_clusters):
    print(f"\nCluster {cluster_id}:")
    tracks_in_cluster = playlist_tracks_subset[playlist_tracks_subset['cluster_label'] == cluster_id]['track_name']
    for i, track_name in enumerate(tracks_in_cluster):
        if i < 10:
            print(f"- {track_name}")
        else:
            break


Original playlist_tracks DataFrame:
         pid                             track_uri  \
0       2093  spotify:track:000VZqvXwT0YNqKk7iG2GS   
1        371  spotify:track:000mA0etY38nKdvf1N04af   
2        182  spotify:track:000xQL6tZNLJzIrtIgxqSl   
3        813  spotify:track:000xQL6tZNLJzIrtIgxqSl   
4       1011  spotify:track:000xQL6tZNLJzIrtIgxqSl   
...      ...                                   ...   
165575  2355  spotify:track:7zxRMhXxJMQCeDDg0rKAVo   
165576  2444  spotify:track:7zxRMhXxJMQCeDDg0rKAVo   
165577   467  spotify:track:7zz1drChhd4hQBiGSnLRBZ   
165578  1006  spotify:track:7zzAWJ8aD50WH1EjGC2j45   
165579   454  spotify:track:7zzBEZBTJejWeL6EqWmCD9   

                              track_name  
0                                  Mercy  
1       If I Gave Myself To Someone Else  
2                         Still Got Time  
3                         Still Got Time  
4                         Still Got Time  
...                                  ...  
165575       

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  playlist_tracks_subset['cluster_label'] = cluster_labels


In [3]:
for cluster_id in range(num_clusters):
    print(f"\nCluster {cluster_id}:")
    tracks_in_cluster = playlist_tracks_subset[playlist_tracks_subset['cluster_label'] == cluster_id]['track_name']
    for i, track_name in enumerate(tracks_in_cluster):
        if i < 25:
            print(f"- {track_name}")
        else:
            break


Cluster 0:
- PILLOWTALK
- oui
- Don't Let Me Down
- This Is What You Came For
- 2 Phones
- Too Good
- Never Be Like You
- Work
- I Took A Pill In Ibiza - Seeb Remix
- Needed Me
- My Way (feat. Monty)
- Antidote
- One Dance
- The Hills
- Jumpman
- Cheap Thrills
- Hotline Bling
- Flex (Ooh, Ooh, Ooh)
- Ride
- Cake By The Ocean
- Trap Queen
- Hands To Myself
- Stressed Out
- Again
- Love Yourself

Cluster 1:
- Divine Romance
- Be Thou My Vision
- 'tis So Sweet To Trust In Jesus
- You Are Mine (Isaiah 43) [Remastered]
- Oh God
- Jesus Paid It All
- The Love of God
- Empty My Soul
- Explode My Soul
- What Our God Has Done
- You're Never Giving Up
- How Deep the Father's Love
- Hallelujah! What a Savior
- Arise
- In Christ Alone
- All Creatures
- Lily's Song (Praise The Lord) - Live
- Light Will Dawn
- Benediction
- It Amazes Me (Live)
- In the Family
- Worship Echos (Spoken Message)
- Come Like the Wind
- Wreckingball
- Beautiful (feat. Jake Stevens)

Cluster 2:
- Mercy
- If I Gave Myself 