In [12]:
import sqlite3
import pandas as pd
from sklearn.decomposition import TruncatedSVD

# Connect to the SQLite database
conn = sqlite3.connect('../data_storage/spotify.db')

# SQL query to retrieve playlist tracks with track_uri, track_name, and pid
query = '''
    SELECT pt.pid, pt.track_uri, t.track_name
    FROM playlist_track pt
    JOIN track t ON pt.track_uri = t.track_uri
    WHERE pt.pid IN (SELECT pid FROM playlist LIMIT 5000)
'''

playlist_tracks = pd.read_sql_query(query, conn)
print(playlist_tracks)

conn.close()

co_occurrence_matrix = pd.crosstab(playlist_tracks['track_uri'], playlist_tracks['pid'])
print(co_occurrence_matrix)

n_components = min(co_occurrence_matrix.shape)  
print(n_components)
svd = TruncatedSVD(n_components=n_components, random_state=42)
song_embeddings = svd.fit_transform(co_occurrence_matrix)
print(song_embeddings)
# Display the shape of the resulting song embeddings
print("Shape of Song Embeddings:", song_embeddings.shape)



         pid                             track_uri  \
0       2093  spotify:track:000VZqvXwT0YNqKk7iG2GS   
1        371  spotify:track:000mA0etY38nKdvf1N04af   
2        182  spotify:track:000xQL6tZNLJzIrtIgxqSl   
3        813  spotify:track:000xQL6tZNLJzIrtIgxqSl   
4       1011  spotify:track:000xQL6tZNLJzIrtIgxqSl   
...      ...                                   ...   
330338   467  spotify:track:7zz1drChhd4hQBiGSnLRBZ   
330339  1006  spotify:track:7zzAWJ8aD50WH1EjGC2j45   
330340   454  spotify:track:7zzBEZBTJejWeL6EqWmCD9   
330341  3486  spotify:track:7zzBEZBTJejWeL6EqWmCD9   
330342  3809  spotify:track:7zzLt6Z9y7jMvXnEg00n58   

                              track_name  
0                                  Mercy  
1       If I Gave Myself To Someone Else  
2                         Still Got Time  
3                         Still Got Time  
4                         Still Got Time  
...                                  ...  
330338                      Od Yihye Tov  
330339 

In [None]:
from sklearn.cluster import KMeans
# Perform K-means clustering on song embeddings
num_clusters = 25  # Example: Number of clusters
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(song_embeddings)

# Filter playlist_tracks to include only relevant 'pid' values
valid_pids = co_occurrence_matrix.index.unique()
playlist_tracks_subset = playlist_tracks.drop_duplicates(subset='track_uri')

# Reset the index of the subset DataFrame
playlist_tracks_subset.reset_index(drop=True, inplace=True)
# Display original playlist_tracks DataFrame
print("\nOriginal playlist_tracks DataFrame:")
print(playlist_tracks)

# Display filtered playlist_tracks_subset DataFrame
print("\nFiltered playlist_tracks_subset DataFrame:")
print(playlist_tracks_subset)

# Map track names to cluster labels
playlist_tracks_subset['cluster_label'] = cluster_labels

print(playlist_tracks_subset)

# Print track names within each cluster
for cluster_id in range(num_clusters):
    print(f"\nCluster {cluster_id}:")
    tracks_in_cluster = playlist_tracks_subset[playlist_tracks_subset['cluster_label'] == cluster_id]['track_name']
    for i, track_name in enumerate(tracks_in_cluster):
        if i < 10:
            print(f"- {track_name}")
        else:
            break


Original playlist_tracks DataFrame:
       pid                             track_uri  \
0      371  spotify:track:000mA0etY38nKdvf1N04af   
1      182  spotify:track:000xQL6tZNLJzIrtIgxqSl   
2      813  spotify:track:000xQL6tZNLJzIrtIgxqSl   
3      999  spotify:track:006AVH7fq061voGXkUiII4   
4      743  spotify:track:006PJvsr6CyV3JdBf7wiNF   
...    ...                                   ...   
66716  559  spotify:track:7zxRMhXxJMQCeDDg0rKAVo   
66717  940  spotify:track:7zxRMhXxJMQCeDDg0rKAVo   
66718  984  spotify:track:7zxRMhXxJMQCeDDg0rKAVo   
66719  467  spotify:track:7zz1drChhd4hQBiGSnLRBZ   
66720  454  spotify:track:7zzBEZBTJejWeL6EqWmCD9   

                             track_name  
0      If I Gave Myself To Someone Else  
1                        Still Got Time  
2                        Still Got Time  
3                            Fingertips  
4                 Crumble the Satellite  
...                                 ...  
66716                          Some Way  
66

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  playlist_tracks_subset['cluster_label'] = cluster_labels
