In [3]:
import pandas as pd
import numpy as np
import random

# Load your dataset (assuming it's a CSV file)
data = pd.read_csv('dataset.csv')

# Group by 'artists' and calculate the mean of the other columns
data_grouped = data.groupby('artists').agg(
    mean_danceability=('danceability', 'mean'),
    mean_energy=('energy', 'mean'),
    mean_acousticness=('acousticness', 'mean'),
    mean_instrumentalness=('instrumentalness', 'mean'),
    mean_liveness=('liveness', 'mean'),
    mean_speechiness=('speechiness', 'mean'),
    mean_valence=('valence', 'mean')
).reset_index()

# Create the new data point (cluster center, what user will "input")
new_data = {
    'mean_danceability': 0.8, 
    'mean_energy': 0.7, 
    'mean_acousticness': 0.1, 
    'mean_instrumentalness': 0.05, 
    'mean_liveness': 0.4, 
    'mean_speechiness': 0.2, 
    'mean_valence': 0.6
}

# Extract the predictors from the grouped data
grouped_predictors = data_grouped[[
    'mean_danceability', 'mean_energy', 'mean_acousticness', 
    'mean_instrumentalness', 'mean_liveness', 'mean_speechiness', 'mean_valence'
]]

# Calculate Euclidean distances between each artist's feature values and the new_data
distances = np.sqrt(((grouped_predictors - pd.Series(new_data)) ** 2).sum(axis=1))

# Add the distances to the original data
data_grouped['distance_to_new_data'] = distances

# Sort the data by the distance and return the top 20 closest artists
top_20_closest_artists = data_grouped.sort_values(by='distance_to_new_data').head(20)

# View the result
display(top_20_closest_artists)

# Select a random artist from the 'artists' column of top 20 closest artists
random_artist = random.choice(top_20_closest_artists['artists'].values)

# Create a dataframe filtering for just the selected artist
artist_songs = data[data['artists'] == random_artist]

# Select predictors from artist's songs
song_predictors = artist_songs[[
    'danceability', 'energy', 'acousticness', 
    'instrumentalness', 'liveness', 'speechiness', 'valence'
]]

# Calculate Euclidean distances between each song's feature values and the new_data (user input)
song_distances = np.sqrt(((song_predictors - pd.Series(new_data)) ** 2).sum(axis=1))

# Add the distances to the original data
artist_songs.loc[:, 'distance_to_new_data'] = song_distances

# Sort the data by the distance and return the top song
top_song = artist_songs.sort_values(by='distance_to_new_data').head(1)

# View the result
display(top_song)

Unnamed: 0,artists,mean_danceability,mean_energy,mean_acousticness,mean_instrumentalness,mean_liveness,mean_speechiness,mean_valence,distance_to_new_data
26912,That Girl Lay Lay;Sauce Walka,0.816,0.769,0.102,0.0,0.365,0.167,0.62,0.101168
27887,Thug Life,0.802,0.728,0.1805,0.0,0.387,0.208,0.6275,0.103718
4668,Cardi B;Bad Bunny;J Balvin,0.8155,0.7265,0.1,0.0,0.372,0.1285,0.6495,0.108577
4222,Bruno Mars;R3HAB,0.764,0.708,0.146,0.0,0.33,0.21,0.566,0.110145
13646,Jordan Sandhu;Zikar Sandhu,0.763,0.735,0.154,0.0,0.391,0.185,0.532,0.113754
8029,Dub Pistols;Horseman,0.743,0.749,0.0303,0.0907,0.399,0.194,0.544,0.123845
21353,Pete Rock;C.L. Smooth,0.755,0.683,0.141,0.0,0.342,0.166,0.668,0.125056
13249,Jhayco;Tainy,0.777,0.663,0.11,0.0,0.302,0.244,0.583,0.127777
25104,Sidhu Moose Wala;Tion Wayne,0.714,0.72,0.102,0.0,0.344,0.136,0.592,0.13265
3664,Black Rob,0.737,0.705,0.0579,0.0,0.321,0.24,0.557,0.134002


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  artist_songs.loc[:, 'distance_to_new_data'] = song_distances


Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,distance_to_new_data
20656,20656,5jGbxKNLOEWZoOmVpcYbJO,Wiz Khalifa,Hip Hop Vibes,Memory Lane,0,166773,True,0.789,0.71,...,0,0.353,0.37,0.0,0.155,0.826,95.178,4,dance,0.0
