In [61]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy as scp
from scipy.stats import normaltest, anderson, yeojohnson
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    GridSearchCV,
    LearningCurveDisplay,
)
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
    KBinsDiscretizer,
    PowerTransformer,
    MinMaxScaler,
)
from sklearn.pipeline import Pipeline

# from sklearn import svm
from sklearn.neighbors import NearestNeighbors

from scipy.spatial.distance import cosine, euclidean, cityblock, pdist

# Hex colour constants (named after Spotify's greens); they are not used by
# any of the cells visible in this part of the notebook.
spotify_dark_green, spotify_light_green = "#1db954", "#1ed760"

# Read the raw track table from disk and preview its first 30 rows.
spotify_df = pd.read_csv("../data/spotify_data.csv")
spotify_df.head(n=30)

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,...,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,...,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic
5,5,01MVOl9KtVTNfFiBU9I7dc,Tyrone Wells,Days I Will Remember,Days I Will Remember,58,214240,False,0.688,0.481,...,-8.807,1,0.105,0.289,0.0,0.189,0.666,98.017,4,acoustic
6,6,6Vc5wAMmXdKIAM7WUoEb7N,A Great Big World;Christina Aguilera,Is There Anybody Out There?,Say Something,74,229400,False,0.407,0.147,...,-8.822,1,0.0355,0.857,3e-06,0.0913,0.0765,141.284,3,acoustic
7,7,1EzrEOXmMH3G43AXT1y7pA,Jason Mraz,We Sing. We Dance. We Steal Things.,I'm Yours,80,242946,False,0.703,0.444,...,-9.331,1,0.0417,0.559,0.0,0.0973,0.712,150.96,4,acoustic
8,8,0IktbUcnAGrvD03AWnz3Q8,Jason Mraz;Colbie Caillat,We Sing. We Dance. We Steal Things.,Lucky,74,189613,False,0.625,0.414,...,-8.7,1,0.0369,0.294,0.0,0.151,0.669,130.088,4,acoustic
9,9,7k9GuJYLp2AzqokyEdwEw2,Ross Copperman,Hunger,Hunger,56,205594,False,0.442,0.632,...,-6.77,1,0.0295,0.426,0.00419,0.0735,0.196,78.899,4,acoustic


In [62]:
# Cleaning pass, in this order:
#   1. drop the leftover "Unnamed: 0" column,
#   2. drop every row that contains a missing value,
#   3. keep only the first row for each track_id.
spotify_df = (
    spotify_df.drop(columns=["Unnamed: 0"])
    .dropna()
    .drop_duplicates(subset=["track_id"])
)
spotify_df.shape

(89740, 20)

In [63]:
# Title of the track used as the similarity query in the following cells.
track = "To Begin Again"

# Min-max scale every numeric column so that all features share the same
# range before distances are computed. Non-numeric columns are excluded by
# select_dtypes.
scaler = MinMaxScaler()
numerical_cols = spotify_df.select_dtypes(include=np.number).columns
scaled_values = scaler.fit_transform(spotify_df[numerical_cols])

# Scaled feature table: one row per track, labelled by its track_id.
data_norm = pd.DataFrame(
    scaled_values,
    index=spotify_df["track_id"],
    columns=numerical_cols,
)
data_norm.head(20)

Unnamed: 0_level_0,popularity,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
5SuOikwiRyPMVoIQDJUgSV,0.73,0.042473,0.686294,0.461,0.090909,0.791392,0.0,0.148187,0.032329,1e-06,0.358,0.718593,0.361245,0.8
4qPNDBW1i3p13qLCt0Ki3A,0.55,0.026971,0.426396,0.166,0.090909,0.597377,1.0,0.079067,0.927711,6e-06,0.101,0.268342,0.318397,0.8
1iJBSr7s7jYXzM8EGcbK5b,0.57,0.038679,0.44467,0.359,0.0,0.736123,1.0,0.05772,0.210843,0.0,0.117,0.120603,0.313643,0.8
6lfxq3CG4xtTiEg7opyCyx,0.71,0.036978,0.270051,0.0596,0.0,0.573701,1.0,0.037617,0.908635,7.1e-05,0.132,0.143719,0.746758,0.6
5vjLSffimiIP26QG5WcN2K,0.82,0.036389,0.627411,0.443,0.181818,0.737103,1.0,0.054508,0.470884,0.0,0.0829,0.167839,0.492863,0.8
01MVOl9KtVTNfFiBU9I7dc,0.58,0.039332,0.698477,0.481,0.545455,0.753269,1.0,0.108808,0.290161,0.0,0.189,0.669347,0.402746,0.8
6Vc5wAMmXdKIAM7WUoEb7N,0.74,0.042231,0.413198,0.147,0.181818,0.752992,1.0,0.036788,0.860442,3e-06,0.0913,0.076884,0.580527,0.6
1EzrEOXmMH3G43AXT1y7pA,0.8,0.044822,0.713706,0.444,1.0,0.743577,1.0,0.043212,0.561245,0.0,0.0973,0.715578,0.620285,0.8
0IktbUcnAGrvD03AWnz3Q8,0.74,0.034622,0.634518,0.414,0.0,0.755249,1.0,0.038238,0.295181,0.0,0.151,0.672362,0.534523,0.8
7k9GuJYLp2AzqokyEdwEw2,0.56,0.037678,0.448731,0.632,0.090909,0.790948,1.0,0.03057,0.427711,0.00419,0.0735,0.196985,0.324191,0.8


In [64]:
# track_id: Series holding the IDs of all rows whose title equals `track`.
name_matches = spotify_df["track_name"] == track
track_id = spotify_df.loc[name_matches, "track_id"]

# Scaled feature vector of the FIRST matching track (several tracks can share
# a title). Raises IndexError when no row matches.
track_data = list(data_norm.loc[track_id.iloc[0]])

In [65]:
# Euclidean distance from the query track to every track in data_norm,
# computed in one vectorised call instead of a Python-level iterrows() loop
# (which also used `object` as a loop variable, shadowing the builtin).
feature_matrix = data_norm.to_numpy(dtype=float)
query_vector = np.asarray(track_data, dtype=float)

# Same layout as before: default integer index, then the "distance" and
# "track_id" columns in row order of data_norm.
result = pd.DataFrame(
    {
        "distance": np.linalg.norm(feature_matrix - query_vector, axis=1),
        "track_id": data_norm.index.to_numpy(),
    }
)

In [66]:
# Order all tracks from closest to farthest (kept in place so `result` stays
# sorted for any later cell that reads it).
result.sort_values(by="distance", ascending=True, inplace=True)

# Keep the 11 closest rows: the query track itself (distance 0) plus its 10
# nearest neighbours. This must be a POSITIONAL selection: after sorting, the
# index labels are shuffled, so the previous `result.loc[:11]` (a label slice)
# returned every row up to wherever label 11 happened to land instead of the
# first 11 rows. `.copy()` makes `top` an independent frame so later cells can
# modify it safely.
top = result.head(11).copy()
track_ids = top["track_id"].tolist()
top.head()

Unnamed: 0,distance,track_id
2,0.0,1iJBSr7s7jYXzM8EGcbK5b
78280,0.151903,05om7Ac9m7wKq1rHn4sHQh
89233,0.168872,5ELZpvTDGorz9BIE9zaBoZ
39832,0.186107,5EUsI3LIV042IV5ydksV9y
89128,0.189446,3lqLz5HJ7JFMcOKMNIH3Uo


In [67]:
# Re-key the metadata table by track_id so that tracks can be fetched by ID
# with .loc in the following cell.
spotify_df = spotify_df.set_index("track_id")
spotify_df.head(5)

Unnamed: 0_level_0,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [68]:
# Metadata of the closest tracks; requires spotify_df to be indexed by
# track_id (done in the preceding cell).
track_list = spotify_df.loc[track_ids]

# Index `top` by track_id so both frames can be joined on their index. The
# guard keeps the cell re-runnable: an unconditional in-place set_index fails
# on the second run because "track_id" is no longer a column.
if top.index.name != "track_id":
    top = top.set_index("track_id")

# Attach the distance column to the metadata.
track_list = pd.merge(track_list, top, left_index=True, right_index=True)

# sort_values returns a new frame; the result must be assigned, otherwise the
# sort is silently discarded (as it was before).
track_list = track_list.sort_values(by="distance")
track_list.head(10)

Unnamed: 0_level_0,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,distance
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic,0.0
05om7Ac9m7wKq1rHn4sHQh,Caissie Levy;John Riddle;Original Broadway Cas...,Frozen: The Broadway Musical (Original Broadwa...,"Monster - From ""Frozen: The Broadway Musical""",53,222653,False,0.41,0.457,0,-5.622,1,0.0381,0.214,0.0,0.108,0.147,91.927,4,show-tunes,0.151903
5ELZpvTDGorz9BIE9zaBoZ,Tenth Avenue North,Followers,I Have This Hope,52,204800,False,0.422,0.336,0,-7.199,1,0.0306,0.199,0.0,0.189,0.11,108.009,4,world-music,0.168872
5EUsI3LIV042IV5ydksV9y,Mad Season,Above (Deluxe Edition),River of Deceit,57,302906,False,0.455,0.29,0,-11.473,1,0.0288,0.0614,0.00225,0.101,0.169,88.523,4,grunge,0.186107
3lqLz5HJ7JFMcOKMNIH3Uo,Phil Wickham,Living Hope (The House Sessions),How Great Is Your Love,41,394626,False,0.441,0.423,0,-9.573,1,0.0291,0.179,0.0,0.0639,0.103,73.9,4,world-music,0.189446
2Zxili8AmCuqiomg9HzTgH,Shane & Shane,"Psalms, Vol. 2","Psalm 23 (Surely Goodness, Surely Mercy)",52,287080,False,0.509,0.276,0,-10.406,1,0.0301,0.203,0.0,0.15,0.138,109.707,4,world-music,0.189546
3Z9wiXDPADfxgovNa2I6ph,Mateo;Arinity;BIMINI,Free To Love,Free To Love,58,155458,False,0.428,0.493,0,-9.754,1,0.0351,0.259,2e-06,0.084,0.154,105.138,4,french,0.193426
524KQvc46Y1X6HRomMKoBJ,Hillsong Worship,Ultimate Worship Vol 1,Still,61,320506,False,0.419,0.473,0,-6.804,1,0.0282,0.34,0.0,0.111,0.184,70.022,4,world-music,0.202581
5viFjDGTnrApmUY5c8qkfw,Mena Massoud;Naomi Scott,Aladdin (Original Motion Picture Soundtrack),A Whole New World,64,175593,False,0.395,0.375,0,-7.948,1,0.034,0.225,0.0,0.145,0.131,121.04,4,show-tunes,0.208767
36JWXzz773ljAGmwwz4ISZ,Passion;Rachel Halbach,Live From Camp,Christ Our King - Live From Camp,52,365185,False,0.301,0.47,1,-7.896,1,0.0314,0.195,0.0,0.113,0.0724,74.912,4,world-music,0.218306


In [69]:
# Rebuild the metadata table from the CSV: drop rows with missing values,
# keep the first row per track_id, remove the leftover "Unnamed: 0" column and
# index the frame by track_id.
# NOTE(review): the same rows are kept, in the same order, as in the earlier
# cleaning cell only as long as both cells apply equivalent filters; the
# positional lookup further down depends on that. Confirm if either changes.
spotify_df = (
    pd.read_csv("../data/spotify_data.csv")
    .dropna()
    .drop_duplicates(subset=["track_id"])
    .drop(columns=["Unnamed: 0"])
    .set_index("track_id")
)

In [70]:
# Nearest-neighbour index over the scaled feature table. Each query returns
# the 10 closest rows by Euclidean distance, using a single worker.
nn = NearestNeighbors(
    metric="euclidean",
    n_neighbors=10,
    n_jobs=1,
)
nn.fit(X=data_norm)

In [71]:
# Hand-written query point, already expressed in the min-max-scaled feature
# space of data_norm. The values are positional and must follow the column
# order of data_norm, annotated below per the column order shown when
# data_norm was displayed.
# Wrapping the vector in a one-row DataFrame with data_norm's column labels
# makes that ordering explicit and matches the labelled input the model was
# fitted on, instead of passing an anonymous nested list.
query_point = pd.DataFrame(
    [
        [
            0.69,  # popularity
            0.044290,  # duration_ms
            0.490355,  # danceability
            0.3030,  # energy
            0.363636,  # key
            0.730130,  # loudness
            1.0,  # mode
            0.044456,  # speechiness
            0.696787,  # acousticness
            0.000000,  # instrumentalness
            0.1150,  # liveness
            0.139698,  # valence
            0.548157,  # tempo
            0.6,  # time_signature
        ]
    ],
    columns=data_norm.columns,
)

# kneighbors returns a (distances, positions) pair of 2-D arrays with one row
# per query point; positions are row numbers in the frame the model was
# fitted on (data_norm).
nn_query_res = nn.kneighbors(query_point)
print(nn_query_res)

(array([[8.38328001e-07, 1.08748729e-01, 1.69120420e-01, 1.86801073e-01,
        2.29495531e-01, 2.35124153e-01, 2.43371268e-01, 2.49602573e-01,
        2.52034323e-01, 2.63755848e-01]]), array([[   11,   118,  7128,  8370,   259, 54462, 67090, 55379, 31064,
        28863]]))




In [72]:
# The neighbour positions refer to rows of data_norm (the frame the model was
# fitted on). Translate them to track_ids through data_norm's own index and
# look the tracks up by label, so the result stays correct even if the row
# order of spotify_df ever differs from data_norm. The former positional
# `spotify_df.iloc[...]` would silently return the wrong tracks in that case.
# Requires spotify_df to be indexed by track_id.
neighbor_positions = nn_query_res[1][0]
neighbor_ids = data_norm.index[neighbor_positions]
spotify_df.loc[neighbor_ids]

Unnamed: 0_level_0,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
5ivF4eQBqJiVL5IAE9jRyl,Jason Mraz,Love Is a Four Letter Word,I Won't Give Up,69,240165,False,0.483,0.303,4,-10.058,1,0.0429,0.694,0.0,0.115,0.139,133.406,3,acoustic
05pKAafT85jeeNhZ6kq7HT,Jason Mraz,I Won't Give Up,I Won't Give Up,66,240165,False,0.585,0.303,4,-10.058,1,0.0398,0.694,0.0,0.115,0.142,136.703,3,acoustic
711WfDztCZpnmJg7Uvwod3,The Avett Brothers,True Sadness,No Hard Feelings,57,314680,False,0.485,0.319,5,-8.525,1,0.031,0.686,2.8e-05,0.117,0.195,142.119,3,bluegrass
7BlLmanL024pf0doBknUi4,Gabriel Brito,Eu Não Sou Mais Órfão (Acústico),Eu Não Sou Mais Órfão - Acústico,54,208810,False,0.535,0.266,4,-8.483,1,0.0275,0.768,0.0,0.109,0.158,141.761,3,brazil
2Qw1KFrJ1q6qHXxWGh51kC,Boyce Avenue,"Cover Sessions, Vol. 3",A Thousand Years,56,263710,False,0.41,0.221,5,-11.673,1,0.0315,0.768,0.0,0.106,0.228,139.529,3,acoustic
1qCQTy0fTXerET4x8VHyr9,Louis Armstrong,What A Wonderful World,What A Wonderful World,73,137520,False,0.399,0.258,5,-16.028,1,0.033,0.792,2e-06,0.128,0.192,108.174,3,jazz
5nHwX7e5XHnie22nuTFgda,KK,Kites,Dil Kyun Yeh Mera,62,333200,False,0.486,0.382,6,-6.183,1,0.0286,0.613,8.6e-05,0.149,0.164,125.918,3,pop-film
6TBJkXHPhu3EsMk1bshwuI,LeeHi,4 ONLY,ONLY,77,240906,False,0.536,0.296,5,-7.451,1,0.0346,0.892,0.0,0.0873,0.151,122.907,3,k-pop
1fEGtTZjrjJW8eUeewnNJR,Iron & Wine,The Shepherd's Dog,"Flightless Bird, American Mouth",69,241917,False,0.369,0.403,3,-7.996,1,0.0259,0.827,0.0541,0.106,0.177,154.899,3,folk
4VPXFi4vFTtS9wHe6oMQaT,Zara Larsson,So Good,I Can't Fall in Love Without You,66,180640,True,0.485,0.332,4,-9.244,1,0.0338,0.784,0.000122,0.146,0.273,130.813,4,electro
