In [8]:
import pandas as pd
import numpy as np

def _three_decimals(value):
    """Format a float with exactly three digits after the decimal point."""
    return '%.3f' % value


# Render every float in DataFrame/Series output with three decimals.
pd.options.display.float_format = _three_decimals

In [9]:
# Read the track dataset; the first CSV column holds the row labels, so it is
# used as the index instead of being kept as a data column.
csv_path = "data/spotify_data.csv"
df = pd.read_csv(filepath_or_buffer=csv_path, index_col=0)

# Preview the first ten rows.
df.head(n=10)

Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Jason Mraz,I Won't Give Up,53QF56cjZA9RTuuMZDrSA6,68,2012,acoustic,0.483,0.303,4,-10.058,1,0.043,0.694,0.0,0.115,0.139,133.406,240166,3
1,Jason Mraz,93 Million Miles,1s8tP3jP4GZcyHDsjvw218,50,2012,acoustic,0.572,0.454,3,-10.286,1,0.026,0.477,0.0,0.097,0.515,140.182,216387,4
2,Joshua Hyslop,Do Not Let Me Go,7BRCa8MPiyuvr2VU3O9W0F,57,2012,acoustic,0.409,0.234,3,-13.711,1,0.032,0.338,0.0,0.089,0.145,139.832,158960,4
3,Boyce Avenue,Fast Car,63wsZUhUZLlh1OsyrZq7sz,58,2012,acoustic,0.392,0.251,10,-9.845,1,0.036,0.807,0.0,0.08,0.508,204.961,304293,4
4,Andrew Belle,Sky's Still Blue,6nXIYClvJAfi6ujLiKqEq8,54,2012,acoustic,0.43,0.791,6,-5.419,0,0.03,0.073,0.019,0.11,0.217,171.864,244320,4
5,Chris Smither,What They Say,24NvptbNKGs6sPy1Vh1O0v,48,2012,acoustic,0.566,0.57,2,-6.42,1,0.033,0.688,0.0,0.094,0.96,83.403,166240,4
6,Matt Wertz,Walking in a Winter Wonderland,0BP7hSvLAG3URGrEvNNbGM,48,2012,acoustic,0.575,0.606,9,-8.197,1,0.03,0.012,0.0,0.068,0.364,121.083,152307,4
7,Green River Ordinance,Dancing Shoes,3Y6BuzQCg9p4yH347Nn8OW,45,2012,acoustic,0.586,0.423,7,-7.459,1,0.026,0.252,0.0,0.098,0.318,138.133,232373,4
8,Jason Mraz,Living in the Moment,3ce7k1L4EkZppZPz1EJWTS,44,2012,acoustic,0.65,0.628,7,-7.16,1,0.023,0.048,0.0,0.119,0.7,84.141,235080,4
9,Boyce Avenue,Heaven,2EKxmYmUdAVXlaHCnnW13o,58,2012,acoustic,0.619,0.28,8,-10.238,0,0.032,0.73,0.0,0.103,0.292,129.948,250063,4


## Exploratory Data Analysis

In [20]:
# Check the dtype pandas inferred for each column.
df.dtypes

artist_name          object
track_name           object
track_id             object
popularity            int64
year                  int64
genre                object
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
duration_ms           int64
time_signature        int64
dtype: object

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,popularity,year,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
count,1159764.0,1159764.0,1159764.0,1159764.0,1159764.0,1159764.0,1159764.0,1159764.0,1159764.0,1159764.0,1159764.0,1159764.0,1159764.0,1159764.0,1159764.0
mean,18.383,2011.955,0.537,0.64,5.288,-8.981,0.635,0.093,0.322,0.252,0.223,0.456,121.377,249561.781,3.886
std,15.886,6.804,0.184,0.271,3.555,5.682,0.482,0.127,0.355,0.365,0.201,0.269,29.78,149426.165,0.468
min,0.0,2000.0,0.0,0.0,0.0,-58.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2073.0,0.0
25%,5.0,2006.0,0.413,0.454,2.0,-10.829,0.0,0.037,0.006,0.0,0.098,0.226,98.797,181091.0,4.0
50%,15.0,2012.0,0.55,0.694,5.0,-7.45,1.0,0.051,0.147,0.002,0.134,0.438,121.931,225744.0,4.0
75%,29.0,2018.0,0.677,0.873,8.0,-5.276,1.0,0.089,0.64,0.614,0.292,0.674,139.903,286913.5,4.0
max,100.0,2023.0,0.993,1.0,11.0,6.172,1.0,0.971,0.996,1.0,1.0,1.0,249.993,6000495.0,5.0


In [11]:
# Flag, per column, whether it contains at least one missing value.
df.isna().any()

artist_name          True
track_name           True
track_id            False
popularity          False
year                False
genre               False
danceability        False
energy              False
key                 False
loudness            False
mode                False
speechiness         False
acousticness        False
instrumentalness    False
liveness            False
valence             False
tempo               False
duration_ms         False
time_signature      False
dtype: bool

In [17]:
# Count rows with a missing artist name. Series.sum() is vectorised, whereas the
# builtin sum() would iterate over every element in Python; int() keeps the
# displayed result a plain integer.
int(df['artist_name'].isna().sum())

15

In [18]:
# Count rows with a missing track name. Series.sum() is vectorised, whereas the
# builtin sum() would iterate over every element in Python; int() keeps the
# displayed result a plain integer.
int(df['track_name'].isna().sum())

1

## Cosine Similarity

In [21]:
# Number of tracks per genre, most frequent genre first.
df['genre'].value_counts()

genre
black-metal       21852
gospel            21621
ambient           21389
acoustic          21097
alt-rock          20918
                  ...  
chicago-house      5170
dubstep            4774
detroit-techno     3920
rock               3319
songwriter          589
Name: count, Length: 82, dtype: int64

In [46]:
# Selecting only the features that range from 0.0 to 1.0.
# NOTE(review): `speechiness` also spans 0-1 in df.describe() but is not part of
# the default feature list below -- confirm whether leaving it out is intentional.
def cosine_similarity(trackid_a, trackid_b, data=None, columns=None):
    """Return the cosine similarity between the feature vectors of two tracks.

    Parameters
    ----------
    trackid_a, trackid_b
        Values of the ``track_id`` column identifying the tracks to compare.
    data : pandas.DataFrame, optional
        Frame holding a ``track_id`` column plus the feature columns.
        Defaults to the notebook-level ``df``.
    columns : list of str, optional
        Feature columns that make up each vector. Defaults to danceability,
        energy, acousticness, instrumentalness, liveness and valence.

    Returns
    -------
    float
        ``dot(a, b) / (|a| * |b|)``. NaN is returned when either vector is all
        zeros, because the similarity is undefined in that case.

    Raises
    ------
    ValueError
        If a track id matches no row, or more than one row, of ``data``.
    """
    if data is None:
        data = df  # fall back to the frame loaded earlier in the notebook
    if columns is None:
        columns = ['danceability', 'energy', 'acousticness', 'instrumentalness', 'liveness', 'valence']

    def _feature_vector(track_id):
        # Exactly one matching row is required; zero or several rows would make
        # the comparison meaningless, so fail with a readable message.
        rows = data.loc[data['track_id'] == track_id, columns]
        if len(rows) != 1:
            raise ValueError(
                f"expected exactly one row with track_id {track_id!r}, found {len(rows)}"
            )
        return rows.to_numpy(dtype=float)[0]

    vector_a = _feature_vector(trackid_a)
    vector_b = _feature_vector(trackid_b)

    magnitude_product = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
    if magnitude_product == 0:
        # A zero vector has no direction; return NaN explicitly instead of
        # triggering a 0/0 RuntimeWarning.
        return np.float64('nan')

    return np.dot(vector_a, vector_b) / magnitude_product
    
# Draw both tracks in a single call so the pair always comes from two distinct
# rows. The fixed seed keeps this cell reproducible under Restart & Run All;
# set it to None to get a fresh random pair on every run.
SAMPLE_SEED = 42
a, b = df['track_id'].sample(n=2, random_state=SAMPLE_SEED).tolist()

# Rows of the two sampled tracks, shown below for side-by-side inspection.
visual = df[df['track_id'].isin([a, b])]
print(cosine_similarity(a, b))
visual


0.6319965136270344


Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
618894,Guantas,Carna Guantas (Medley Carnaval) - ao vivo Espa...,2dxcLsndMWHhLsQ7vZ2IIz,0,2023,ska,0.674,0.532,10,-11.109,1,0.05,0.558,0.0,0.464,0.972,143.364,298665,4
655980,The Damage Manual,Stateless (Laswell Mix),6X2IuHZSec7dE97OlujB4j,0,2000,industrial,0.473,0.55,10,-6.625,1,0.032,0.0,0.137,0.121,0.037,145.135,393427,4
