# PROJECT SECTIONS
- Dataset & Preprocessing
  - Fetching & Comparing Artist Genres w/ Spotify API
  - Moving/Dropping Columns
  - Scale Features
- Principal Component Analysis (PCA)
  - Apply on Features
  - Researching a More Optimal Feature Set
- Evaluation Metrics
  - Elbow-Method (Experimenting w/ k value)
  - Silhouette Score ...
- KMeans
  - Fitting the Model
  - Light Feature Engineering (Cluster #, Dist to Centroid)
  - Cluster Subset & Filtering Output

In [None]:
# importing dependencies
from datasets import load_dataset
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import requests

  from .autonotebook import tqdm as notebook_tqdm


## Dataset & Preprocessing

In [None]:
# loading from huggingface
# dataset_hf = load_dataset("maharshipandya/spotify-tracks-dataset")
# dataset = pd.DataFrame(dataset_hf['train'])   #convert dataset to pandas dataframe

# Read the track table from the local CSV export.
dataset = pd.read_csv('spotify-tracks-dataset.csv')

# Keep an untouched copy so the original attributes stay available for reference.
data = dataset.copy()

# Distinct genre labels present in the dataset.
genres = data['track_genre'].unique()

# Break each genre label into a list of keywords so genres can be searched by
# keyword. str.split('-') already yields a one-element list for labels without
# a hyphen, so no special-casing is required.
genres_searchable = [label.split('-') for label in genres]
genres_searchable
        


[['acoustic'],
 ['afrobeat'],
 ['alt', 'rock'],
 ['alternative'],
 ['ambient'],
 ['anime'],
 ['black', 'metal'],
 ['bluegrass'],
 ['blues'],
 ['brazil'],
 ['breakbeat'],
 ['british'],
 ['cantopop'],
 ['chicago', 'house'],
 ['children'],
 ['chill'],
 ['classical'],
 ['club'],
 ['comedy'],
 ['country'],
 ['dance'],
 ['dancehall'],
 ['death', 'metal'],
 ['deep', 'house'],
 ['detroit', 'techno'],
 ['disco'],
 ['disney'],
 ['drum', 'and', 'bass'],
 ['dub'],
 ['dubstep'],
 ['edm'],
 ['electro'],
 ['electronic'],
 ['emo'],
 ['folk'],
 ['forro'],
 ['french'],
 ['funk'],
 ['garage'],
 ['german'],
 ['gospel'],
 ['goth'],
 ['grindcore'],
 ['groove'],
 ['grunge'],
 ['guitar'],
 ['happy'],
 ['hard', 'rock'],
 ['hardcore'],
 ['hardstyle'],
 ['heavy', 'metal'],
 ['hip', 'hop'],
 ['honky', 'tonk'],
 ['house'],
 ['idm'],
 ['indian'],
 ['indie', 'pop'],
 ['indie'],
 ['industrial'],
 ['iranian'],
 ['j', 'dance'],
 ['j', 'idol'],
 ['j', 'pop'],
 ['j', 'rock'],
 ['jazz'],
 ['k', 'pop'],
 ['kids'],
 ['latin

#### Authenticating to Spotify

In [None]:
# load environment variables from .env file
from dotenv import load_dotenv
import os
load_dotenv()

# auth spotify api
client_id = os.getenv('SPOTIFY_CLIENT_ID')
client_secret = os.getenv('SPOTIFY_CLIENT_SECRET')
if not client_id or not client_secret:
    # Fail early with a clear message instead of sending "None" as credentials.
    raise RuntimeError(
        "SPOTIFY_CLIENT_ID and SPOTIFY_CLIENT_SECRET must be set (e.g. in .env)"
    )

# Request an access token with the same form fields the previous curl command
# sent. Going through `requests` keeps the client secret out of a shell command
# line and keeps this cell valid Python outside of IPython.
auth_response = requests.post(
    'https://accounts.spotify.com/api/token',
    data={
        'grant_type': 'client_credentials',
        'client_id': client_id,
        'client_secret': client_secret,
    },
    timeout=30,
)
# Surface bad credentials / API errors here rather than as a parse error below.
auth_response.raise_for_status()

# saving token to JSON file (raw response body, as before)
token_json = auth_response.text
with open('token.json', 'w') as file:
    file.write(token_json)

# loading the token into python variables; read the fields by key instead of
# by their position in the raw text so a change in key order cannot break it
token_payload = auth_response.json()
token = token_payload['access_token']
token_type = token_payload['token_type']

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   293  100   174  100   119    564    385 --:--:-- --:--:-- --:--:--   951


#### Define Track/Song (via API)

In [None]:
# Sample track used to exercise the audio-features endpoint.
track_id = '7xcqbjV2NfxlnJzqdRuO7E'
aud_features_url = "https://api.spotify.com/v1/audio-features/"

# Authorised GET for this track's audio features; keep the decoded JSON body.
aud_features_response = requests.get(
    f"{aud_features_url}{track_id}",
    headers={'Authorization': f'{token_type} {token}'},
)
aud_features_json = aud_features_response.json()

In [None]:
# test get track
track_url = "https://api.spotify.com/v1/tracks/"

# get track json data
track_response = requests.get(
    track_url + track_id,
    headers={'Authorization': f'{token_type} {token}'},
)
# An error payload (e.g. expired token, unknown id) has no 'artists'/'name'
# keys; raise the HTTP error here instead of an opaque KeyError further down.
track_response.raise_for_status()
track_json = track_response.json()

# extract track artists and name
track_artists_list = [artist['name'] for artist in track_json['artists']]
track_artists_str = ', '.join(track_artists_list)
track_name = track_json['name']
# Only the first listed artist's id is kept for the genre lookup.
artist_id = track_json['artists'][0]['id']
track_name, track_artists_str

('Odd Look', 'Kavinsky, The Weeknd')

#### Fetch Track Artist's Genres

In [None]:
# TODO: See if genres_searchable is preventing matches
# TODO: Ensure multi-artist tracks include all genres

artist_url = "https://api.spotify.com/v1/artists/"

# get artist json data (for the single artist id selected from the track)
artist_response = requests.get(
    artist_url + artist_id,
    headers={'Authorization': f'{token_type} {token}'},
)
# An error payload has no 'genres' key; raise the HTTP error instead.
artist_response.raise_for_status()
artist_json = artist_response.json()
artist_genres = artist_json['genres']

# split each artist genre on whitespace and keep the unique words
artist_genres_words = list(
    {word for genre in artist_genres for word in genre.split()}
)
print(f"artist_genres_words: {artist_genres_words}") # TODO: fix

# print genres that are in the dataset
# NOTE: `genres` holds the dataset's raw labels, some of which are hyphenated
# (e.g. 'hip-hop'), so single words can only match the un-hyphenated labels.
genre_response = 'GENRES IN DATASET & ASSOCIATED W/ TRACK ARTIST: '
genre_response += ''.join(
    '\n' + word for word in artist_genres_words if word in genres
)
print(genre_response)

artist_genres_words: ['house', 'filter']
GENRES IN DATASET & ASSOCIATED W/ TRACK ARTIST: 
house


#### Drop & Move Columns

In [None]:
# unnecessary columns TODO: Check Train Dataset for Columns or Features
cols_to_drop = [
    'Unnamed: 0', 'album_name',
    'loudness', 'explicit', 'time_signature',
    'instrumentalness', 'duration_ms', 'mode', 'valence',
]

# Remove the unused columns in place.
dataset.drop(columns=cols_to_drop, inplace=True)

# De-duplicate in place, first on track_id and then on track_name,
# keeping the first occurrence each time.
for dedupe_key in ('track_id', 'track_name'):
    dataset.drop_duplicates(subset=dedupe_key, keep='first', inplace=True)

# Target column position for each identifying attribute.
position_map = {
    "track_name": 0,
    "artists": 1,
    "track_genre": 2,
    "track_id": 3
}

# Move each identifying column to its target position: pop() removes the
# column and returns its values, insert() puts it back at the given index.
for attribute, position in position_map.items():
    dataset.insert(position, attribute, dataset.pop(attribute))


# print first 5 rows
dataset.head()

Unnamed: 0,track_name,artists,track_genre,track_id,popularity,danceability,energy,key,speechiness,acousticness,liveness,tempo
0,Comedy,Gen Hoshino,acoustic,5SuOikwiRyPMVoIQDJUgSV,73,0.676,0.461,1,0.143,0.0322,0.358,87.917
1,Ghost - Acoustic,Ben Woodward,acoustic,4qPNDBW1i3p13qLCt0Ki3A,55,0.42,0.166,1,0.0763,0.924,0.101,77.489
2,To Begin Again,Ingrid Michaelson;ZAYN,acoustic,1iJBSr7s7jYXzM8EGcbK5b,57,0.438,0.359,0,0.0557,0.21,0.117,76.332
3,Can't Help Falling In Love,Kina Grannis,acoustic,6lfxq3CG4xtTiEg7opyCyx,71,0.266,0.0596,0,0.0363,0.905,0.132,181.74
4,Hold On,Chord Overstreet,acoustic,5vjLSffimiIP26QG5WcN2K,82,0.618,0.443,2,0.0526,0.469,0.0829,119.949


## Principal Component Analysis (PCA)

## Evaluation Metrics


## KMeans

## Sort Dataset by Cluster & Distance-to-Cluster


In [None]:
# add a column to the dataset for the cluster number and distance from the centroid
dataset['cluster'] = pred_y
# Use transform(), not fit_transform(): fit_transform() re-runs the KMeans fit,
# which can replace the model's centroids/labels so that the distances no
# longer correspond to the cluster numbers stored from pred_y.
# NOTE(review): assumes `kmeans` was already fitted on `data_pca` when `pred_y`
# was produced (that cell is outside this view) - confirm.
# transform() gives the distance to every centroid; the row-wise minimum is
# the distance to the nearest one.
dataset['distance'] = kmeans.transform(data_pca).min(axis=1)

# sort the dataset by cluster and distance
dataset = dataset.sort_values(by=['cluster', 'distance'])

# print the first 10 rows
dataset.head(10)


# Song Recommendations

In [None]:
# search for song's full data using the track_id
song_matches = dataset[dataset['track_id'] == track_id]

# Without this guard an unknown track_id fails with a lookup on a None label;
# raise the same exception type with a message that names the real problem.
if song_matches.empty:
    raise KeyError(f"track_id {track_id!r} is not present in the dataset")

# get the first occurrence
full_given_song = song_matches.iloc[0]

# save the numerical data in a separate variable: everything after the first
# four columns, selected by position explicitly with .iloc
given_song = full_given_song.iloc[4:].astype(float)

# print the full given song data
print(f"DATA FROM: {track_name} by {track_artists_str}")
given_song

## Initial Cluster Points & Filtering

In [None]:
# find the closest centroid

# get the index of the given song's cluster (stored as a float in given_song)
closest_centroid_idx = int(given_song['cluster'])

# get the coordinates of the closest centroid
closest_centroid = kmeans.cluster_centers_[closest_centroid_idx]

# Find data points belonging to the closest cluster.
# Filter on the 'cluster' column rather than masking with kmeans.labels_:
# `dataset` has been re-sorted by cluster/distance, so a positional mask built
# from labels_ (which is in the model's input order) would pick the wrong rows.
cluster_points = dataset[dataset['cluster'] == closest_centroid_idx]

# Remove the given song from the closest cluster if it exists
cluster_points = cluster_points[cluster_points['track_name'] != full_given_song['track_name']]

# Only consider songs of the same genre
# TODO: create clusters of genres that are similar and only from the that cluster
cluster_points = cluster_points[cluster_points['track_genre'] == full_given_song['track_genre']]

# print the amount of songs filtered
print(cluster_points.shape)

cluster_points.head(10)