# **Import Libraries**

In [1]:
import os
import numpy as np
import pandas as pd
from dotenv import load_dotenv
import seaborn as sns
import plotly.express as px 
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist

import warnings
warnings.filterwarnings("ignore")

# **Read Data**

In [2]:
data = pd.read_csv("data/data.csv")
genre_data = pd.read_csv('data/data_by_genres.csv')
year_data = pd.read_csv('data/data_by_year.csv')

# **Clustering Genres with K-Means**

Here, the simple K-means clustering algorithm is used to divide the genres in this dataset into ten clusters based on the numerical audio features of each genres.

In [3]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

cluster_pipeline = Pipeline([('scaler', StandardScaler()), ('kmeans', KMeans(n_clusters=10))])
X = genre_data.select_dtypes(np.number)
cluster_pipeline.fit(X)
genre_data['cluster'] = cluster_pipeline.predict(X)

# **Clustering Songs with K-Means**

In [4]:
song_cluster_pipeline = Pipeline([('scaler', StandardScaler()), 
                                  ('kmeans', KMeans(n_clusters=20, 
                                   verbose=False))
                                 ], verbose=False)

X = data.select_dtypes(np.number)
number_cols = list(X.columns)
song_cluster_pipeline.fit(X)
song_cluster_labels = song_cluster_pipeline.predict(X)
data['cluster_label'] = song_cluster_labels

# **Build Recommender System**

* Based on the analysis and visualizations, it’s clear that similar genres tend to have data points that are located close to each other while similar types of songs are also clustered together.
* This observation makes perfect sense. Similar genres will sound similar and will come from similar time periods while the same can be said for songs within those genres. We can use this idea to build a recommendation system by taking the data points of the songs a user has listened to and recommending songs corresponding to nearby data points.
* [Spotipy](https://spotipy.readthedocs.io/en/2.16.1/) is a Python client for the Spotify Web API that makes it easy for developers to fetch data and query Spotify’s catalog for songs. You have to install using `pip install spotipy`
* After installing Spotipy, you will need to create an app on the [Spotify Developer’s page](https://developer.spotify.com/) and save your Client ID and secret key.

In [6]:
# !pip install spotipy

In [7]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from collections import defaultdict

# Load environment variables from .env file
load_dotenv()

# Retrieve the credentials from environment variables
client_id = os.getenv("SPOTIPY_CLIENT_ID")
client_secret = os.getenv("SPOTIPY_CLIENT_SECRET")

sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=client_id,
                                                           client_secret=client_secret))

def find_song(name, year):
    song_data = defaultdict()
    results = sp.search(q= 'track: {} year: {}'.format(name,year), limit=1)
    if results['tracks']['items'] == []:
        return None

    results = results['tracks']['items'][0]
    track_id = results['id']
    audio_features = sp.audio_features(track_id)[0]

    song_data['name'] = [name]
    song_data['year'] = [year]
    song_data['explicit'] = [int(results['explicit'])]
    song_data['duration_ms'] = [results['duration_ms']]
    song_data['popularity'] = [results['popularity']]

    for key, value in audio_features.items():
        song_data[key] = value

    return pd.DataFrame(song_data)

In [8]:
from collections import defaultdict
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
import difflib

number_cols = ['valence', 'year', 'acousticness', 'danceability', 'duration_ms', 'energy', 'explicit',
 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'popularity', 'speechiness', 'tempo']


def get_song_data(song, spotify_data):
    
    try:
        song_data = spotify_data[(spotify_data['name'] == song['name']) 
                                & (spotify_data['year'] == song['year'])].iloc[0]
        return song_data
    
    except IndexError:
        return find_song(song['name'], song['year'])
        

import numpy as np

def get_mean_vector(song_list, spotify_data):
    song_vectors = []
    
    for song in song_list:
        song_data = get_song_data(song, spotify_data)
        if song_data is None:
            print('Warning: {} does not exist in Spotify or in the database'.format(song['name']))
            continue
        
        # Extract numerical features and ensure consistent order
        song_vector = []
        for col in number_cols:
            if col in song_data and isinstance(song_data[col], (int, float)):
                song_vector.append(song_data[col])
            else:
                # Handle missing feature by appending NaN
                song_vector.append(np.nan)
        
        print(f'Song vector for {song["name"]}: {song_vector}')  # Debug print
        song_vectors.append(song_vector)
    
    # Check if any song vectors were obtained
    if not song_vectors:
        return None
    
    song_matrix = np.array(song_vectors)
    return np.nanmean(song_matrix, axis=0)




def flatten_dict_list(dict_list):
    
    flattened_dict = defaultdict()
    for key in dict_list[0].keys():
        flattened_dict[key] = []
    
    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)
            
    return flattened_dict


def recommend_songs( song_list, spotify_data, n_songs=10):
    
    metadata_cols = ['name', 'year', 'artists']
    song_dict = flatten_dict_list(song_list)
    
    song_center = get_mean_vector(song_list, spotify_data)
    scaler = song_cluster_pipeline.steps[0][1]
    scaled_data = scaler.transform(spotify_data[number_cols])
    scaled_song_center = scaler.transform(song_center.reshape(1, -1))
    distances = cdist(scaled_song_center, scaled_data, 'cosine')
    index = list(np.argsort(distances)[:, :n_songs][0])
    
    rec_songs = spotify_data.iloc[index]
    rec_songs = rec_songs[~rec_songs['name'].isin(song_dict['name'])]
    return rec_songs[metadata_cols].to_dict(orient='records')

In [9]:
recommend_songs([{'name': 'Come As You Are', 'year':1991},
                {'name': 'Smells Like Teen Spirit', 'year': 1991},
                {'name': 'Lithium', 'year': 1992},
                {'name': 'All Apologies', 'year': 1993},
                {'name': 'Stay Away', 'year': 1993}],  data)

Song vector for Come As You Are: [0.539, nan, 0.00016, 0.5, nan, 0.8240000000000001, nan, 0.00161, nan, 0.0916, -5.846, nan, nan, 0.0388, 120.125]
Song vector for Smells Like Teen Spirit: [0.72, nan, 2.55e-05, 0.502, nan, 0.912, nan, 0.000173, nan, 0.106, -4.556, nan, nan, 0.0564, 116.761]
Song vector for Lithium: [0.388, nan, 0.000203, 0.688, nan, 0.599, nan, 0.0, nan, 0.0782, -9.176, nan, nan, 0.038, 123.265]
Song vector for All Apologies: [0.359, nan, 0.0793, 0.446, nan, 0.632, nan, 0.000266, nan, 0.0881, -12.197, nan, nan, 0.034, 113.176]
Song vector for Stay Away: [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]


[{'name': 'Piano Concerto No. 3 in D Minor, Op. 30: III. Finale. Alla breve',
  'year': 1921,
  'artists': "['Sergei Rachmaninoff', 'James Levine', 'Berliner Philharmoniker']"},
 {'name': 'Shipwreck', 'year': 1964, 'artists': "['Laura Olsher']"},
 {'name': "You Can't Love 'Em All",
  'year': 1964,
  'artists': "['Solomon Burke']"},
 {'name': 'Dixie Breakdown - Recorded at the Mecca, Los Angeles',
  'year': 1964,
  'artists': "['The Dillards']"},
 {'name': 'Charu Theme - Instrumental',
  'year': 1964,
  'artists': "['Satyajit Ray']"},
 {'name': "Mama Don't Whip Little Buford",
  'year': 1964,
  'artists': "['Homer & Jethro']"},
 {'name': 'Body And Soul', 'year': 1964, 'artists': "['Freddie Hubbard']"},
 {'name': 'I Smiled Yesterday', 'year': 1964, 'artists': "['Dionne Warwick']"},
 {'name': 'Home Boys Home', 'year': 1964, 'artists': "['The Dubliners']"},
 {'name': 'The Warmth of the Sun (Stereo)',
  'year': 1964,
  'artists': "['The Beach Boys']"}]

* This last cell will gives you a recommendation list of songs like this,


```
[{'name': 'Life is a Highway - From "Cars"',
  'year': 2009,
  'artists': "['Rascal Flatts']"},
 {'name': 'Of Wolf And Man', 'year': 1991, 'artists': "['Metallica']"},
 {'name': 'Somebody Like You', 'year': 2002, 'artists': "['Keith Urban']"},
 {'name': 'Kayleigh', 'year': 1992, 'artists': "['Marillion']"},
 {'name': 'Little Secrets', 'year': 2009, 'artists': "['Passion Pit']"},
 {'name': 'No Excuses', 'year': 1994, 'artists': "['Alice In Chains']"},
 {'name': 'Corazón Mágico', 'year': 1995, 'artists': "['Los Fugitivos']"},
 {'name': 'If Today Was Your Last Day',
  'year': 2008,
  'artists': "['Nickelback']"},
 {'name': "Let's Get Rocked", 'year': 1992, 'artists': "['Def Leppard']"},
 {'name': "Breakfast At Tiffany's",
  'year': 1995,
  'artists': "['Deep Blue Something']"}]
```



* You can change the given songs list as per your choice.