<a href="https://colab.research.google.com/github/jarodchristiansen/Machine-Learning-Deep-Learning/blob/master/Spotify_Recommendation_Algo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup Spotify API

In [1]:
!pip install spotipy

Collecting spotipy
  Downloading spotipy-2.24.0-py3-none-any.whl.metadata (4.9 kB)
Collecting redis>=3.5.3 (from spotipy)
  Downloading redis-5.0.8-py3-none-any.whl.metadata (9.2 kB)
Downloading spotipy-2.24.0-py3-none-any.whl (30 kB)
Downloading redis-5.0.8-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.6/255.6 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: redis, spotipy
Successfully installed redis-5.0.8 spotipy-2.24.0


In [15]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

from google.colab import userdata
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [14]:
# Set up Spotify API credentials
client_id = userdata.get('spotify_id')
client_secret = userdata.get('spotify_secret')

# Authenticate using Client Credentials Flow
auth_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(auth_manager=auth_manager)


## Methods to gather initial tracks for seed dataset

### External methods to enhance specifity

In [14]:
def search_tracks_by_genre(genre, limit=50):
    """
    Fetches track IDs by searching for a specific genre.

    Args:
    - genre (str): Genre keyword to search for.
    - limit (int): Maximum number of tracks to fetch.

    Returns:
    - track_ids (list): List of track IDs.
    """
    track_ids = []

    # Search for tracks by genre
    results = sp.search(q=f'genre:{genre}', type='track', limit=limit)
    tracks = results['tracks']['items']

    # Collect track IDs
    for track in tracks:
        track_ids.append(track['id'])

    return track_ids

# Example usage
genre = 'pop'  # You can replace this with any genre you prefer
pop_tracks = search_tracks_by_genre(genre, limit=50)
print(pop_tracks)


['0WbMK4wrZ1wFSty9F7FCgu', '6dOtVTDdiauQNBQEDOtlAB', '2plbrEY59IikOBgBGLjaoe', '5G2f63n7IPVPPjfNIGih7Q', '5N3hjp1WNayUPZrA8kJmJP', '2qSkIjg1o9h3YT9RAgYN75', '4xdBrk0nFZaP54vvZj0yx7', '1UHS8Rf6h5Ar3CDWRd3wjF', '1k2pQc5i348DCHwbn5KTdc', '7221xIgOnuakPdLqT0F3nP', '7FOgcfdz9Nx5V9lCNXdBYv', '102YUQbYmwdBXS7jwamI90', '0mflMxspEfB0VbI1kyLiAv', '3WOhcATHxK2SLNeP5W3v1v', '2FQrifJ1N335Ljm3TjTVVf', '7tI8dRuH2Yc6RuoTjxo4dU', '21B4gaTWnTkuSh77iWEXdS', '19RybK6XDbAVpcdxSbZL1o', '0UYnhUfnUj5adChuAXvLUB', '3WSOUb3U7tqURbBSgZTrZX', '3QaPy1KgI7nu9FJEQUgn6h', '629DixmZGHc7ILtEntuiWE', '2QjOHCTQ1Jl3zawyYOpxh6', '5fZJQrFKWQLb7FpJXZ1g7K', '51eSHglvG1RJXtL3qI5trr', '3iPIDAFybaoyqX7hvAfWkl', '5oIVNm56t6OIf9ZjdEG3ud', '3Vr3zh0r7ALn8VLqCiRR10', '3xkHsmpQCBMytMJNiDf3Ii', '1BxfuPKGuaTgP7aM0Bbdwr', '5IZXB5IKAD2qlvTPJYDCFB', '4w2GLmK2wnioVnb5CPQeex', '53IRnAWx13PYmoVYtemUBS', '3qhlB30KknSejmIvZZLjOD', '0XkZmBCCcdMY0EPY8ij6Gb', '1bjeWoagtHmUKputLVyDxQ', '0AjmK0Eai4zGrLaJwPvrDp', '7BRD7x5pt8Lqa1eGYC4dzj', '7iQMm50NNw

In [8]:
def get_playlist_tracks(playlist_id, limit=100):
    """
    Fetches track IDs from a specific playlist.

    Args:
    - playlist_id (str): The Spotify playlist ID.
    - limit (int): Number of tracks to fetch (max 100 per request).

    Returns:
    - track_ids (list): List of track IDs from the playlist.
    """
    track_ids = []
    results = sp.playlist_tracks(playlist_id, limit=limit)

    # Collect track IDs from the playlist
    for item in results['items']:
        track = item['track']
        track_ids.append(track['id'])

    return track_ids

# Example usage
playlist_id = '37i9dQZEVXbMDoHDwVN2tF'  # Spotify Top 50 Global playlist
top_50_tracks = get_playlist_tracks(playlist_id, limit=50)
print(top_50_tracks)


['2plbrEY59IikOBgBGLjaoe', '6dOtVTDdiauQNBQEDOtlAB', '5G2f63n7IPVPPjfNIGih7Q', '7tI8dRuH2Yc6RuoTjxo4dU', '2qSkIjg1o9h3YT9RAgYN75', '0WbMK4wrZ1wFSty9F7FCgu', '6WatFBLVB0x077xWeoVc2k', '5N3hjp1WNayUPZrA8kJmJP', '2PnlsTsOTLE5jnBnNe2K0A', '3xkHsmpQCBMytMJNiDf3Ii', '1UHS8Rf6h5Ar3CDWRd3wjF', '5fZJQrFKWQLb7FpJXZ1g7K', '17phhZDn6oGtzMe56NuWvj', '2cZOYofOX4d6g0OXxkaIjA', '3hRV0jL3vUpRrcy398teAU', '5Z0UnEtpLDQyYlWwgi8m9C', '7CyPwkp0oE8Ro9Dd5CUDjW', '2esZG2XFtuoWWA9AfDvSxy', '7z7kvUQGwlC6iOl7vMuAr9', '3WOhcATHxK2SLNeP5W3v1v', '0OA00aPt3BV10qeMIs3meW', '2QjOHCTQ1Jl3zawyYOpxh6', '5XeFesFbtLpXzIVDNQP22n', '6AI3ezQ4o3HUoP6Dhudph3', '4xdBrk0nFZaP54vvZj0yx7', '5AJ9hqTS2wcFQCELCFRO7A', '5IZXB5IKAD2qlvTPJYDCFB', '51ZQ1vr10ffzbwIjDCwqm4', '2nLtzopw4rPReszdYBJU6h', '42VsgItocQwOQC3XWZ8JNA', '62bOmKYxYg7dhrC6gH9vFn', '51rfRCiUSvxXlCSCfIztBy', '7ov3TDp5D00Rnu5R1viX4w', '0UYnhUfnUj5adChuAXvLUB', '3QaPy1KgI7nu9FJEQUgn6h', '3qhlB30KknSejmIvZZLjOD', '3AJwUDP919kvQ9QcozQPxg', '2aYZaN5SmkRDLsrrV8GkBQ', '1BxfuPKGua

In [9]:
def get_user_saved_tracks(limit=50):
    """
    Fetches the current user's saved track IDs.

    Args:
    - limit (int): Number of saved tracks to fetch (max 50 per request).

    Returns:
    - track_ids (list): List of track IDs from the user's saved tracks.
    """
    track_ids = []

    # Get current user's saved tracks
    results = sp.current_user_saved_tracks(limit=limit)

    # Collect track IDs
    for item in results['items']:
        track = item['track']
        track_ids.append(track['id'])

    return track_ids

# Example usage
user_saved_tracks = get_user_saved_tracks(limit=50)
print(user_saved_tracks)


ERROR:spotipy.client:HTTP Error for GET to https://api.spotify.com/v1/me/tracks with Params: {'limit': 50, 'offset': 0, 'market': None} returned 403 due to Forbidden.


SpotifyException: http status: 403, code:-1 - https://api.spotify.com/v1/me/tracks?limit=50&offset=0:
 Forbidden., reason: None

In [10]:
def get_tracks_from_artist(artist_name, limit=50):
    """
    Fetches track IDs from albums of a specific artist.

    Args:
    - artist_name (str): The name of the artist.
    - limit (int): Number of tracks to fetch.

    Returns:
    - track_ids (list): List of track IDs.
    """
    track_ids = []

    # Search for the artist by name
    results = sp.search(q=f'artist:{artist_name}', type='artist', limit=1)
    artist = results['artists']['items'][0]
    artist_id = artist['id']

    # Get the artist's albums
    albums = sp.artist_albums(artist_id, limit=limit)

    # Collect track IDs from each album
    for album in albums['items']:
        album_tracks = sp.album_tracks(album['id'], limit=50)
        for track in album_tracks['items']:
            track_ids.append(track['id'])

    return track_ids

# Example usage
artist_tracks = get_tracks_from_artist('Taylor Swift', limit=50)
print(artist_tracks)


['6dODwocEuGzHAavXqTbwHv', '4PdLaGZubp4lghChqp8erB', '7uGYWMwRy24dm7RUDDhUlD', '1kbEbBdEgQdQeLXCJh28pJ', '7wAkQFShJ27V8362MqevQr', '4QMgEffJQuKtjCNvqfRZ0m', '7IWcDWOfiooH5hRs9XOVYz', '5ExOm0dh4NyRyAdSAO9hyM', '799KrpEbhZp0MHeiA8YK9P', '2d8UxVNhJinc8uat9PoM9y', '5chnRTB9qMK3W1M41SnU9s', '3YkNIrAvbKNrrwwEd7NVLl', '2fPvQfGQEZOKtJ9qXeL4x8', '1xtw1krCR6Dw2KwkXw5z63', '1tuNqJOtRQVHvONR8Lg3MZ', '4d9PtIEVij9jW5OaLinH66', '62E2nR0od0M5HYxuYLaDz7', '1kcwpPDQnqEqmezzXdJTCP', '4EF6IyONolQy0bIQXm2EmX', '1rmEsOezwf2lmIZTMAO5Ag', '5Bedn0svl0ZD7RGmJkmKKw', '7Mts0OfPorF4iwOomvfqn1', '3hlGuz3loYoLfI3bpwieWq', '7ogK4lJDVDMU6A6vYR5rvD', '1Zai5UJ2di3qEuR2HeT2s8', '18WFFUIsewmA8g31KAeo3e', '0g4fMVo4JjwnIpTfFfLdxS', '3zMDGj4D8ogaYgAIZPeU7S', '2913xXOVAIDAqxzV2g4VcU', '2CnjDMdpRjlWv04Xk3s6MW', '1DTRUYVd8rYpla9hhVVwjo', '2OzhQlSqBEmt7hmkYxfT6m', '3NMrVbIVWT3fPXBj0rNDKG', '2XXwLdtuAcE0HSCu61ijAb', '2F3N9tdombb64aW6VtZOdo', '3Vevii7qKqrmW8CcyzBHDl', '5og4Qzt92jJzVDkOtSEilb', '3fO566xJgwxIa3qGCGBvIC', '3ZVFcD8Wlw

### Bulk dataset gathering before getting recommendations/features

In [9]:
def get_available_genres():
    """
    Fetches a list of available genre seeds from Spotify API.

    Returns:
    - genres (list): List of available genres.
    """
    genres = sp.recommendation_genre_seeds()['genres']
    return genres

# Example usage
available_genres = get_available_genres()
print(available_genres)


['acoustic', 'afrobeat', 'alt-rock', 'alternative', 'ambient', 'anime', 'black-metal', 'bluegrass', 'blues', 'bossanova', 'brazil', 'breakbeat', 'british', 'cantopop', 'chicago-house', 'children', 'chill', 'classical', 'club', 'comedy', 'country', 'dance', 'dancehall', 'death-metal', 'deep-house', 'detroit-techno', 'disco', 'disney', 'drum-and-bass', 'dub', 'dubstep', 'edm', 'electro', 'electronic', 'emo', 'folk', 'forro', 'french', 'funk', 'garage', 'german', 'gospel', 'goth', 'grindcore', 'groove', 'grunge', 'guitar', 'happy', 'hard-rock', 'hardcore', 'hardstyle', 'heavy-metal', 'hip-hop', 'holidays', 'honky-tonk', 'house', 'idm', 'indian', 'indie', 'indie-pop', 'industrial', 'iranian', 'j-dance', 'j-idol', 'j-pop', 'j-rock', 'jazz', 'k-pop', 'kids', 'latin', 'latino', 'malay', 'mandopop', 'metal', 'metal-misc', 'metalcore', 'minimal-techno', 'movies', 'mpb', 'new-age', 'new-release', 'opera', 'pagode', 'party', 'philippines-opm', 'piano', 'pop', 'pop-film', 'post-dubstep', 'power-po

In [24]:
def build_large_track_dataset(genres, playlists, num_tracks_per_source=50):
    """
    Builds a dataset of tracks by combining tracks from multiple genres and playlists.

    Args:
    - genres (list): List of genres to search.
    - playlists (list): List of playlist IDs to pull tracks from.
    - num_tracks_per_source (int): Number of tracks to fetch per genre/playlist.

    Returns:
    - tracks_df (pd.DataFrame): DataFrame containing track information.
    """
    track_data = []

    # Fetch tracks by genre
    for genre in genres:
        results = sp.search(q=f'genre:{genre}', type='track', limit=num_tracks_per_source)
        tracks = results['tracks']['items']

        # Collect relevant track information
        for track in tracks:
            track_info = {
                'track_id': track['id'],
                'track_name': track['name'],
                'artist_name': track['artists'][0]['name'],  # Take the first artist listed
                'album_name': track['album']['name'],
                'release_date': track['album']['release_date'],
                'popularity': track['popularity'],
                'genre_source': genre  # Save which genre the track came from
            }
            track_data.append(track_info)

    # Fetch tracks from playlists
    for playlist_id in playlists:
        results = sp.playlist_tracks(playlist_id, limit=num_tracks_per_source)
        tracks = results['items']

        for item in tracks:
            track = item['track']
            track_info = {
                'track_id': track['id'],
                'track_name': track['name'],
                'artist_name': track['artists'][0]['name'],
                'album_name': track['album']['name'],
                'release_date': track['album']['release_date'],
                'popularity': track['popularity'],
                'playlist_source': playlist_id  # Save which playlist the track came from
            }
            track_data.append(track_info)

    # Convert list of track data to a DataFrame
    tracks_df = pd.DataFrame(track_data)

    return tracks_df

# Example usage
# genres = ['pop', 'rock', 'hip-hop']
playlists = ['37i9dQZEVXbMDoHDwVN2tF', '37i9dQZF1DWXRqgorJj26U']  # Top 50 Global and USA

# Fetch the dataset
tracks_df = build_large_track_dataset(available_genres, playlists, num_tracks_per_source=50)

# Display the first few rows of the dataframe
tracks_df


Unnamed: 0,track_id,track_name,artist_name,album_name,release_date,popularity,genre_source,playlist_source
0,1HMQmOWrkieKYWlFsjUP3D,Bloom - Bonus Track,The Paper Kites,Woodland,2013-03-05,77,acoustic,
1,6uHvbKL0Yi37AuvNRmUfMw,Paint,The Paper Kites,Young North,2013-03-05,73,acoustic,
2,7jIAttgQTpLDoNtykIQXjH,Blister In The Sun,Violent Femmes,Violent Femmes,1983-01-01,72,acoustic,
3,4E6cwWJWZw2zWf7VFbH7wf,Love Song,Sara Bareilles,Little Voice,2007-07-03,74,acoustic,
4,1jyddn36UN4tVsJGtaJfem,You Are the Best Thing,Ray LaMontagne,Gossip In The Grain,2008-10-13,69,acoustic,
...,...,...,...,...,...,...,...,...
5745,7GonnnalI2s19OCQO1J7Tf,Kickstart My Heart,Mötley Crüe,Dr. Feelgood,1989,2,,37i9dQZF1DWXRqgorJj26U
5746,5LNiqEqpDc8TuqPy79kDBu,Edge of Seventeen - 2016 Remaster,Stevie Nicks,Bella Donna (Deluxe Edition),2016-11-04,56,,37i9dQZF1DWXRqgorJj26U
5747,6NxsCnLeLd8Ai1TrgGxzIx,Bad Moon Rising,Creedence Clearwater Revival,Green River (40th Anniversary Edition),1969-08-03,0,,37i9dQZF1DWXRqgorJj26U
5748,5eYwDBLucWfWI5KsV7oYX2,Mary Jane's Last Dance,Tom Petty and the Heartbreakers,Anthology: Through The Years,2000-01-01,0,,37i9dQZF1DWXRqgorJj26U


In [25]:
tracks_df.to_csv('tracks_df_og.csv', index=False)

In [16]:
import pandas as pd

def load_existing_dataset(filepath):
    """
    Load an existing dataset from a CSV file.

    Args:
    - filepath (str): Path to the CSV file.

    Returns:
    - df (pd.DataFrame): Loaded DataFrame.
    """
    try:
        return pd.read_csv(filepath)
    except FileNotFoundError:
        print(f"File {filepath} not found. Returning an empty DataFrame.")
        return pd.DataFrame()

def save_dataset(df, filepath):
    """
    Save the dataset to a CSV file.

    Args:
    - df (pd.DataFrame): DataFrame to save.
    - filepath (str): Path to save the CSV file.
    """
    df.to_csv(filepath, index=False)
    print(f"Dataset saved to {filepath}")

def fetch_tracks_by_artist(artist_name, limit=50):
    """
    Fetches tracks by a specific artist.

    Args:
    - artist_name (str): The artist's name.
    - limit (int): Number of tracks to fetch.

    Returns:
    - track_data (list): List of track information dictionaries.
    """
    track_data = []
    results = sp.search(q=f'artist:{artist_name}', type='track', limit=limit)
    tracks = results['tracks']['items']

    for track in tracks:
        track_info = {
            'track_id': track['id'],
            'track_name': track['name'],
            'artist_name': track['artists'][0]['name'],  # First artist listed
            'album_name': track['album']['name'],
            'release_date': track['album']['release_date'],
            'popularity': track['popularity']
        }
        track_data.append(track_info)

    return track_data

def update_dataset_with_artist_tracks(df, artist_names, limit_per_artist=50):
    """
    Updates the dataset by fetching additional tracks for each unique artist.

    Args:
    - df (pd.DataFrame): Existing DataFrame with track information.
    - artist_names (list): List of artist names to fetch more tracks for.
    - limit_per_artist (int): Number of tracks to fetch per artist.

    Returns:
    - updated_df (pd.DataFrame): DataFrame with additional tracks.
    """
    existing_track_ids = set(df['track_id'].tolist())  # Track existing track IDs to avoid duplicates
    new_track_data = []

    for artist_name in artist_names:
        print(f"Fetching tracks for artist: {artist_name}")
        artist_tracks = fetch_tracks_by_artist(artist_name, limit=limit_per_artist)

        for track_info in artist_tracks:
            if track_info['track_id'] not in existing_track_ids:  # Avoid duplicates
                new_track_data.append(track_info)

    # Convert the new tracks to a DataFrame and append to the existing one
    if new_track_data:
        new_tracks_df = pd.DataFrame(new_track_data)
        updated_df = pd.concat([df, new_tracks_df], ignore_index=True)
    else:
        updated_df = df  # No new data, return original

    return updated_df

# Example usage:

# Load existing dataset
filepath = "existing_tracks_dataset.csv"
tracks_df = load_existing_dataset('tracks_df_og.csv')

# Get unique artist names
unique_artists = tracks_df['artist_name'].unique()

# Fetch additional tracks by these artists
updated_tracks_df = update_dataset_with_artist_tracks(tracks_df, unique_artists)

# Save the updated dataset
save_dataset(updated_tracks_df, 'tracks_df_og-w-artists.csv')

updated_tracks_df

Fetching tracks for artist: The Paper Kites
Fetching tracks for artist: Violent Femmes
Fetching tracks for artist: Sara Bareilles
Fetching tracks for artist: Ray LaMontagne
Fetching tracks for artist: Jason Mraz
Fetching tracks for artist: Chord Overstreet
Fetching tracks for artist: Iron & Wine
Fetching tracks for artist: Matt Nathanson
Fetching tracks for artist: Drew Holcomb & The Neighbors
Fetching tracks for artist: Ben Rector
Fetching tracks for artist: Disney Peaceful Guitar
Fetching tracks for artist: Ingrid Michaelson
Fetching tracks for artist: Anna Nalick
Fetching tracks for artist: Ichiko Aoba
Fetching tracks for artist: Kurt Cobain
Fetching tracks for artist: Howie Day
Fetching tracks for artist: Brandi Carlile
Fetching tracks for artist: Amos Lee
Fetching tracks for artist: Eddie Vedder
Fetching tracks for artist: Kina Grannis
Fetching tracks for artist: Augustana
Fetching tracks for artist: Dr. John
Fetching tracks for artist: Eric Hutchinson
Fetching tracks for artist: 

Unnamed: 0,track_id,track_name,artist_name,album_name,release_date,popularity,genre_source,playlist_source
0,1HMQmOWrkieKYWlFsjUP3D,Bloom - Bonus Track,The Paper Kites,Woodland,2013-03-05,77,acoustic,
1,6uHvbKL0Yi37AuvNRmUfMw,Paint,The Paper Kites,Young North,2013-03-05,73,acoustic,
2,7jIAttgQTpLDoNtykIQXjH,Blister In The Sun,Violent Femmes,Violent Femmes,1983-01-01,72,acoustic,
3,4E6cwWJWZw2zWf7VFbH7wf,Love Song,Sara Bareilles,Little Voice,2007-07-03,74,acoustic,
4,1jyddn36UN4tVsJGtaJfem,You Are the Best Thing,Ray LaMontagne,Gossip In The Grain,2008-10-13,69,acoustic,
...,...,...,...,...,...,...,...,...
91268,1DqYqVLxUsR6NLUH1ys3AU,If Anyone Falls - 2016 Remaster,Stevie Nicks,The Wild Heart (2016 Remastered),1983-06-10,36,,
91269,0Adyxuv3X9l2CtMt0OeY5M,Stop Draggin' My Heart Around (with Tom Petty ...,Stevie Nicks,Bella Donna (Deluxe Edition),2016-11-04,36,,
91270,48RJAYTcIXuBPg55EzbaEq,Stand Back - 2016 Remaster,Stevie Nicks,The Wild Heart (2016 Remastered),1983-06-10,37,,
91271,35I5lX6yE00YSu7PEgES54,Sleeping Angel (From Fast Times at Ridgemont H...,Stevie Nicks,Bella Donna (Deluxe Edition),2016-11-04,34,,


In [18]:
track_df = pd.read_csv('tracks_df_og-w-artists.csv')
track_df

  track_df = pd.read_csv('tracks_df_og-w-artists.csv')


Unnamed: 0,track_id,track_name,artist_name,album_name,release_date,popularity,genre_source,playlist_source
0,1HMQmOWrkieKYWlFsjUP3D,Bloom - Bonus Track,The Paper Kites,Woodland,2013-03-05,77,acoustic,
1,6uHvbKL0Yi37AuvNRmUfMw,Paint,The Paper Kites,Young North,2013-03-05,73,acoustic,
2,7jIAttgQTpLDoNtykIQXjH,Blister In The Sun,Violent Femmes,Violent Femmes,1983-01-01,72,acoustic,
3,4E6cwWJWZw2zWf7VFbH7wf,Love Song,Sara Bareilles,Little Voice,2007-07-03,74,acoustic,
4,1jyddn36UN4tVsJGtaJfem,You Are the Best Thing,Ray LaMontagne,Gossip In The Grain,2008-10-13,69,acoustic,
...,...,...,...,...,...,...,...,...
91268,1DqYqVLxUsR6NLUH1ys3AU,If Anyone Falls - 2016 Remaster,Stevie Nicks,The Wild Heart (2016 Remastered),1983-06-10,36,,
91269,0Adyxuv3X9l2CtMt0OeY5M,Stop Draggin' My Heart Around (with Tom Petty ...,Stevie Nicks,Bella Donna (Deluxe Edition),2016-11-04,36,,
91270,48RJAYTcIXuBPg55EzbaEq,Stand Back - 2016 Remaster,Stevie Nicks,The Wild Heart (2016 Remastered),1983-06-10,37,,
91271,35I5lX6yE00YSu7PEgES54,Sleeping Angel (From Fast Times at Ridgemont H...,Stevie Nicks,Bella Donna (Deluxe Edition),2016-11-04,34,,


## Extract track features/get spotify recommendations to seed

In [49]:
# def add_audio_features_to_df(df):
#     """
#     Adds audio features to a DataFrame containing track information.

#     Args:
#     - df (pd.DataFrame): DataFrame with track information including 'track_id'.

#     Returns:
#     - df (pd.DataFrame): DataFrame with additional columns for audio features.
#     """
#     audio_features_list = []

#     # Fetch audio features for each track
#     for track_id in df['track_id']:
#         features = sp.audio_features(track_id)[0]  # Fetch audio features for the track
#         if features:
#             audio_features_list.append(features)
#         else:
#             audio_features_list.append({})

#     # Convert audio features list to DataFrame and merge with the original DataFrame
#     audio_features_df = pd.DataFrame(audio_features_list)
#     df_with_features = pd.concat([df.reset_index(drop=True), audio_features_df.reset_index(drop=True)], axis=1)

#     return df_with_features

# tracks_with_features_df = add_audio_features_to_df(tracks_df)
# tracks_with_features_df


ERROR:spotipy.client:Max Retries reached


SpotifyException: http status: 429, code:-1 - /v1/audio-features/?ids=1HMQmOWrkieKYWlFsjUP3D:
 Max Retries, reason: too many 429 error responses

In [20]:
# import time
# import logging

# # Set up basic logging
# logging.basicConfig(level=logging.WARNING)

# def add_audio_features_to_df(df, max_retries=3, wait_time=2):
#     """
#     Adds audio features to a DataFrame containing track information with error handling.

#     Args:
#     - df (pd.DataFrame): DataFrame with track information including 'track_id'.
#     - max_retries (int): Maximum number of retries before skipping a track.
#     - wait_time (int): Time to wait in seconds before retrying a request (in case of rate limit).

#     Returns:
#     - df (pd.DataFrame): DataFrame with additional columns for audio features.
#     """
#     audio_features_list = []

#     for index, track_id in enumerate(df['track_id']):
#         retries = 0
#         while retries < max_retries:
#             try:
#                 # Fetch audio features for the track
#                 features = sp.audio_features(track_id)[0]
#                 if features:
#                     audio_features_list.append(features)
#                 else:
#                     audio_features_list.append({})
#                 break  # Exit the retry loop if successful

#             except spotipy.exceptions.SpotifyException as e:
#                 # Handle rate limit (429 error)
#                 if e.http_status == 429:
#                     retries += 1
#                     # Get retry-after header to know how long to wait
#                     retry_after = int(e.headers.get('Retry-After', wait_time))
#                     logging.warning(f"Rate limited on track {track_id}, retrying in {retry_after} seconds...")
#                     time.sleep(retry_after)  # Sleep for the time specified by Spotify

#                 else:
#                     logging.error(f"Error fetching audio features for track {track_id}: {str(e)}")
#                     audio_features_list.append({})
#                     break  # Exit the retry loop if it's not a rate limit error

#         if retries == max_retries:
#             logging.warning(f"Max retries reached for track {track_id}, skipping...")
#             audio_features_list.append({})  # Add empty dict if we skip the track

#     # Convert the list of audio features to a DataFrame
#     audio_features_df = pd.DataFrame(audio_features_list)

#     # Concatenate the original DataFrame with the audio features DataFrame
#     df_with_features = pd.concat([df.reset_index(drop=True), audio_features_df.reset_index(drop=True)], axis=1)

#     return df_with_features

# # Example usage
# tracks_with_features_df = add_audio_features_to_df(tracks_df)
# tracks_with_features_df


import time
import logging
import pandas as pd

# Set up basic logging
logging.basicConfig(level=logging.WARNING)

def add_audio_features_to_df(df, max_retries=3, wait_time=2, save_filepath='audio_features_backup.csv'):
    """
    Adds audio features to a DataFrame containing track information with error handling.
    Saves the progress and returns what has been gathered so far if the rate limit is hit multiple times.

    Args:
    - df (pd.DataFrame): DataFrame with track information including 'track_id'.
    - max_retries (int): Maximum number of retries before skipping a track.
    - wait_time (int): Time to wait in seconds before retrying a request (in case of rate limit).
    - save_filepath (str): Filepath to save partial results in case of multiple 429 rate limit errors.

    Returns:
    - df_with_features (pd.DataFrame): DataFrame with additional columns for audio features.
    """
    audio_features_list = []

    for index, track_id in enumerate(df['track_id']):
        retries = 0
        while retries < max_retries:
            try:
                # Fetch audio features for the track
                print('track id:', track_id)
                features = sp.audio_features(track_id)[0]
                if features:
                    audio_features_list.append(features)
                else:
                    audio_features_list.append({})
                break  # Exit the retry loop if successful

            except spotipy.exceptions.SpotifyException as e:
                # Handle rate limit (429 error)
                if e.http_status == 429:
                    retries += 1
                    # Get retry-after header to know how long to wait
                    retry_after = int(e.headers.get('Retry-After', wait_time))
                    logging.warning(f"Rate limited on track {track_id}, retrying in {retry_after} seconds...")
                    time.sleep(retry_after)  # Sleep for the time specified by Spotify

                else:
                    logging.error(f"Error fetching audio features for track {track_id}: {str(e)}")
                    audio_features_list.append({})
                    break  # Exit the retry loop if it's not a rate limit error

        # If retries exhausted due to rate limiting, save progress and return
        if retries == max_retries:
            logging.warning(f"Max retries reached for track {track_id}, saving progress and stopping...")

            # Convert the list of audio features to a DataFrame
            audio_features_df = pd.DataFrame(audio_features_list)

            # Concatenate the original DataFrame with the audio features DataFrame
            df_with_features = pd.concat([df.iloc[:len(audio_features_df)].reset_index(drop=True), audio_features_df.reset_index(drop=True)], axis=1)

            # Save the progress to a CSV file
            df_with_features.to_csv(save_filepath, index=False)
            logging.info(f"Partial results saved to {save_filepath}")
            return df_with_features

    # Convert the list of audio features to a DataFrame
    audio_features_df = pd.DataFrame(audio_features_list)

    # Concatenate the original DataFrame with the audio features DataFrame
    df_with_features = pd.concat([df.reset_index(drop=True), audio_features_df.reset_index(drop=True)], axis=1)

    return df_with_features

# Example usage
tracks_with_features_df = add_audio_features_to_df(tracks_df, max_retries=3, wait_time=2, save_filepath='audio_features_backup.csv')
tracks_with_features_df



track id: 1HMQmOWrkieKYWlFsjUP3D
track id: 6uHvbKL0Yi37AuvNRmUfMw
track id: 7jIAttgQTpLDoNtykIQXjH
track id: 4E6cwWJWZw2zWf7VFbH7wf
track id: 1jyddn36UN4tVsJGtaJfem
track id: 3S0OXQeoh0w6AY8WQVckRW
track id: 1EzrEOXmMH3G43AXT1y7pA
track id: 5vjLSffimiIP26QG5WcN2K
track id: 53QF56cjZA9RTuuMZDrSA6
track id: 24CcvMOaNniXXcxA8HjUw1
track id: 6UIxGIqWlO5wsddY44AV1R
track id: 5KaN2M7seUV4uyNFp4twv6
track id: 6Uy6K3KdmUdAfelUp0SeXn
track id: 1MxHIIzcTeFnaNsDGY3pfj
track id: 38YgZVHPWOWsKrsCXz6JyP
track id: 1fEGtTZjrjJW8eUeewnNJR
track id: 5htghP7rThIe6oXBN6uYI5
track id: 2dm13KtTfScU9ObCzYzGyS
track id: 07m8PuXxxv5J4qPEDq6ZkK
track id: 0oOY4xChdxmGgpyQrY2FBz
track id: 6WuBXGHGpSqpkFCl7ei8xp
track id: 7aohwSiTDju51QmC54AUba
track id: 5auMzVHFr5Zfw6IbKarZ56
track id: 0RD3NWnHlyBCRwgNZy8QAn
track id: 7x4b0UccXSKBWxWmjcrG2T
track id: 6dwkFmLUz6hx2CZG2EqLol
track id: 0IktbUcnAGrvD03AWnz3Q8
track id: 0EKBV6GybPtALXUgWqWrym
track id: 0OiCqRaoKK6nuSIty8zHjl
track id: 1jHNQodYIGvk187xrISw1i
track id: 

ERROR:spotipy.client:Max Retries reached


track id: 65uoaqX5qcjXZRheAj1qQT


ERROR:spotipy.client:Max Retries reached


track id: 65uoaqX5qcjXZRheAj1qQT


ERROR:spotipy.client:Max Retries reached


Unnamed: 0,track_id,track_name,artist_name,album_name,release_date,popularity,genre_source,playlist_source,danceability,energy,...,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,1HMQmOWrkieKYWlFsjUP3D,Bloom - Bonus Track,The Paper Kites,Woodland,2013-03-05,77,acoustic,,0.591,0.416,...,0.1120,0.415,96.003,audio_features,1HMQmOWrkieKYWlFsjUP3D,spotify:track:1HMQmOWrkieKYWlFsjUP3D,https://api.spotify.com/v1/tracks/1HMQmOWrkieK...,https://api.spotify.com/v1/audio-analysis/1HMQ...,210080,4
1,6uHvbKL0Yi37AuvNRmUfMw,Paint,The Paper Kites,Young North,2013-03-05,73,acoustic,,0.364,0.308,...,0.1230,0.387,73.328,audio_features,6uHvbKL0Yi37AuvNRmUfMw,spotify:track:6uHvbKL0Yi37AuvNRmUfMw,https://api.spotify.com/v1/tracks/6uHvbKL0Yi37...,https://api.spotify.com/v1/audio-analysis/6uHv...,191773,4
2,7jIAttgQTpLDoNtykIQXjH,Blister In The Sun,Violent Femmes,Violent Femmes,1983-01-01,72,acoustic,,0.726,0.537,...,0.0707,0.882,96.889,audio_features,7jIAttgQTpLDoNtykIQXjH,spotify:track:7jIAttgQTpLDoNtykIQXjH,https://api.spotify.com/v1/tracks/7jIAttgQTpLD...,https://api.spotify.com/v1/audio-analysis/7jIA...,145707,4
3,4E6cwWJWZw2zWf7VFbH7wf,Love Song,Sara Bareilles,Little Voice,2007-07-03,74,acoustic,,0.583,0.786,...,0.1880,0.573,123.055,audio_features,4E6cwWJWZw2zWf7VFbH7wf,spotify:track:4E6cwWJWZw2zWf7VFbH7wf,https://api.spotify.com/v1/tracks/4E6cwWJWZw2z...,https://api.spotify.com/v1/audio-analysis/4E6c...,258827,4
4,1jyddn36UN4tVsJGtaJfem,You Are the Best Thing,Ray LaMontagne,Gossip In The Grain,2008-10-13,69,acoustic,,0.578,0.727,...,0.1720,0.885,170.593,audio_features,1jyddn36UN4tVsJGtaJfem,spotify:track:1jyddn36UN4tVsJGtaJfem,https://api.spotify.com/v1/tracks/1jyddn36UN4t...,https://api.spotify.com/v1/audio-analysis/1jyd...,231840,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1748,4HrIZyZIqd7kqShDXEyN1n,Laranjinha,Wesley Safadão,Arrocha Safadão,2024-09-06,59,forro,,0.722,0.687,...,0.0656,0.923,137.992,audio_features,4HrIZyZIqd7kqShDXEyN1n,spotify:track:4HrIZyZIqd7kqShDXEyN1n,https://api.spotify.com/v1/tracks/4HrIZyZIqd7k...,https://api.spotify.com/v1/audio-analysis/4HrI...,175978,4
1749,4dGx53NJZJyuuLP5ownH1p,Mentira Estampada,Wesley Safadão,Arrocha Safadão,2024-09-06,60,forro,,0.704,0.773,...,0.0978,0.864,137.881,audio_features,4dGx53NJZJyuuLP5ownH1p,spotify:track:4dGx53NJZJyuuLP5ownH1p,https://api.spotify.com/v1/tracks/4dGx53NJZJyu...,https://api.spotify.com/v1/audio-analysis/4dGx...,173207,4
1750,1eyzqe2QqGZUmfcPZtrIyt,Midnight City,M83,"Hurry Up, We're Dreaming",2011,70,french,,0.526,0.712,...,0.1790,0.320,105.009,audio_features,1eyzqe2QqGZUmfcPZtrIyt,spotify:track:1eyzqe2QqGZUmfcPZtrIyt,https://api.spotify.com/v1/tracks/1eyzqe2QqGZU...,https://api.spotify.com/v1/audio-analysis/1eyz...,241440,4
1751,5ZduaRci3iNUiDfJbBfAaf,Give It To Me - Full Vocal Mix,Matt Sassari,Give It To Me (Full Vocal Mix),2021-10-22,83,french,,0.874,0.869,...,0.1640,0.726,126.027,audio_features,5ZduaRci3iNUiDfJbBfAaf,spotify:track:5ZduaRci3iNUiDfJbBfAaf,https://api.spotify.com/v1/tracks/5ZduaRci3iNU...,https://api.spotify.com/v1/audio-analysis/5Zdu...,102861,4


In [21]:
tracks_with_features_df

Unnamed: 0,track_id,track_name,artist_name,album_name,release_date,popularity,genre_source,playlist_source,danceability,energy,...,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,1HMQmOWrkieKYWlFsjUP3D,Bloom - Bonus Track,The Paper Kites,Woodland,2013-03-05,77,acoustic,,0.591,0.416,...,0.1120,0.415,96.003,audio_features,1HMQmOWrkieKYWlFsjUP3D,spotify:track:1HMQmOWrkieKYWlFsjUP3D,https://api.spotify.com/v1/tracks/1HMQmOWrkieK...,https://api.spotify.com/v1/audio-analysis/1HMQ...,210080,4
1,6uHvbKL0Yi37AuvNRmUfMw,Paint,The Paper Kites,Young North,2013-03-05,73,acoustic,,0.364,0.308,...,0.1230,0.387,73.328,audio_features,6uHvbKL0Yi37AuvNRmUfMw,spotify:track:6uHvbKL0Yi37AuvNRmUfMw,https://api.spotify.com/v1/tracks/6uHvbKL0Yi37...,https://api.spotify.com/v1/audio-analysis/6uHv...,191773,4
2,7jIAttgQTpLDoNtykIQXjH,Blister In The Sun,Violent Femmes,Violent Femmes,1983-01-01,72,acoustic,,0.726,0.537,...,0.0707,0.882,96.889,audio_features,7jIAttgQTpLDoNtykIQXjH,spotify:track:7jIAttgQTpLDoNtykIQXjH,https://api.spotify.com/v1/tracks/7jIAttgQTpLD...,https://api.spotify.com/v1/audio-analysis/7jIA...,145707,4
3,4E6cwWJWZw2zWf7VFbH7wf,Love Song,Sara Bareilles,Little Voice,2007-07-03,74,acoustic,,0.583,0.786,...,0.1880,0.573,123.055,audio_features,4E6cwWJWZw2zWf7VFbH7wf,spotify:track:4E6cwWJWZw2zWf7VFbH7wf,https://api.spotify.com/v1/tracks/4E6cwWJWZw2z...,https://api.spotify.com/v1/audio-analysis/4E6c...,258827,4
4,1jyddn36UN4tVsJGtaJfem,You Are the Best Thing,Ray LaMontagne,Gossip In The Grain,2008-10-13,69,acoustic,,0.578,0.727,...,0.1720,0.885,170.593,audio_features,1jyddn36UN4tVsJGtaJfem,spotify:track:1jyddn36UN4tVsJGtaJfem,https://api.spotify.com/v1/tracks/1jyddn36UN4t...,https://api.spotify.com/v1/audio-analysis/1jyd...,231840,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1748,4HrIZyZIqd7kqShDXEyN1n,Laranjinha,Wesley Safadão,Arrocha Safadão,2024-09-06,59,forro,,0.722,0.687,...,0.0656,0.923,137.992,audio_features,4HrIZyZIqd7kqShDXEyN1n,spotify:track:4HrIZyZIqd7kqShDXEyN1n,https://api.spotify.com/v1/tracks/4HrIZyZIqd7k...,https://api.spotify.com/v1/audio-analysis/4HrI...,175978,4
1749,4dGx53NJZJyuuLP5ownH1p,Mentira Estampada,Wesley Safadão,Arrocha Safadão,2024-09-06,60,forro,,0.704,0.773,...,0.0978,0.864,137.881,audio_features,4dGx53NJZJyuuLP5ownH1p,spotify:track:4dGx53NJZJyuuLP5ownH1p,https://api.spotify.com/v1/tracks/4dGx53NJZJyu...,https://api.spotify.com/v1/audio-analysis/4dGx...,173207,4
1750,1eyzqe2QqGZUmfcPZtrIyt,Midnight City,M83,"Hurry Up, We're Dreaming",2011,70,french,,0.526,0.712,...,0.1790,0.320,105.009,audio_features,1eyzqe2QqGZUmfcPZtrIyt,spotify:track:1eyzqe2QqGZUmfcPZtrIyt,https://api.spotify.com/v1/tracks/1eyzqe2QqGZU...,https://api.spotify.com/v1/audio-analysis/1eyz...,241440,4
1751,5ZduaRci3iNUiDfJbBfAaf,Give It To Me - Full Vocal Mix,Matt Sassari,Give It To Me (Full Vocal Mix),2021-10-22,83,french,,0.874,0.869,...,0.1640,0.726,126.027,audio_features,5ZduaRci3iNUiDfJbBfAaf,spotify:track:5ZduaRci3iNUiDfJbBfAaf,https://api.spotify.com/v1/tracks/5ZduaRci3iNU...,https://api.spotify.com/v1/audio-analysis/5Zdu...,102861,4


In [18]:
# def get_track_features(track_ids):
#     """
#     Fetches audio features for a list of track IDs.

#     Args:
#     - track_ids (list): List of Spotify track IDs.

#     Returns:
#     - features (list): A list of dictionaries containing audio features for each track.
#     """
#     features = []

#     # Fetch audio features in batches
#     for i in range(0, len(track_ids), 100):  # 100 is the maximum batch size per request
#         audio_features = sp.audio_features(track_ids[i:i+100])
#         features.extend(audio_features)

#     return features

# # Example usage
# # track_ids = ['track_id1', 'track_id2', 'track_id3']  # Replace with actual track IDs
# track_features = get_track_features(large_track_id_array)
# track_features

[{'danceability': 0.7,
  'energy': 0.582,
  'key': 11,
  'loudness': -5.96,
  'mode': 0,
  'speechiness': 0.0356,
  'acousticness': 0.0502,
  'instrumentalness': 0,
  'liveness': 0.0881,
  'valence': 0.785,
  'tempo': 116.712,
  'type': 'audio_features',
  'id': '0WbMK4wrZ1wFSty9F7FCgu',
  'uri': 'spotify:track:0WbMK4wrZ1wFSty9F7FCgu',
  'track_href': 'https://api.spotify.com/v1/tracks/0WbMK4wrZ1wFSty9F7FCgu',
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/0WbMK4wrZ1wFSty9F7FCgu',
  'duration_ms': 218424,
  'time_signature': 4},
 {'danceability': 0.747,
  'energy': 0.507,
  'key': 2,
  'loudness': -10.171,
  'mode': 1,
  'speechiness': 0.0358,
  'acousticness': 0.2,
  'instrumentalness': 0.0608,
  'liveness': 0.117,
  'valence': 0.438,
  'tempo': 104.978,
  'type': 'audio_features',
  'id': '6dOtVTDdiauQNBQEDOtlAB',
  'uri': 'spotify:track:6dOtVTDdiauQNBQEDOtlAB',
  'track_href': 'https://api.spotify.com/v1/tracks/6dOtVTDdiauQNBQEDOtlAB',
  'analysis_url': 'https://api.sp

In [32]:
# def get_recommendations(seed_tracks, limit=10):
#     """
#     Fetches track recommendations based on seed tracks.

#     Args:
#     - seed_tracks (list): List of seed track IDs.
#     - limit (int): Number of recommendations to fetch.

#     Returns:
#     - recommendations (list): List of recommended track objects.
#     """
#     recommendations = sp.recommendations(seed_tracks=seed_tracks, limit=limit)
#     return recommendations['tracks']

# # Example usage
# recommendations = get_recommendations(tracks_with_features_df['track_id'])
# recommendations

def get_recommendations(seed_tracks, limit=10):
    """
    Fetches track recommendations based on seed tracks.

    Args:
    - seed_tracks (list): List of seed track IDs.
    - limit (int): Number of recommendations to fetch.

    Returns:
    - recommendations (list): List of recommended track objects.
    """
    recommendations = sp.recommendations(seed_tracks=seed_tracks, limit=limit)
    return recommendations['tracks']

# Example usage
seed_tracks_list = tracks_with_features_df['track_id'].tolist()  # Convert to list
spotify_recommendations = get_recommendations(seed_tracks_list[:5])  # Fetch recommendations using the first 5 track IDs as seeds
print(spotify_recommendations)


[{'album': {'album_type': 'ALBUM', 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/1QAJqy2dA3ihHBFIHRphZj'}, 'href': 'https://api.spotify.com/v1/artists/1QAJqy2dA3ihHBFIHRphZj', 'id': '1QAJqy2dA3ihHBFIHRphZj', 'name': 'Cigarettes After Sex', 'type': 'artist', 'uri': 'spotify:artist:1QAJqy2dA3ihHBFIHRphZj'}], 'available_markets': ['AR', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA', 'CL', 'CO', 'CR', 'CY', 'CZ', 'DK', 'DO', 'DE', 'EC', 'EE', 'SV', 'FI', 'FR', 'GR', 'GT', 'HN', 'HK', 'HU', 'IS', 'IE', 'IT', 'LV', 'LT', 'LU', 'MY', 'MT', 'MX', 'NL', 'NI', 'NO', 'PA', 'PY', 'PE', 'PH', 'PL', 'PT', 'SG', 'SK', 'ES', 'SE', 'CH', 'TW', 'TR', 'UY', 'US', 'GB', 'AD', 'LI', 'MC', 'ID', 'JP', 'TH', 'VN', 'RO', 'IL', 'ZA', 'SA', 'AE', 'BH', 'QA', 'OM', 'KW', 'EG', 'MA', 'DZ', 'TN', 'LB', 'JO', 'PS', 'IN', 'BY', 'KZ', 'MD', 'UA', 'AL', 'BA', 'HR', 'ME', 'MK', 'RS', 'SI', 'KR', 'BD', 'PK', 'LK', 'GH', 'KE', 'NG', 'TZ', 'UG', 'AG', 'AM', 'BS', 'BB', 'BZ', 'BT', 'BW', 'BF', 'CV', 'CW',

In [35]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Step 1: Normalize audio features
scaler = MinMaxScaler()
tracks_with_normalized_features = tracks_with_features_df.copy()
numerical_columns = ['danceability', 'energy', 'tempo', 'acousticness', 'valence']
tracks_with_normalized_features[numerical_columns] = scaler.fit_transform(
    tracks_with_normalized_features[numerical_columns])

# Step 2: Fetch recommendations and build the target dataframe
recommendations = []
for track_id in tracks_with_normalized_features['track_id']:
    recs = get_recommendations([track_id], limit=5)  # Get 5 recommendations
    rec_track_ids = [track['id'] for track in recs]
    recommendations.append({'track_id': track_id, 'recommended_tracks': rec_track_ids})

recommendations_df = pd.DataFrame(recommendations)
recommendations_df

KeyboardInterrupt: 

## Build seed dataset

In [None]:
import pandas as pd

def build_dataset(seed_tracks, num_recommendations=20):
    """
    Builds a dataset of tracks and their features, including recommendations.

    Args:
    - seed_tracks (list): List of seed track IDs.
    - num_recommendations (int): Number of recommendations to fetch.

    Returns:
    - df (pd.DataFrame): A DataFrame containing track features and recommendations.
    """
    # Get initial track features for seed tracks
    seed_track_features = get_track_features(seed_tracks)

    # Get recommended tracks
    recommended_tracks = get_recommendations(seed_tracks, limit=num_recommendations)

    # Extract the IDs of the recommended tracks
    recommended_track_ids = [track['id'] for track in recommended_tracks]

    # Get audio features for recommended tracks
    recommended_track_features = get_track_features(recommended_track_ids)

    # Combine seed track features and recommended track features
    all_features = seed_track_features + recommended_track_features

    # Convert to DataFrame
    df = pd.DataFrame(all_features)

    return df

# Example usage
seed_tracks = ['track_id1', 'track_id2', 'track_id3']  # Replace with actual seed track IDs
df = build_dataset(seed_tracks)
print(df.head())


## Content-Based Filtering
Content-based filtering recommends songs based on features of the song itself (e.g., audio features). This method can be useful when starting with a general dataset and allows you to build a recommendation engine that compares items directly.

Steps:

Collect features such as tempo, energy, danceability, acousticness, valence, etc., using Spotify’s API.
Use these features to compute song similarity (e.g., cosine similarity).
Recommend songs that are closest to the song features using similarity metrics.

In [22]:
tracks_with_features_df

Unnamed: 0,track_id,track_name,artist_name,album_name,release_date,popularity,genre_source,playlist_source,danceability,energy,...,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,1HMQmOWrkieKYWlFsjUP3D,Bloom - Bonus Track,The Paper Kites,Woodland,2013-03-05,77,acoustic,,0.591,0.416,...,0.1120,0.415,96.003,audio_features,1HMQmOWrkieKYWlFsjUP3D,spotify:track:1HMQmOWrkieKYWlFsjUP3D,https://api.spotify.com/v1/tracks/1HMQmOWrkieK...,https://api.spotify.com/v1/audio-analysis/1HMQ...,210080,4
1,6uHvbKL0Yi37AuvNRmUfMw,Paint,The Paper Kites,Young North,2013-03-05,73,acoustic,,0.364,0.308,...,0.1230,0.387,73.328,audio_features,6uHvbKL0Yi37AuvNRmUfMw,spotify:track:6uHvbKL0Yi37AuvNRmUfMw,https://api.spotify.com/v1/tracks/6uHvbKL0Yi37...,https://api.spotify.com/v1/audio-analysis/6uHv...,191773,4
2,7jIAttgQTpLDoNtykIQXjH,Blister In The Sun,Violent Femmes,Violent Femmes,1983-01-01,72,acoustic,,0.726,0.537,...,0.0707,0.882,96.889,audio_features,7jIAttgQTpLDoNtykIQXjH,spotify:track:7jIAttgQTpLDoNtykIQXjH,https://api.spotify.com/v1/tracks/7jIAttgQTpLD...,https://api.spotify.com/v1/audio-analysis/7jIA...,145707,4
3,4E6cwWJWZw2zWf7VFbH7wf,Love Song,Sara Bareilles,Little Voice,2007-07-03,74,acoustic,,0.583,0.786,...,0.1880,0.573,123.055,audio_features,4E6cwWJWZw2zWf7VFbH7wf,spotify:track:4E6cwWJWZw2zWf7VFbH7wf,https://api.spotify.com/v1/tracks/4E6cwWJWZw2z...,https://api.spotify.com/v1/audio-analysis/4E6c...,258827,4
4,1jyddn36UN4tVsJGtaJfem,You Are the Best Thing,Ray LaMontagne,Gossip In The Grain,2008-10-13,69,acoustic,,0.578,0.727,...,0.1720,0.885,170.593,audio_features,1jyddn36UN4tVsJGtaJfem,spotify:track:1jyddn36UN4tVsJGtaJfem,https://api.spotify.com/v1/tracks/1jyddn36UN4t...,https://api.spotify.com/v1/audio-analysis/1jyd...,231840,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1748,4HrIZyZIqd7kqShDXEyN1n,Laranjinha,Wesley Safadão,Arrocha Safadão,2024-09-06,59,forro,,0.722,0.687,...,0.0656,0.923,137.992,audio_features,4HrIZyZIqd7kqShDXEyN1n,spotify:track:4HrIZyZIqd7kqShDXEyN1n,https://api.spotify.com/v1/tracks/4HrIZyZIqd7k...,https://api.spotify.com/v1/audio-analysis/4HrI...,175978,4
1749,4dGx53NJZJyuuLP5ownH1p,Mentira Estampada,Wesley Safadão,Arrocha Safadão,2024-09-06,60,forro,,0.704,0.773,...,0.0978,0.864,137.881,audio_features,4dGx53NJZJyuuLP5ownH1p,spotify:track:4dGx53NJZJyuuLP5ownH1p,https://api.spotify.com/v1/tracks/4dGx53NJZJyu...,https://api.spotify.com/v1/audio-analysis/4dGx...,173207,4
1750,1eyzqe2QqGZUmfcPZtrIyt,Midnight City,M83,"Hurry Up, We're Dreaming",2011,70,french,,0.526,0.712,...,0.1790,0.320,105.009,audio_features,1eyzqe2QqGZUmfcPZtrIyt,spotify:track:1eyzqe2QqGZUmfcPZtrIyt,https://api.spotify.com/v1/tracks/1eyzqe2QqGZU...,https://api.spotify.com/v1/audio-analysis/1eyz...,241440,4
1751,5ZduaRci3iNUiDfJbBfAaf,Give It To Me - Full Vocal Mix,Matt Sassari,Give It To Me (Full Vocal Mix),2021-10-22,83,french,,0.874,0.869,...,0.1640,0.726,126.027,audio_features,5ZduaRci3iNUiDfJbBfAaf,spotify:track:5ZduaRci3iNUiDfJbBfAaf,https://api.spotify.com/v1/tracks/5ZduaRci3iNU...,https://api.spotify.com/v1/audio-analysis/5Zdu...,102861,4


In [23]:
list(tracks_with_features_df)

['track_id',
 'track_name',
 'artist_name',
 'album_name',
 'release_date',
 'popularity',
 'genre_source',
 'playlist_source',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'type',
 'id',
 'uri',
 'track_href',
 'analysis_url',
 'duration_ms',
 'time_signature']

### Using cosine-similarity to recommend with content-based filtering

In [43]:
from sklearn.metrics.pairwise import cosine_similarity

# Example of fetching and comparing Spotify song features
song_features = tracks_with_features_df[['danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'time_signature',
#  'duration_ms',
 ]]
similarity_matrix = cosine_similarity(song_features)

# Recommend similar songs
def recommend_songs(song_id, similarity_matrix, df, top_n=10):
    song_index = df[df['track_id'] == song_id].index[0]
    similar_indices = similarity_matrix[song_index].argsort()[::-1][1:top_n+1]
    return df.iloc[similar_indices]

cosine_rec_df = recommend_songs('7BlvPctRnjjJUjBnrySJ7b', similarity_matrix, tracks_with_features_df)
cosine_rec_df

Unnamed: 0,track_id,track_name,artist_name,album_name,release_date,popularity,genre_source,playlist_source,danceability,energy,...,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
1126,1dr9fQuYZ6NtGuVqdMOTw1,Romance,Varials,In Darkness,2019-10-11,55,death-metal,,0.661,0.692,...,0.162,0.0595,126.845,audio_features,1dr9fQuYZ6NtGuVqdMOTw1,spotify:track:1dr9fQuYZ6NtGuVqdMOTw1,https://api.spotify.com/v1/tracks/1dr9fQuYZ6Nt...,https://api.spotify.com/v1/audio-analysis/1dr9...,142520,4
606,3aawXLnJpcDVXCl1UUksAW,淘汰,Eason Chan,認了吧,2007-01-01,58,cantopop,,0.512,0.551,...,0.302,0.206,117.978,audio_features,3aawXLnJpcDVXCl1UUksAW,spotify:track:3aawXLnJpcDVXCl1UUksAW,https://api.spotify.com/v1/tracks/3aawXLnJpcDV...,https://api.spotify.com/v1/audio-analysis/3aaw...,285000,4
1161,6txvQu0zUbiqG24A8XMLnK,Miracle Maker,Dom Dolla,Miracle Maker,2022-07-14,66,deep-house,,0.609,0.94,...,0.15,0.238,128.001,audio_features,6txvQu0zUbiqG24A8XMLnK,spotify:track:6txvQu0zUbiqG24A8XMLnK,https://api.spotify.com/v1/tracks/6txvQu0zUbiq...,https://api.spotify.com/v1/audio-analysis/6txv...,188592,4
556,1c8gk2PeTE04A1pIDH9YMk,Rolling in the Deep,Adele,21,2011-01-24,79,british,,0.73,0.769,...,0.0473,0.507,104.948,audio_features,1c8gk2PeTE04A1pIDH9YMk,spotify:track:1c8gk2PeTE04A1pIDH9YMk,https://api.spotify.com/v1/tracks/1c8gk2PeTE04...,https://api.spotify.com/v1/audio-analysis/1c8g...,228093,4
964,5W4kiM2cUYBJXKRudNyxjW,You Proof,Morgan Wallen,One Thing At A Time,2023-03-03,82,country,,0.728,0.818,...,0.582,0.681,119.706,audio_features,5W4kiM2cUYBJXKRudNyxjW,spotify:track:5W4kiM2cUYBJXKRudNyxjW,https://api.spotify.com/v1/tracks/5W4kiM2cUYBJ...,https://api.spotify.com/v1/audio-analysis/5W4k...,157478,4
500,4ZtqsOdBbS6GoedzzRGSo9,Breathe,The Prodigy,The Fat of the Land,1997-06-30,53,breakbeat,,0.673,0.808,...,0.037,0.303,130.041,audio_features,4ZtqsOdBbS6GoedzzRGSo9,spotify:track:4ZtqsOdBbS6GoedzzRGSo9,https://api.spotify.com/v1/tracks/4ZtqsOdBbS6G...,https://api.spotify.com/v1/audio-analysis/4Ztq...,336280,4
1232,1BW0sbpZavICte8D22HKNW,Something Here for the Club - Instrumental,Terrence Parker,Something Here for the Club (Instrumental),2018-09-14,37,detroit-techno,,0.825,0.858,...,0.0321,0.906,130.03,audio_features,1BW0sbpZavICte8D22HKNW,spotify:track:1BW0sbpZavICte8D22HKNW,https://api.spotify.com/v1/tracks/1BW0sbpZavIC...,https://api.spotify.com/v1/audio-analysis/1BW0...,403191,4
311,3iubkenxO8JUJNp7phyVlb,My Meds Aren't Working,Dystopia,Dystopia,2008-04-19,52,black-metal,,0.288,0.792,...,0.209,0.16,137.077,audio_features,3iubkenxO8JUJNp7phyVlb,spotify:track:3iubkenxO8JUJNp7phyVlb,https://api.spotify.com/v1/tracks/3iubkenxO8JU...,https://api.spotify.com/v1/audio-analysis/3iub...,241625,4
1078,6YB6CK4Tsb0BgtxCEL9KlI,Give It Up to Me,Sean Paul,The Trinity,2005-09-27,64,dancehall,,0.855,0.674,...,0.0548,0.578,95.991,audio_features,6YB6CK4Tsb0BgtxCEL9KlI,spotify:track:6YB6CK4Tsb0BgtxCEL9KlI,https://api.spotify.com/v1/tracks/6YB6CK4Tsb0B...,https://api.spotify.com/v1/audio-analysis/6YB6...,242253,4
1565,4YIoQqE50AdyG4BQafCi3u,High You Are (Branchez Remix),What So Not,High You Are (Remixes),2013-10-25,61,electronic,,0.691,0.856,...,0.034,0.414,136.98,audio_features,4YIoQqE50AdyG4BQafCi3u,spotify:track:4YIoQqE50AdyG4BQafCi3u,https://api.spotify.com/v1/tracks/4YIoQqE50Ady...,https://api.spotify.com/v1/audio-analysis/4YIo...,213340,4


### Cosine Similarity, Weighted Cosine Similarity, K-Nearest Neighbors


In [32]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder


def prepare_data(df):
    """
    Prepares the dataset for content-based filtering by one-hot encoding genre
    and normalizing the features.

    Args:
    - df (pd.DataFrame): The dataframe containing track data.

    Returns:
    - df (pd.DataFrame): The prepared dataframe with one-hot encoded genres
      and normalized features.
    """
    # One-hot encode the genre column
    # if 'genre_source' in df.columns:
    #     df = pd.get_dummies(df, columns=['genre_source'], prefix='genre')

    genre_encoder = OneHotEncoder()
    genre_encoded = genre_encoder.fit_transform(tracks_with_features_df[['genre']]).toarray()


    # Normalize the audio features and popularity
    features_to_normalize = ['popularity', 'danceability', 'energy', 'key', 'loudness',
                             'mode', 'speechiness', 'acousticness', 'instrumentalness',
                             'liveness', 'valence', 'tempo', 'duration_ms']

    df[features_to_normalize] = df[features_to_normalize].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

    return df

# Example usage:
tracks_prepared_df = prepare_data(tracks_with_features_df)
tracks_prepared_df

Unnamed: 0,track_id,track_name,artist_name,album_name,release_date,popularity,playlist_source,danceability,energy,key,...,genre_drum-and-bass,genre_dub,genre_dubstep,genre_edm,genre_electro,genre_electronic,genre_emo,genre_folk,genre_forro,genre_french
0,1HMQmOWrkieKYWlFsjUP3D,Bloom - Bonus Track,The Paper Kites,Woodland,2013-03-05,0.763441,,0.606776,0.415263,0.090909,...,False,False,False,False,False,False,False,False,False,False
1,6uHvbKL0Yi37AuvNRmUfMw,Paint,The Paper Kites,Young North,2013-03-05,0.720430,,0.373717,0.306756,0.363636,...,False,False,False,False,False,False,False,False,False,False
2,7jIAttgQTpLDoNtykIQXjH,Blister In The Sun,Violent Femmes,Violent Femmes,1983-01-01,0.709677,,0.745380,0.536832,0.000000,...,False,False,False,False,False,False,False,False,False,False
3,4E6cwWJWZw2zWf7VFbH7wf,Love Song,Sara Bareilles,Little Voice,2007-07-03,0.731183,,0.598563,0.787003,0.181818,...,False,False,False,False,False,False,False,False,False,False
4,1jyddn36UN4tVsJGtaJfem,You Are the Best Thing,Ray LaMontagne,Gossip In The Grain,2008-10-13,0.677419,,0.593429,0.727726,0.909091,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1748,4HrIZyZIqd7kqShDXEyN1n,Laranjinha,Wesley Safadão,Arrocha Safadão,2024-09-06,0.569892,,0.741273,0.687538,0.545455,...,False,False,False,False,False,False,False,False,True,False
1749,4dGx53NJZJyuuLP5ownH1p,Mentira Estampada,Wesley Safadão,Arrocha Safadão,2024-09-06,0.580645,,0.722793,0.773942,0.363636,...,False,False,False,False,False,False,False,False,True,False
1750,1eyzqe2QqGZUmfcPZtrIyt,Midnight City,M83,"Hurry Up, We're Dreaming",2011,0.688172,,0.540041,0.712655,1.000000,...,False,False,False,False,False,False,False,False,False,True
1751,5ZduaRci3iNUiDfJbBfAaf,Give It To Me - Full Vocal Mix,Matt Sassari,Give It To Me (Full Vocal Mix),2021-10-22,0.827957,,0.897331,0.870393,0.090909,...,False,False,False,False,False,False,False,False,False,True


#### Weighted Cosine Similarity

By adding additional weighting to certain features we can adjust the recommendations we get, by including key and speechinees, and adding weighting to popularity, genre seems to align more closely

In [33]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def weighted_cosine_similarity(df, weight_dict):
    """
    Computes a weighted cosine similarity matrix based on the given weights.

    Args:
    - df (pd.DataFrame): The dataframe containing track data.
    - weight_dict (dict): A dictionary of weights for each feature.

    Returns:
    - similarity_matrix (np.array): A matrix of weighted cosine similarities.
    """
    feature_columns = list(weight_dict.keys())

    # Apply weights to the features
    weighted_features = df[feature_columns] * np.array([weight_dict[feat] for feat in feature_columns])

    # Compute cosine similarity on the weighted features
    similarity_matrix = cosine_similarity(weighted_features)

    return similarity_matrix

# Example weights
feature_weights = {
    'popularity': 0.2,
    'danceability': 0.2,
    'energy': 0.2,
    'valence': 0.1,
    'loudness': 0.1,
    'key': 0.1,
    'speechiness': 0.1
    # Add other features if needed
    # 'popularity', 'danceability', 'energy', 'key', 'loudness',
    # 'mode', 'speechiness', 'acousticness', 'instrumentalness',
    # 'liveness', 'valence', 'tempo', 'duration_ms'
}

# Compute weighted similarity matrix
weighted_similarity_matrix = weighted_cosine_similarity(tracks_prepared_df, feature_weights)
weighted_similarity_matrix

array([[1.        , 0.97420876, 0.97572887, ..., 0.90640296, 0.96868938,
        0.95535937],
       [0.97420876, 1.        , 0.92914891, ..., 0.92758802, 0.91654641,
        0.96435619],
       [0.97572887, 0.92914891, 1.        , ..., 0.88078839, 0.98280735,
        0.93599592],
       ...,
       [0.90640296, 0.92758802, 0.88078839, ..., 1.        , 0.91957489,
        0.98092578],
       [0.96868938, 0.91654641, 0.98280735, ..., 0.91957489, 1.        ,
        0.94519759],
       [0.95535937, 0.96435619, 0.93599592, ..., 0.98092578, 0.94519759,
        1.        ]])

In [46]:
weighted_sim_rec_df = recommend_songs('7BlvPctRnjjJUjBnrySJ7b', weighted_similarity_matrix, tracks_prepared_df)
weighted_sim_rec_df


Unnamed: 0,track_id,track_name,artist_name,album_name,release_date,popularity,playlist_source,danceability,energy,key,...,genre_drum-and-bass,genre_dub,genre_dubstep,genre_edm,genre_electro,genre_electronic,genre_emo,genre_folk,genre_forro,genre_french
491,1erRDf98WX0VllciGPYO1H,Arise,Sepultura,Arise,1991-03-20,0.526882,,0.38193,0.962826,0.818182,...,False,False,False,False,False,False,False,False,False,False
1317,6tC2iHfUlzB2W4ntXXL2BH,Propane Nightmares,Pendulum,In Silico,2008-05-09,0.548387,,0.365503,0.96785,0.636364,...,True,False,False,False,False,False,False,False,False,False
324,0bKs1y9PTFBddM9qj0JGvb,Puritania,Dimmu Borgir,Puritanical Euphoric Misanthropia,2001-03-12,0.494624,,0.311088,0.970864,0.727273,...,False,False,False,False,False,False,False,False,False,False
1110,5OjCsHeByDYEGxMrb1z8KQ,Flying Whales,Gojira,From Mars to Sirius,2005-09-07,0.602151,,0.273101,0.926657,0.636364,...,False,False,False,False,False,False,False,False,False,False
1108,6W2KBpMMXW17r7dPyqi8Iu,Repentless,Slayer,Repentless,2015-09-11,0.623656,,0.410678,0.995981,0.727273,...,False,False,False,False,False,False,False,False,False,False
1139,6yerffT19n4aHyY25Rnkfq,Bratva,Slaughter to Prevail,Kostolom,2019,0.548387,,0.427105,0.993972,1.0,...,False,False,False,False,False,False,False,False,False,False
1326,2lN6G35gsXkA3xzPYqmis5,Self vs Self (feat. In Flames),Pendulum,Immersion,2010-05-21,0.526882,,0.449692,0.990958,0.909091,...,True,False,False,False,False,False,False,False,False,False
309,1PZ1po0vZzESv0AJCURC72,Sun//Eater,Lorna Shore,Pain Remains,2022-10-14,0.537634,,0.25154,0.890487,0.818182,...,False,False,False,False,False,False,False,False,False,False
1111,3zwdN4h7DNlGlm3w4KylOM,Remember Me,Currents,The Death We Seek,2023-05-05,0.569892,,0.290554,0.922638,0.909091,...,False,False,False,False,False,False,False,False,False,False
311,3iubkenxO8JUJNp7phyVlb,My Meds Aren't Working,Dystopia,Dystopia,2008-04-19,0.494624,,0.295688,0.793031,0.909091,...,False,False,False,False,False,False,False,False,False,False


#### K nearest neighbors

In [38]:
from sklearn.neighbors import NearestNeighbors

def knn_recommendation(df, weight_dict, n_neighbors=10):
    """
    Recommend songs using K-Nearest Neighbors based on weighted features.

    Args:
    - df (pd.DataFrame): The dataframe containing track data.
    - weight_dict (dict): A dictionary of weights for each feature.
    - n_neighbors (int): Number of neighbors (recommendations) to return.

    Returns:
    - knn_model: Trained KNN model.
    - distances: Distances of nearest neighbors.
    - indices: Indices of nearest neighbors in the dataframe.
    """
    feature_columns = list(feature_weights.keys())

    # Apply weights to the features
    weighted_features = df[feature_columns] * np.array([feature_weights[feat] for feat in feature_columns])

    # Initialize KNN model
    knn_model = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')

    # Fit the KNN model on weighted features
    knn_model.fit(weighted_features)

    return knn_model

def get_knn_recommendations(song_id, df, knn_model, n_neighbors=10):
    """
    Fetch recommendations for a specific song using a pre-trained KNN model.

    Args:
    - song_id (str): The ID of the song for which recommendations are to be fetched.
    - df (pd.DataFrame): The dataframe containing track data.
    - knn_model: Trained KNN model.
    - n_neighbors (int): Number of recommendations to return.

    Returns:
    - recommendations (pd.DataFrame): DataFrame of recommended songs.
    """
    song_index = df[df['track_id'] == song_id].index[0]
    distances, indices = knn_model.kneighbors([df.iloc[song_index][list(feature_weights.keys())].values], n_neighbors=n_neighbors+1)

    # Get indices of the most similar songs (excluding the first one, which is the seed song itself)
    similar_indices = indices.flatten()[1:]

    return df.iloc[similar_indices]

# Example usage:
knn_model = knn_recommendation(tracks_prepared_df, feature_weights)
knn_recommendations = get_knn_recommendations('7BlvPctRnjjJUjBnrySJ7b', tracks_prepared_df, knn_model)
knn_recommendations




Unnamed: 0,track_id,track_name,artist_name,album_name,release_date,popularity,playlist_source,danceability,energy,key,...,genre_drum-and-bass,genre_dub,genre_dubstep,genre_edm,genre_electro,genre_electronic,genre_emo,genre_folk,genre_forro,genre_french
378,0H7ROR8cZTsh6YUgnlqjM2,Ghosts Of Mississippi,The Steeldrivers,Reckless,2010-01-01,0.516129,,0.328542,0.723707,1.0,...,False,False,False,False,False,False,False,False,False,False
311,3iubkenxO8JUJNp7phyVlb,My Meds Aren't Working,Dystopia,Dystopia,2008-04-19,0.494624,,0.295688,0.793031,0.909091,...,False,False,False,False,False,False,False,False,False,False
1429,1E0CZWim9mfwrCkXvieES8,Ecstasy Of Soul,Zeds Dead,Ecstasy Of Soul,2022-12-14,0.505376,,0.389117,0.89953,1.0,...,False,False,True,False,False,False,False,False,False,False
1383,1E0CZWim9mfwrCkXvieES8,Ecstasy Of Soul,Zeds Dead,Ecstasy Of Soul,2022-12-14,0.505376,,0.389117,0.89953,1.0,...,False,True,False,False,False,False,False,False,False,False
332,5fLHBxfx4JvgU65pkO74br,Step One,Make Them Suffer,How To Survive A Funeral,2020-06-19,0.419355,,0.389117,0.501668,1.0,...,False,False,False,False,False,False,False,False,False,False
1408,27YD36FUikKxbp3bKSiKGZ,First Time (feat. Dylan Matthew),Seven Lions,First Time (feat. Dylan Matthew),2018-10-12,0.537634,,0.433265,0.677491,1.0,...,False,False,True,False,False,False,False,False,False,False
1359,27YD36FUikKxbp3bKSiKGZ,First Time (feat. Dylan Matthew),Seven Lions,First Time (feat. Dylan Matthew),2018-10-12,0.537634,,0.433265,0.677491,1.0,...,False,True,False,False,False,False,False,False,False,False
317,3ffljpjKfPISFjCt4zpYUk,Hollowed Heart,Make Them Suffer,Hollowed Heart,2019-06-07,0.473118,,0.310062,0.986939,1.0,...,False,False,False,False,False,False,False,False,False,False
1332,2qxiyScLvjCMb9gg1hTl5F,Here For You,Wilkinson,Cognition,2022-02-11,0.591398,,0.367556,0.780975,1.0,...,True,False,False,False,False,False,False,False,False,False
1439,6BYzwbWg1Z2EB6VUXTYnhm,Worlds Away,Dabin,Worlds Away,2023-06-16,0.505376,,0.455852,0.827191,1.0,...,False,False,True,False,False,False,False,False,False,False


### Euclidean Distance

Description: Unlike cosine similarity, which measures the angle between two vectors, Euclidean distance measures the "straight-line" distance between points in a multi-dimensional space. It works best when features are normalized.


When to Use: Useful when you want to measure the actual distance between two feature sets, but it tends to be sensitive to scale, so normalization is important.

In [58]:
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import MinMaxScaler

weighted_similarity_features = list(feature_weights.keys())

def euclidean_similarity(df, feature_columns):
    # Normalize the features to a range of [0, 1]
    scaler = MinMaxScaler()
    scaled_features = scaler.fit_transform(df[feature_columns])

    # Compute the Euclidean distance matrix
    distance_matrix = euclidean_distances(scaled_features)

    # Convert distances to similarity (lower distance = higher similarity)
    similarity_matrix = 1 / (1 + distance_matrix)

    return similarity_matrix

# Example usage:
euclidean_similarity_matrix = euclidean_similarity(tracks_with_features_df, weighted_similarity_features)
euclidean_similarity_matrix

euclidean_distance_rec_df = recommend_songs('7BlvPctRnjjJUjBnrySJ7b', euclidean_similarity_matrix, tracks_prepared_df)
euclidean_distance_rec_df

Unnamed: 0,track_id,track_name,artist_name,album_name,release_date,popularity,playlist_source,danceability,energy,key,...,genre_drum-and-bass,genre_dub,genre_dubstep,genre_edm,genre_electro,genre_electronic,genre_emo,genre_folk,genre_forro,genre_french
1317,6tC2iHfUlzB2W4ntXXL2BH,Propane Nightmares,Pendulum,In Silico,2008-05-09,0.548387,,0.365503,0.96785,0.636364,...,True,False,False,False,False,False,False,False,False,False
1110,5OjCsHeByDYEGxMrb1z8KQ,Flying Whales,Gojira,From Mars to Sirius,2005-09-07,0.602151,,0.273101,0.926657,0.636364,...,False,False,False,False,False,False,False,False,False,False
324,0bKs1y9PTFBddM9qj0JGvb,Puritania,Dimmu Borgir,Puritanical Euphoric Misanthropia,2001-03-12,0.494624,,0.311088,0.970864,0.727273,...,False,False,False,False,False,False,False,False,False,False
491,1erRDf98WX0VllciGPYO1H,Arise,Sepultura,Arise,1991-03-20,0.526882,,0.38193,0.962826,0.818182,...,False,False,False,False,False,False,False,False,False,False
331,7dQrAIlUHD9DpA3wUxpaDW,Apotheosis,Lorna Shore,Pain Remains,2022-10-14,0.451613,,0.217659,0.832215,0.818182,...,False,False,False,False,False,False,False,False,False,False
345,5wWRdIjndDOh1j4OXAPpdD,Deathcrush,Mayhem,Deathcrush,1993,0.462366,,0.224846,0.909577,0.636364,...,False,False,False,False,False,False,False,False,False,False
309,1PZ1po0vZzESv0AJCURC72,Sun//Eater,Lorna Shore,Pain Remains,2022-10-14,0.537634,,0.25154,0.890487,0.818182,...,False,False,False,False,False,False,False,False,False,False
1346,0CkmD2L3xMeZTXOwlOCsVm,"Infinity (feat. ILIRA, iiola & Tom Cane)",Wilkinson,"Infinity (feat. ILIRA, iiola & Tom Cane)",2023-03-31,0.548387,,0.436345,0.871398,0.818182,...,True,False,False,False,False,False,False,False,False,False
1403,49o6YjBAnjwPKLcXwIH53Z,Crawl Outta Love,ILLENIUM,Awake,2017-09-21,0.580645,,0.387064,0.723707,0.636364,...,False,False,True,False,False,False,False,False,False,False
1353,49o6YjBAnjwPKLcXwIH53Z,Crawl Outta Love,ILLENIUM,Awake,2017-09-21,0.580645,,0.387064,0.723707,0.636364,...,False,True,False,False,False,False,False,False,False,False


### Manhattan Distance

Description: Manhattan distance (or "L1 distance") measures the absolute sum of differences across features. This can sometimes perform better when features are sparse or have high variability.


When to Use: Good for high-dimensional spaces or sparse features

In [62]:
from sklearn.metrics.pairwise import manhattan_distances

def manhattan_similarity(df, feature_columns):
    # Normalize the features
    scaler = MinMaxScaler()
    scaled_features = scaler.fit_transform(df[feature_columns])

    # Compute Manhattan distance matrix
    distance_matrix = manhattan_distances(scaled_features)

    # Convert distances to similarity
    similarity_matrix = 1 / (1 + distance_matrix)

    return similarity_matrix

# Example usage:
manhattan_similarity_matrix = manhattan_similarity(tracks_with_features_df, weighted_similarity_features)
manhattan_similarity_matrix

manhattan_similarity_df = recommend_songs('7BlvPctRnjjJUjBnrySJ7b', manhattan_similarity_matrix, tracks_prepared_df)
manhattan_similarity_df

Unnamed: 0,track_id,track_name,artist_name,album_name,release_date,popularity,playlist_source,danceability,energy,key,...,genre_drum-and-bass,genre_dub,genre_dubstep,genre_edm,genre_electro,genre_electronic,genre_emo,genre_folk,genre_forro,genre_french
1317,6tC2iHfUlzB2W4ntXXL2BH,Propane Nightmares,Pendulum,In Silico,2008-05-09,0.548387,,0.365503,0.96785,0.636364,...,True,False,False,False,False,False,False,False,False,False
324,0bKs1y9PTFBddM9qj0JGvb,Puritania,Dimmu Borgir,Puritanical Euphoric Misanthropia,2001-03-12,0.494624,,0.311088,0.970864,0.727273,...,False,False,False,False,False,False,False,False,False,False
513,3m1JFLpKLYgMi1jf6Bu16P,Let Forever Be,The Chemical Brothers,Surrender,1999-01-01,0.526882,,0.258727,0.873408,0.727273,...,False,False,False,False,False,False,False,False,False,False
1110,5OjCsHeByDYEGxMrb1z8KQ,Flying Whales,Gojira,From Mars to Sirius,2005-09-07,0.602151,,0.273101,0.926657,0.636364,...,False,False,False,False,False,False,False,False,False,False
491,1erRDf98WX0VllciGPYO1H,Arise,Sepultura,Arise,1991-03-20,0.526882,,0.38193,0.962826,0.818182,...,False,False,False,False,False,False,False,False,False,False
331,7dQrAIlUHD9DpA3wUxpaDW,Apotheosis,Lorna Shore,Pain Remains,2022-10-14,0.451613,,0.217659,0.832215,0.818182,...,False,False,False,False,False,False,False,False,False,False
309,1PZ1po0vZzESv0AJCURC72,Sun//Eater,Lorna Shore,Pain Remains,2022-10-14,0.537634,,0.25154,0.890487,0.818182,...,False,False,False,False,False,False,False,False,False,False
1346,0CkmD2L3xMeZTXOwlOCsVm,"Infinity (feat. ILIRA, iiola & Tom Cane)",Wilkinson,"Infinity (feat. ILIRA, iiola & Tom Cane)",2023-03-31,0.548387,,0.436345,0.871398,0.818182,...,True,False,False,False,False,False,False,False,False,False
345,5wWRdIjndDOh1j4OXAPpdD,Deathcrush,Mayhem,Deathcrush,1993,0.462366,,0.224846,0.909577,0.636364,...,False,False,False,False,False,False,False,False,False,False
311,3iubkenxO8JUJNp7phyVlb,My Meds Aren't Working,Dystopia,Dystopia,2008-04-19,0.494624,,0.295688,0.793031,0.909091,...,False,False,False,False,False,False,False,False,False,False


### Pearson Correlation

Description: Pearson correlation measures the linear correlation between two sets of variables (features in this case). It gives a similarity score based on how the values of different features increase or decrease together.


When to Use: Best when you believe there is a linear relationship between features

In [68]:
def pearson_similarity(df, feature_columns):
    # Compute the Pearson correlation matrix
    correlation_matrix = df[feature_columns].corr().values

    # Convert correlation matrix to a similarity matrix (scale between [0, 1])
    similarity_matrix = (correlation_matrix + 1) / 2

    return similarity_matrix

# Example usage:
pearson_similarity_matrix = pearson_similarity(tracks_with_features_df, weighted_similarity_features)


### Jaccard Similarity
Description: Jaccard similarity measures the similarity between two sets by looking at the ratio of common features to total features. It's typically used for binary data but can be adapted to measure categorical or multi-label features, such as genre.


When to Use: Useful when working with binary or categorical data (like genre, mood tags, etc.).

In [70]:
from sklearn.metrics import jaccard_score

def jaccard_similarity(df, feature_columns):
    # Compute Jaccard similarity matrix for the one-hot encoded genre features
    genre_similarity_matrix = cosine_similarity(df[feature_columns])

    return genre_similarity_matrix

# Example usage with one-hot encoded genre columns:
one_hot_genres = pd.get_dummies(tracks_with_features_df['genre'])
tracks_with_genres_df = pd.concat([tracks_with_features_df, one_hot_genres], axis=1)
jaccard_similarity_matrix = jaccard_similarity(tracks_with_genres_df, one_hot_genres.columns)


NameError: name 'tracks_with_genres_df' is not defined

## Evaluation Metrics

####  Diversity

1 Diversity


Diversity ensures the recommendations aren’t too similar to each other. You can measure this by comparing the pairwise cosine similarities between the recommended tracks.

In [39]:
def compute_diversity(recommendations_df, feature_columns):
    """
    Computes diversity by calculating pairwise cosine similarity between recommended tracks.

    Args:
    - recommendations_df (pd.DataFrame): DataFrame of recommended tracks.
    - feature_columns (list): List of feature columns used for comparison.

    Returns:
    - diversity (float): Average pairwise cosine similarity (lower is more diverse).
    """
    features = recommendations_df[feature_columns]
    pairwise_similarities = cosine_similarity(features)

    # Take the upper triangle of the similarity matrix (excluding the diagonal) to avoid redundancy
    upper_tri_indices = np.triu_indices_from(pairwise_similarities, k=1)
    avg_similarity = np.mean(pairwise_similarities[upper_tri_indices])

    return 1 - avg_similarity  # A higher diversity is represented by lower average similarity


#### Coverage

Coverage measures the proportion of the entire dataset that your recommendation model can meaningfully recommend.

In [47]:
def compute_coverage(recommendations_df, total_df):
    """
    Computes coverage as the proportion of unique tracks recommended.

    Args:
    - recommendations_df (pd.DataFrame): DataFrame of recommended tracks.
    - total_df (pd.DataFrame): The entire dataset of tracks.

    Returns:
    - coverage (float): The percentage of tracks recommended relative to the dataset size.
    """
    unique_recommended_tracks = recommendations_df['track_id'].nunique()
    total_tracks = total_df['track_id'].nunique()

    return unique_recommended_tracks / total_tracks


#### Popularity Bias

Popularity bias measures how much the recommendations favor popular tracks.



In [48]:
def compute_popularity_bias(recommendations_df):
    """
    Computes the average popularity score of the recommended tracks.

    Args:
    - recommendations_df (pd.DataFrame): DataFrame of recommended tracks.

    Returns:
    - avg_popularity (float): Average popularity score.
    """
    return recommendations_df['popularity'].mean()


#### Artist Variety

This checks whether the recommendations come from a variety of different artists.

In [49]:
def compute_artist_variety(recommendations_df):
    """
    Computes the variety of artists in the recommended tracks.

    Args:
    - recommendations_df (pd.DataFrame): DataFrame of recommended tracks.

    Returns:
    - artist_variety (float): The proportion of unique artists in the recommendations.
    """
    unique_artists = recommendations_df['artist_name'].nunique()
    total_recommendations = len(recommendations_df)

    return unique_artists / total_recommendations


### Evaluate Recommendations Main Method

In [50]:
def evaluate_recommendations(recommendations_df, total_df, feature_columns):
    """
    Evaluates recommendations using various metrics such as diversity, coverage, popularity bias, and artist variety.

    Args:
    - recommendations_df (pd.DataFrame): DataFrame of recommended tracks.
    - total_df (pd.DataFrame): The entire dataset of tracks.
    - feature_columns (list): List of feature columns used for similarity comparison.

    Returns:
    - evaluation (dict): Dictionary containing evaluation metrics.
    """
    evaluation = {
        'diversity': compute_diversity(recommendations_df, feature_columns),
        'coverage': compute_coverage(recommendations_df, total_df),
        'popularity_bias': compute_popularity_bias(recommendations_df),
        'artist_variety': compute_artist_variety(recommendations_df),
    }

    return evaluation

In [54]:
cosine_similarity_features = [
    'danceability',
    'energy',
    'valence',
    'loudness',
    'key',
    'speechiness'
]


cos_sim_results = evaluate_recommendations(cosine_rec_df, tracks_prepared_df, cosine_similarity_features)
cos_sim_results

{'diversity': 0.0025058800751338595,
 'coverage': 0.006321112515802781,
 'popularity_bias': 60.7,
 'artist_variety': 1.0}

In [63]:
weighted_similarity_features = list(feature_weights.keys())

weighted_sim_results = evaluate_recommendations(weighted_sim_rec_df, tracks_prepared_df, weighted_similarity_features)
weighted_sim_results

{'diversity': 0.010983808793516237,
 'coverage': 0.006321112515802781,
 'popularity_bias': 0.5473118279569892,
 'artist_variety': 0.9}

In [64]:
knn_features = list(feature_weights.keys())

knn_results = evaluate_recommendations(knn_recommendations, tracks_prepared_df, weighted_similarity_features)
knn_results

{'diversity': 0.018833437707238443,
 'coverage': 0.0050568900126422255,
 'popularity_bias': 0.5086021505376344,
 'artist_variety': 0.7}

In [65]:
euclidean_features = list(feature_weights.keys())

euclidean_results = evaluate_recommendations(euclidean_distance_rec_df, tracks_prepared_df, weighted_similarity_features)
euclidean_results

{'diversity': 0.011061117766022699,
 'coverage': 0.005689001264222503,
 'popularity_bias': 0.5333333333333334,
 'artist_variety': 0.8}

In [67]:
manhattan_results = evaluate_recommendations(manhattan_similarity_df, tracks_prepared_df, weighted_similarity_features)
manhattan_results

{'diversity': 0.011568843278761287,
 'coverage': 0.006321112515802781,
 'popularity_bias': 0.5193548387096774,
 'artist_variety': 0.9}

## Collaborative Filtering
Collaborative filtering leverages user preferences across a large set of users, analyzing patterns of co-listened songs. It's one of the most popular techniques for recommendations and can be useful when you move toward incorporating user data.

Types:

User-based collaborative filtering: Recommends songs based on what similar users have enjoyed.
Item-based collaborative filtering: Recommends songs based on what similar songs were listened to by others.
Example (Using Matrix Factorization with surprise library):

In [None]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Assuming you have user-song ratings data (e.g., from Spotify listens)
reader = Reader(rating_scale=(1, 5))  # Scale depends on how you collect feedback
data = Dataset.load_from_df(df[['user_id', 'song_id', 'rating']], reader)

trainset, testset = train_test_split(data, test_size=0.2)

# Use SVD for matrix factorization
algo = SVD()
algo.fit(trainset)

predictions = algo.test(testset)
accuracy.rmse(predictions)


## Matrix Factorization (Latent Factor Models)
Matrix factorization aims to find hidden factors in the data, which can capture underlying patterns in how users interact with songs. This is effective for implicit data (e.g., listen counts instead of explicit ratings).

Approach:

Use techniques like SVD (Singular Value Decomposition) or ALS (Alternating Least Squares).
You can use implicit feedback (e.g., play count) to build a matrix and decompose it to find relationships between users and songs.

In [None]:
from implicit.als import AlternatingLeastSquares
import scipy.sparse as sp

# User-song interaction matrix
user_song_matrix = sp.coo_matrix((df['play_count'], (df['user_id'], df['song_id'])))

model = AlternatingLeastSquares(factors=50, regularization=0.1, iterations=30)
model.fit(user_song_matrix)

# Recommend songs to a user
recommendations = model.recommend(user_id, user_song_matrix[user_id], N=10)
