In [48]:
import pickle
import json
import pandas as pd
import numpy as np
import logging
from spotipy.exceptions import SpotifyException

#-----------------------------------------------#

import requests
import threading
import time
import os
from dotenv import load_dotenv

#-----------------------------------------------#

from spotipy.oauth2 import SpotifyOAuth
import spotipy
import spotipy.util as util

#-----------------------------------------------#

from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import linear_kernel

In [49]:
load_dotenv()

True

In [56]:
'''Setting up a directory for caching Spotify data and initializig Spotify API credentials and user details using environment variables. 
It prepares for connecting to the Spotify API by defining the cache directory, client ID, client secret, redirect URI, and username.'''

CACHE_DIR = './spotify_data_cache'
cid = os.getenv('SPOTIPY_CLIENT_ID')
secret = os.getenv('SPOTIPY_CLIENT_SECRET')
redirect_uri = 'http://localhost:8080'
username = os.getenv('USERNAME')

def get_user_input():
    client_id = input("Enter your Spotify Client ID: ")
    client_secret = input("Enter your Spotify Client Secret: ")
    username = input("Enter your Spotify Username: ")
    return client_id, client_secret, username

client_id, client_secret, username = get_user_input()


In [57]:
''' This code initializes Spotify API authorization with specific scopes for reading user's top tracks and modifying playlists'''

scope = 'user-top-read, playlist-modify-public, playlist-modify-private'
auth_manager = SpotifyOAuth(client_id=cid, client_secret=secret, redirect_uri=redirect_uri, scope=scope, username=username)
sp = spotipy.Spotify(auth_manager=auth_manager, requests_timeout=10)

In [58]:
import json
import os

def get_top_tracks(sp, user_id, time_range='short_term', limit=20, CACHE_DIR='./spotify_data_cache'):
    '''
    Retrieves a Spotify user's top tracks, using a cache file for efficient data access.
    Manages caching for different users separately.
    '''
    CACHE_FILE = 'all_top_tracks.json'
    cache_file_path = os.path.join(CACHE_DIR, CACHE_FILE)
    os.makedirs(CACHE_DIR, exist_ok=True)

    # Cargar la caché existente, si está disponible
    if os.path.exists(cache_file_path):
        with open(cache_file_path, 'r') as cache_file:
            all_top_tracks = json.load(cache_file)
    else:
        all_top_tracks = {}

    # Comprobar si los datos del usuario ya están en caché
    if user_id in all_top_tracks and time_range in all_top_tracks[user_id]:
        return all_top_tracks[user_id][time_range]

    # Obtener los top tracks y actualizar la caché
    top_tracks = sp.current_user_top_tracks(time_range=time_range, limit=limit)
    all_top_tracks.setdefault(user_id, {})[time_range] = top_tracks

    with open(cache_file_path, 'w') as cache_file:
        json.dump(all_top_tracks, cache_file)

    return top_tracks

top_tracks = get_top_tracks(sp, username)


In [59]:
for i, item in enumerate(top_tracks['items']):
    print(i+1, item['name'], '//', item['artists'][0]['name'])

1 Si No Estás // iñigo quintero
2 QLONA // KAROL G
3 Fire On Fire // Sam Smith
4 MI EX TENÍA RAZÓN // KAROL G
5 Tamagotchi // GUNTTER
6 LALA // Myke Towers
7 Sin Tiempo Para Bailar // iñigo quintero
8 greedy // Tate McRae
9 Hey Mor // Ozuna
10 AMARGURA // KAROL G
11 In The Stars // Benson Boone
12 PERRO NEGRO // Bad Bunny
13 Según Quién // Maluma
14 WANDA // Quevedo
15 Columbia // Quevedo
16 Sobredosis // iñigo quintero
17 Déjala Que Vuelva (feat. Manuel Turizo) // Piso 21
18 BESO // ROSALÍA
19 QLONA // KAROL G
20 Hasta Que Dios Diga // Anuel AA


In [60]:
def create_tracks_dataframe(sp, top_tracks):
    '''
    This function processes a list of Spotify track data, extracting each track's ID, 
    name, and audio features, and then compiles this information into a Pandas DataFrame. 
    The DataFrame is indexed by track names for easy reference and analysis.
    '''

    tracks = top_tracks['items']
    track_ids = []  
    track_names = []
    features = []

    for track in tracks:
        track_id = track['id']
        track_name = track['name']
        audio_features = sp.audio_features(track_id)
        
        # Check if audio_features is not None
        if audio_features and len(audio_features) > 0:
            audio_features = audio_features[0]
        else:
            audio_features = None

        track_ids.append(track_id)
        track_names.append(track_name)
        features.append(audio_features)

    top_tracks_df = pd.DataFrame(features, index=track_names)
    
    return top_tracks_df, track_ids

In [61]:
'''Calling the previues function to create the new dataframe'''

top_tracks_df, track_ids = create_tracks_dataframe(sp, top_tracks)

In [62]:
def cleaning_df(track):
    
    '''The function cleaning_df takes a DataFrame track as input and returns a cleaned version, 
    top_tracks_clean, which contains only selected columns.'''
    
    top_tracks_clean = track[['id', 'danceability',	'energy',	'key',	'loudness',	'mode',	'speechiness',	'acousticness',	'instrumentalness',	'liveness',	'valence',	'tempo', 'duration_ms']]
    return top_tracks_clean

In [63]:
'''Calling the cleaning function to apply to the dataset'''

top_tracks_clean = cleaning_df(top_tracks_df)

In [64]:
def getting_id_artists(df):
    '''
    This function processes a DataFrame that contain Spotify track data, 
    and extracts the IDs of artists from its 'items' column. It prints the IDs and names 
    of the artists, and returns a list of these artist IDs.
    '''

    ids_artists = []
    print('|| Artists in my top 20: ||')
    print('===========================')

    for item in df['items']:
        artist_id = item['artists'][0]['id']
        artist_name = item['artists'][0]['name']
        print(f'{artist_id}: {artist_name}')
        ids_artists.append(artist_id)

    return ids_artists

In [65]:
ids_artists = getting_id_artists(top_tracks)

|| Artists in my top 20: ||
0jbo7KFNMiIkfBR6ih0yhm: iñigo quintero
790FomKkXshlbRYZFtlgla: KAROL G
2wY79sveU1sp5g7SokKOiI: Sam Smith
790FomKkXshlbRYZFtlgla: KAROL G
3psizJPIbIEEctInvdWSZk: GUNTTER
7iK8PXO48WeuP03g8YR51W: Myke Towers
0jbo7KFNMiIkfBR6ih0yhm: iñigo quintero
45dkTj5sMRSjrmBSBeiHym: Tate McRae
1i8SpTcr7yvPOmcqrbnVXY: Ozuna
790FomKkXshlbRYZFtlgla: KAROL G
22wbnEMDvgVIAGdFeek6ET: Benson Boone
4q3ewBCX7sLwd24euuV69X: Bad Bunny
1r4hJ1h58CWwUQe3MxPuau: Maluma
52iwsT98xCoGgiGntTiR7K: Quevedo
52iwsT98xCoGgiGntTiR7K: Quevedo
0jbo7KFNMiIkfBR6ih0yhm: iñigo quintero
4bw2Am3p9ji3mYsXNXtQcd: Piso 21
7ltDVBr6mKbRvohxheJ9h1: ROSALÍA
790FomKkXshlbRYZFtlgla: KAROL G
2R21vXR83lH98kGeO99Y66: Anuel AA


In [66]:
""" This code converts the ids_artists list to a set to remove any repeated IDs and then back to a list, displaying the count of unique artists.
"""

ids_artists = list(set(ids_artists))
print(f'Number of artists (without repetitions): {len(ids_artists)}')

Number of artists (without repetitions): 14


In [67]:
def get_similar_artists(sp, ids_artists, CACHE_DIR, CACHE_FILE='all_similar_artists.pickle'):
    '''
    This function takes a Spotify client 'sp' and a list of artist IDs 'ids_artists', finds similar 
    artists for each artist in the list, and prints their IDs and names. It utilizes a single .pickle file 
    caching system to efficiently store and access this data for all artists. The function returns a list of the IDs 
    of these similar artists.
    '''

    print('Similar Artists:')
    print('=====================')

    cache_file = os.path.join(CACHE_DIR, CACHE_FILE)

    if os.path.exists(cache_file):
        with open(cache_file, 'rb') as file:
            all_similar_artists = pickle.load(file)
    else:
        all_similar_artists = {}

    ids_similar_artists = []

    for artist_id in ids_artists:
        if artist_id not in all_similar_artists:
            artists = sp.artist_related_artists(artist_id)['artists']
            all_similar_artists[artist_id] = artists
            with open(cache_file, 'wb') as file:
                pickle.dump(all_similar_artists, file)
        else:
            artists = all_similar_artists[artist_id]

        for item in artists:
            similar_artist_id = item['id']
            similar_artist_name = item['name']
            print(f'{similar_artist_id}: {similar_artist_name}')
            ids_similar_artists.append(similar_artist_id)
        
        time.sleep(1)
    return ids_similar_artists

In [68]:
ids_artists

['7iK8PXO48WeuP03g8YR51W',
 '3psizJPIbIEEctInvdWSZk',
 '22wbnEMDvgVIAGdFeek6ET',
 '1i8SpTcr7yvPOmcqrbnVXY',
 '4q3ewBCX7sLwd24euuV69X',
 '2R21vXR83lH98kGeO99Y66',
 '790FomKkXshlbRYZFtlgla',
 '0jbo7KFNMiIkfBR6ih0yhm',
 '45dkTj5sMRSjrmBSBeiHym',
 '7ltDVBr6mKbRvohxheJ9h1',
 '4bw2Am3p9ji3mYsXNXtQcd',
 '2wY79sveU1sp5g7SokKOiI',
 '1r4hJ1h58CWwUQe3MxPuau',
 '52iwsT98xCoGgiGntTiR7K']

In [69]:
'''Calling the get_similar_artists function to create a new list'''

ids_similar_artists = get_similar_artists(sp, ids_artists, CACHE_DIR)

Similar Artists:
6nVcHLIgY5pE2YCl8ubca1: Jhayco
0Q8NcsJwoCbZOHHW63su5S: Mora
5XJDexmWFLWOkjOEjOVX3e: Eladio Carrion
77ziqFxp5gaInVrF2lj4ht: Sech
6w9ToX5slZ4uIdmD17hJ3c: Bryant Myers
52iwsT98xCoGgiGntTiR7K: Quevedo
0KPX4Ucy9dk82uj4GpKesn: Dalex
1mcTU81TzQhprhouKaTkpq: Rauw Alejandro
2cPqdH7XMvwaBJEVjheH8g: Jay Wheeler
2O8vbr4RYPpk6MRA4fio7u: Saiko
2LRoIwlKmHjgvigdNGBHNo: Feid
14zUHaJZo1mnYtn6IBRaRP: Justin Quiles
4SsVbpTthjScTS7U2hmr1X: Arcángel
47MpMsUfWtgyIIBEFOr4FE: Lunay
2DspEsT7UXGKd2VaaedgG4: Alex Rose
1TtXnWcUs0FCkaZDPGYHdf: Darell
1pf0MPKfKdvS8J779mS1Ay: Miky Woodz
3RtNN1VnooWEn3KQk03DUL: Noriel
0GM7qgcRCORpGnfcN2tCiB: Tainy
12vb80Km0Ew53ABfJOepVz: Ñengo Flow
06n23qw408xNUMGETWsmaL: HAMLIT SHORTY
3gvuPnQphwrMjq23LTUAFN: Sticks House
2LCB3Ikxw5T3vnigAVaMXy: Little Brian
3Wck1E7AcfrcW3beEPtPaU: Erreflexx
2C1Pj5TMq8QjqbwQ4tmDtO: Punto40
1PANUPmT5KUDGs3Xgkhw3T: Martinwhite
4kcrrhbTk7KqY1KKeonwKA: Smi-Lee
4tXFiCOQKSg5avRjHnYJAb: Profeta Yao Yao
7DXregrznS25AM30UY9sUU: King Savagge
5n

In [70]:
ids_artists.extend(ids_similar_artists)

In [71]:
ids_artists = list(set(ids_artists))
print(f'Number of artists (without repetitions): {len(ids_artists)}')

Number of artists (without repetitions): 187


In [72]:
def get_new_releases(sp, limit=20):
    '''
    This function uses a Spotify client 'sp' to fetch new releases. It uses a .pickle file in the 
    'spotify_data_cache' directory as a cache to store and efficiently access the data. 
    It returns the album data with a default limit that can be adjusted.
    '''

    cache_file = os.path.join(CACHE_DIR, 'new_releases_cache.pickle')

    if os.path.exists(cache_file):
        try:
            with open(cache_file, 'rb') as file:
                new_releases = pickle.load(file)
            return new_releases
        except Exception as e:
            print(f"Error reading from the cache: {e}")

    try:
        new_releases = sp.new_releases(limit=limit)['albums']
        with open(cache_file, 'wb') as file:
            pickle.dump(new_releases, file)
        return new_releases
    except Exception as e:
        print(f"Error fetching new releases: {e}")
        return None

new_releases_data = get_new_releases(sp)


In [73]:
def getting_id_artists(df):

    print('')
    print('Artists with new releases')
    print('=====================')
    for item in df['items']:
        artist_id = item['artists'][0]['id']
        artist_name = item['artists'][0]['name']
        album_name = item['name']  
        release_date = item['release_date']
        print(f'{artist_id}: {artist_name} - // {album_name}, {release_date}')
        ids_artists.append(artist_id)
    return ids_artists

In [74]:
ids_artists = getting_id_artists(new_releases_data)


Artists with new releases
1Mw40k757jZuiL0NIJpdO5: GULEED - // Cuando Menos Lo Espera, 2023-07-07
6k8mwkKJKKjBILo7ypBspl: Ana Mena - // bellodrama, 2023-03-24
7iK8PXO48WeuP03g8YR51W: Myke Towers - // LA VIDA ES UNA, 2023-03-23
5XJDexmWFLWOkjOEjOVX3e: Eladio Carrion - // 3MEN2 KBRN, 2023-03-17
790FomKkXshlbRYZFtlgla: KAROL G - // MAÑANA SERÁ BONITO, 2023-02-24
2auC28zjQyVTsiZKNgPRGs: RM - // Indigo, 2022-12-02
6KImCVD70vtIoJWnq6nGn3: Harry Styles - // Harry's House, 2022-05-20
4q3ewBCX7sLwd24euuV69X: Bad Bunny - // Un Verano Sin Ti, 2022-05-06
2R21vXR83lH98kGeO99Y66: Anuel AA - // Las Leyendas Nunca Mueren, 2021-11-26
4dpARuHxo51G3z768sgnrY: Adele - // 30, 2021-11-19
6eUKZXaKkcviH0Ku9w2n3V: Ed Sheeran - // =, 2021-10-29
4gzpq5DPGxSnKTe4SA8HAU: Coldplay - // Music Of The Spheres, 2021-10-15
53KwLdlmrlCelAZMaLVZqU: James Blake - // Friends That Break Your Heart, 2021-10-08
4MzJMcHQBl9SIYSjwWn8QW: Spiritbox - // Eternal Blue, 2021-09-17
1vyhD5VmyZ7KMfW5gqLgo5: J Balvin - // JOSE, 2021-09-1

In [75]:
ids_artists = list(set(ids_artists))
print(f'Number of artists (without repetitions): {len(ids_artists)}')

Number of artists (without repetitions): 199


In [78]:
def get_album_ids(sp, ids_artists, CACHE_DIR='./spotify_data_cache', CACHE_FILE='artist_albums_cache.pickle'):
    '''
    Retrieves album IDs for each artist in 'ids_artists' using Spotify client 'sp'.
    Utilizes caching and a 3-second delay to avoid rate limiting.
    Returns a list of album IDs.
    '''

    os.makedirs(CACHE_DIR, exist_ok=True)
    cache_file_path = os.path.join(CACHE_DIR, CACHE_FILE)

    # Load cache if exists
    if os.path.exists(cache_file_path):
        with open(cache_file_path, 'rb') as file:
            cached_data = pickle.load(file)
    else:
        cached_data = {}

    id_albums = []
    nartists = len(ids_artists)

    for i, id_artist in enumerate(ids_artists):
        print(f'Processing artist {i+1} of {nartists}...')

        if id_artist in cached_data:
            id_albums.extend(cached_data[id_artist])
        else:
            try:
                albums = sp.artist_albums(id_artist, limit=1)  # to avoid having a huge list
                album_ids = [album['id'] for album in albums['items']]
                id_albums.extend(album_ids)
                cached_data[id_artist] = album_ids

                time.sleep(1)  # Adds 3-seconds delay

                # Save updated cache when new data is retrieved
                with open(cache_file_path, 'wb') as file:
                    pickle.dump(cached_data, file)

            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 429:
                    retry_after = int(e.response.headers.get('Retry-After', 3))
                    print(f"Rate limit exceeded, waiting for {retry_after} seconds.")
                    time.sleep(retry_after)
                    continue
                else:
                    print(f"Error processing artist {id_artist}: {e}")

    print('Done!')
    return id_albums


In [79]:
id_albums = get_album_ids(sp, ids_artists)

Processing artist 1 of 199...
Processing artist 2 of 199...
Processing artist 3 of 199...
Processing artist 4 of 199...
Processing artist 5 of 199...
Processing artist 6 of 199...
Processing artist 7 of 199...
Processing artist 8 of 199...
Processing artist 9 of 199...
Processing artist 10 of 199...
Processing artist 11 of 199...
Processing artist 12 of 199...
Processing artist 13 of 199...
Processing artist 14 of 199...
Processing artist 15 of 199...
Processing artist 16 of 199...
Processing artist 17 of 199...
Processing artist 18 of 199...
Processing artist 19 of 199...
Processing artist 20 of 199...
Processing artist 21 of 199...
Processing artist 22 of 199...
Processing artist 23 of 199...
Processing artist 24 of 199...
Processing artist 25 of 199...
Processing artist 26 of 199...
Processing artist 27 of 199...
Processing artist 28 of 199...
Processing artist 29 of 199...
Processing artist 30 of 199...
Processing artist 31 of 199...
Processing artist 32 of 199...
Processing artist

In [81]:
def get_track_ids(sp, id_albums, CACHE_DIR='./spotify_data_cache', CACHE_FILE='album_tracks_cache.pickle'):
    '''
    Retrieves track IDs for each album in 'id_albums' using Spotify client 'sp'.
    Utilizes a pickle file for caching. Adds new songs to cache without overwriting existing data.
    '''

    os.makedirs(CACHE_DIR, exist_ok=True)
    cache_file_path = os.path.join(CACHE_DIR, CACHE_FILE)
    cache_updated = False

    # Load cache if available
    if os.path.exists(cache_file_path):
        with open(cache_file_path, 'rb') as file:
            cached_data = pickle.load(file)
    else:
        cached_data = {}

    id_tracks = []
    for i, id_album in enumerate(id_albums):
        print(f'Processing album {i+1} of {len(id_albums)}...')
        cached_album_tracks = cached_data.get(id_album, [])

        try:
            album_tracks = sp.album_tracks(id_album, limit=3)['items']
            new_track_ids = [track['id'] for track in album_tracks if track['id'] not in cached_album_tracks]

            if new_track_ids:
                cached_data[id_album] = cached_album_tracks + new_track_ids
                cache_updated = True

            id_tracks.extend(cached_album_tracks + new_track_ids)
            time.sleep(1)  # Adding delay

        except requests.exceptions.HTTPError as e:
            handle_http_error(e)

    # Save updated cache
    if cache_updated:
        with open(cache_file_path, 'wb') as file:
            pickle.dump(cached_data, file)

    print(f'Done! Total number of pre-candidate tracks: {len(id_tracks)}')
    return id_tracks

def handle_http_error(e):
    if e.response.status_code == 429:
        retry_after = int(e.response.headers.get('Retry-After', 2))
        print(f"Rate limit exceeded, waiting for {retry_after} seconds.")
        time.sleep(retry_after)
    else:
        print(f"HTTP Error: {e}")

In [82]:
id_tracks = get_track_ids(sp, id_albums)

Processing album 1 of 398...
Processing album 2 of 398...
Processing album 3 of 398...
Processing album 4 of 398...
Processing album 5 of 398...
Processing album 6 of 398...
Processing album 7 of 398...
Processing album 8 of 398...
Processing album 9 of 398...
Processing album 10 of 398...
Processing album 11 of 398...
Processing album 12 of 398...
Processing album 13 of 398...
Processing album 14 of 398...
Processing album 15 of 398...
Processing album 16 of 398...
Processing album 17 of 398...
Processing album 18 of 398...
Processing album 19 of 398...
Processing album 20 of 398...
Processing album 21 of 398...
Processing album 22 of 398...
Processing album 23 of 398...
Processing album 24 of 398...
Processing album 25 of 398...
Processing album 26 of 398...
Processing album 27 of 398...
Processing album 28 of 398...
Processing album 29 of 398...
Processing album 30 of 398...
Processing album 31 of 398...
Processing album 32 of 398...
Processing album 33 of 398...
Processing album 34

In [84]:
def saving_function(ids, csv_file_path):
    '''
    Appends new track IDs to a CSV file, ensuring no duplicates.
    '''
    try:
        # Check if the CSV file exists and read it
        existing_data = pd.read_csv(csv_file_path)
        existing_ids = set(existing_data['track_id'].tolist())
    except FileNotFoundError:
        # File does not exist, create an empty set
        existing_ids = set()

    # Convert id_tracks to a set for efficient comparison
    new_ids_set = set(ids)

    # Find new IDs that are not in the existing data
    new_ids_to_append = new_ids_set - existing_ids

    if new_ids_to_append:
        # Append new data and save
        new_data = pd.DataFrame({'track_id': list(new_ids_to_append)})
        new_data.to_csv(csv_file_path, mode='a', header=not existing_ids, index=False)
        print(f"Appended new data to the CSV file at {csv_file_path}")
    else:
        print("No new data to append.")

name= 'id_tracks'
saving_function(id_tracks, f'./data/{name}.csv')


Appended new data to the CSV file at ./data/id_tracks.csv


In [105]:
import os
import pickle
import pandas as pd
import time

def get_track_details_and_features(sp, id_tracks, CACHE_DIR='./spotify_data_cache', CACHE_FILE='track_features_cache.pickle'):
    '''
    Retrieves track details and audio features for each track in 'id_tracks' using Spotify client 'sp'.
    Utilizes a pickle file for caching and includes error handling for rate limits.
    Returns a DataFrame with track names and features.
    '''

    os.makedirs(CACHE_DIR, exist_ok=True)
    cache_file_path = os.path.join(CACHE_DIR, CACHE_FILE)

    if os.path.exists(cache_file_path):
        with open(cache_file_path, 'rb') as file:
            cached_data = pickle.load(file)
    else:
        cached_data = {}

    track_names = []
    features = []
    ntracks = len(id_tracks)

    for i, track_id in enumerate(id_tracks):
        # print(f'Processing track {i+1} of {ntracks}...')

        if track_id not in cached_data:
            track_details = sp.track(track_id)
            track_name = track_details['name']
            print(track_name)
            audio_features = sp.audio_features(track_id)

            if audio_features[0] is not None:
                cached_data[track_id] = {'name': track_name, 'features': audio_features[0]}
                with open(cache_file_path, 'wb') as file:
                    pickle.dump(cached_data, file)

            time.sleep(1)  # Adds a 1-second delay to avoid rate limits

        if track_id in cached_data:
            track_names.append(cached_data[track_id]['name'])
            features.append(cached_data[track_id]['features'])

    print('Done!')
    candidates_df = pd.DataFrame(features, index=track_names)
    
    candidates_clean = cleaning_df(candidates_df)

    csv_file_path = './spotify_data_cache/candidates.csv'
    if os.path.exists(csv_file_path):
        existing_data = pd.read_csv(csv_file_path)
        updated_data = pd.concat([existing_data, candidates_clean]).drop_duplicates()
    else:
        updated_data = candidates_clean

    updated_data.to_csv(csv_file_path, index=False)
    print("CSV file updated.")

    return updated_data


In [106]:
updated_data  = get_track_details_and_features(sp, id_tracks)

Processing track 1 of 1017...
Processing track 2 of 1017...
Processing track 3 of 1017...
Processing track 4 of 1017...
Processing track 5 of 1017...
Processing track 6 of 1017...
Processing track 7 of 1017...
Processing track 8 of 1017...
Processing track 9 of 1017...
Processing track 10 of 1017...
Processing track 11 of 1017...
Processing track 12 of 1017...
Processing track 13 of 1017...
Processing track 14 of 1017...
Processing track 15 of 1017...
Processing track 16 of 1017...
Processing track 17 of 1017...
Processing track 18 of 1017...
Processing track 19 of 1017...
Processing track 20 of 1017...
Processing track 21 of 1017...
Processing track 22 of 1017...
Processing track 23 of 1017...
Processing track 24 of 1017...
Processing track 25 of 1017...
Processing track 26 of 1017...
Processing track 27 of 1017...
Processing track 28 of 1017...
Processing track 29 of 1017...
Processing track 30 of 1017...
Processing track 31 of 1017...
Processing track 32 of 1017...
Processing track 

ERROR:spotipy.client:Max Retries reached


SpotifyException: http status: 429, code:-1 - /v1/audio-features/?ids=0EsxDFcoqQHBUNAQhzWyHX:
 Max Retries, reason: too many 429 error responses

In [None]:
name = 'candidates'
saving_function(updated_data, f'./data/{name}.csv')


In [None]:
'''Calling the cleaning function to apply to the dataset'''

candidates = cleaning_df(updated_data)

In [None]:
top_tracks_clean_mtx = top_tracks_clean.iloc[:,1:].values
candidatos_mtx = candidates.iloc[:,1:].values

In [None]:
scaler = StandardScaler()
t20_scaled = scaler.fit_transform(top_tracks_clean_mtx)
can_scaled = scaler.fit_transform(candidatos_mtx)
t20_norm = np.sqrt((t20_scaled*t20_scaled).sum(axis=1))
can_norm = np.sqrt((can_scaled*can_scaled).sum(axis=1))
nt20 = t20_scaled.shape[0]
ncan = can_scaled.shape[0]
t20 = t20_scaled/t20_norm.reshape(nt20,1)
can = can_scaled/can_norm.reshape(ncan,1)
cos_sim = linear_kernel(t20,can)
cos_sim.shape

(20, 398)

In [None]:
def obtener_candidatos(pos, cos_sim, ncands, umbral = 0.5):
    
    # Obtener todas las pistas candidatas por encima de umbral
    
    idx = np.where(cos_sim[pos,:]>=umbral)[0] 
    
    # Y organizarlas de forma descendente (por similitudes de mayor a menor)
    idx = idx[np.argsort(cos_sim[pos,idx])[::-1]] # [::-1] porque por defecto argsort organiza de manera ascendente

    # Si hay más de "ncands", retornar máximo "ncands"
    if len(idx) >= ncands:
        cands = idx[0:ncands]
    else:
        cands = idx
  
    return cands

In [None]:
ids_t20 = []
ids_playlist = []

for i in range(top_tracks_clean.shape[0]):
    print(top_tracks_clean.index[i])   
    ids_t20.append(top_tracks_clean['id'][i])
    
    # Obtener listado de candidatos para esta pista
    cands = obtener_candidatos(i, cos_sim, 5, umbral=0.8)
    
    # Si hay pistas relacionadas obtener los ids correspondientes
    # e imprimir en pantalla
    if len(cands)==0:
        print('     ***No se encontraron pistas relacionadas***')
    else:
        # Obtener los ids correspondientes e imprimir en pantalla
        for j in cands:
            id_cand = candidates['id'][j]
            ids_playlist.append(id_cand)
            
            print(f'   {candidates.index[j]}')

Si No Estás
   Maldita
   Hermosa Flor - Live
   Parte de Mí
   Sin Avisar
   Hope You're Proud
QLONA
   ENIGMA
Fire On Fire
   Start
MI EX TENÍA RAZÓN
   Gladiador
Tamagotchi
     ***No se encontraron pistas relacionadas***
LALA
     ***No se encontraron pistas relacionadas***
Sin Tiempo Para Bailar
   Sin Tiempo Para Bailar
   Sin Tiempo Para Bailar
   CELEBRÉ
   Virginia Beach
   Virginia Beach
greedy
     ***No se encontraron pistas relacionadas***
Hey Mor
     ***No se encontraron pistas relacionadas***
AMARGURA
   Queriéndote
   Queriéndote
   Más Fuerte
In The Stars
   Movies
   Mil y una noches
   Música para bailar sobre el agua
   ATM
PERRO NEGRO
     ***No se encontraron pistas relacionadas***
Según Quién
   Peligrosa
   It Is What It Is - Acoustic
   Magnum
   Apretaito
WANDA
   Gladiador
Columbia
   The Path
   The Path
   ATM
   Qué Gano Olvidándote
Sobredosis
   hill that i'll die on
   Sad Forever
   MEDIA LUNA
   Horizon - from One Night In Malibu
   Intro - Prod. Jam 

  ids_t20.append(top_tracks_clean['id'][i])
  id_cand = candidates['id'][j]


In [None]:
ids_playlist_dep = [x for x in ids_playlist if x not in ids_t20]
ids_playlist_dep = list(set(ids_playlist_dep))

In [None]:
ids_playlist_dep


['6onjYVJaW2UMu3sGRzm0In',
 '3eP13S8D5m2cweMEg3ZDed',
 '69Api7PDVnQ1jQmSJlZiGC',
 '4O6jCUU0Un55TZ9FfZWSJh',
 '0Ckmi4rrdUWnlg4T9aIGCS',
 '6FH6fmlh9DbvssuEQyQEVd',
 '3AsbVi6XAEGXoKOTfhzevi',
 '3T9wo7kdEAOy1h5h7kh5dw',
 '4GoFt15vBKmQXqB3IfFzy7',
 '1iPPvZimh7slFBPE3Oe4pe',
 '3UunIIJTjCNhLkJ3HXzAAx',
 '6PLihys6LLwbCI9GRByWZ0',
 '0g8DBFUhszRFIUp6lSZIB4',
 '2XMXsM29VG71gcKokMtsK5',
 '5fDMDKHhVCpUmAvTPJrYNh',
 '37DLwjAM682PZr9a4dJ5VA',
 '3dG6YbDEyxKnIt0K8xYQSI',
 '2sY92LRATo3fwPzmDo0wwt',
 '5qf8lBMF49mfNILgF1TYTG',
 '3qPZbR73DQm1ZNrkXvuJ4B',
 '63jCeAYGLcRL21xm0jKlgn',
 '1tNpEy5ETYG6GbqHSGrmU3',
 '6YV2AI87l1n2fzqU8Dyo05',
 '68C2QbQUPIoTLDFJPZppCX',
 '7c8FD6rZJerQ1q1djZuQmA',
 '0LFxsB0fdwNIjfUtVpvAmq',
 '7DSAEUvxU8FajXtRloy8M0',
 '7iXRsjttPa62GpOcqsx1aE',
 '5Ov0CcRjy72vbMFJfyVX63',
 '1HYM42YsijT5tqgJO3PKsn',
 '7h3mCPlHYCPuSG93s7M3Xt',
 '2B4u4howohJ8kbkWcbuAUM',
 '5pdmo1nRKvuStKgtUXgJm5',
 '7JqqIi2ktnsNkYdQhKSCrz']

In [None]:
pl = sp.user_playlist_create(user='niniet1998', 
                             name='Spotipy Recommender Playlist', 
                             description="Playlist created with the recommendation system")

sp.playlist_add_items(pl['id'],ids_playlist_dep)

{'snapshot_id': 'MyxiODE1MjI3OWMzNjdiYzFmMGNmYTJjYjFkYjFmZGE3NjNkZTM1ZmQ3'}