# Predictions for Spotify

## Extraemos la data

In [82]:
import pandas as pd

In [83]:
# Read in the data
userinput_df = pd.read_csv('data/input.csv', index_col=0)
candidatos_df = pd.read_csv('data/user_candidatos.csv', index_col=0)

In [84]:
print(userinput_df.shape)
userinput_df.head()

(6, 13)


Unnamed: 0,id,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence
Bye Bye,2FSYfKAApea3U42phhBOIc,0.00168,0.634,349507,0.84,0.0237,6,0.207,-5.933,1,0.0351,97.998,0.393
La Pachanga,1x6bkwDyWIOZNFY5blRSs1,0.00244,0.681,281933,0.918,0.000535,9,0.119,-4.83,0,0.0508,105.031,0.551
Auto Rojo,5PSCWHpXi8I45NXURHyhBA,0.158,0.764,306000,0.82,0.0,1,0.0939,-6.073,1,0.0441,104.527,0.688
Mojada,5N5sbaoN8UvXw8ngNR9iUk,0.328,0.666,367960,0.859,0.0,7,0.322,-5.517,1,0.0427,120.701,0.198
Fondo Profundo,2g229Q3Relqxkj5CDnTjeE,0.0673,0.819,224973,0.954,5e-05,0,0.0784,-6.057,1,0.0579,107.98,0.559


In [85]:
print(candidatos_df.shape)
candidatos_df.head()

(153, 13)


Unnamed: 0,id,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence
Estoy a la Puerta y Llamo,4bjsp9L4sWREYUkZU8nc6e,0.935,0.493,198240,0.112,0.000558,10,0.163,-14.419,0,0.0345,76.48,0.243
Nada Te Turbe,7GPBx7jwkMSnlUS6Qqudlp,0.859,0.523,216054,0.242,7.2e-05,0,0.105,-9.303,1,0.0269,103.928,0.319
Entraré,1koXbetRP1PgmDRqU4DFPb,0.944,0.426,262373,0.0559,0.0,7,0.0633,-18.357,1,0.03,71.843,0.116
Someone You Loved,7qEHsqek33rTcFNT9PFqLf,0.751,0.501,182161,0.405,0.0,1,0.105,-5.679,1,0.0319,109.891,0.446
Before You Go,2gMXnyrvIjhVBUZwvLZDMP,0.604,0.459,215107,0.575,0.0,3,0.0885,-4.858,1,0.0573,111.881,0.183


## Sistema de recomendación

![el sistema de recomendación](./sistema_recomendacion.png)

Veamos en detalle el sistema de recomendación:

![el sistema de recomendación en detalle](./sistema_recomendacion_detalle.png)


#### El filtrado basado en contenido

Permite cuantificar qué tan similar es un ítem de `candidatos_df` a un ítem de `top20_df`.

Una forma de hacer esta comparación es usando la similitud del coseno:

![vectores de características](./vectores_caracteristicas.png)

![la similitud del coseno](./similitud_coseno.png)

Calcularemos este similitud entre cada user_input y cada una de las pistas candidatas (matriz de 20 x n_pistas_candidatas)

In [86]:
# Extraer sólo los features en formato numpy array
userinput_mtx = userinput_df.iloc[:,1:].values  # "1:" solo toma variables numéricas
candidatos_mtx = candidatos_df.iloc[:,1:].values

In [87]:
candidatos_mtx

array([[9.35000e-01, 4.93000e-01, 1.98240e+05, ..., 3.45000e-02,
        7.64800e+01, 2.43000e-01],
       [8.59000e-01, 5.23000e-01, 2.16054e+05, ..., 2.69000e-02,
        1.03928e+02, 3.19000e-01],
       [9.44000e-01, 4.26000e-01, 2.62373e+05, ..., 3.00000e-02,
        7.18430e+01, 1.16000e-01],
       ...,
       [6.41000e-01, 2.76000e-01, 2.37467e+05, ..., 3.77000e-02,
        7.79860e+01, 2.85000e-01],
       [5.68000e-02, 5.50000e-01, 1.94573e+05, ..., 4.50000e-02,
        1.07069e+02, 5.47000e-01],
       [4.38000e-02, 6.84000e-01, 2.71707e+05, ..., 4.60000e-02,
        9.99180e+01, 6.78000e-01]])

In [88]:
from sklearn.preprocessing import StandardScaler

# Estandarizar cada columna de features: mu = 0, sigma = 1
# pues cada característica tiene una escala diferente
scaler = StandardScaler()
user_scaled = scaler.fit_transform(userinput_mtx)
cand_scaled = scaler.fit_transform(candidatos_mtx)

In [89]:
print(user_scaled.mean(axis=0))
print(user_scaled.std(axis=0))

[ 3.70074342e-17  1.48029737e-15 -7.40148683e-17 -1.99840144e-15
 -1.38777878e-17 -7.40148683e-17 -2.40548322e-16 -1.62832710e-15
  7.40148683e-17  1.66533454e-16  3.70074342e-16  3.70074342e-17]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [90]:
import numpy as np

# Normalizar cada vector de características (es decir por filas)

# Magnitudes de cada vector (o pista)
user_norm = np.sqrt((user_scaled*user_scaled).sum(axis=1))
cand_norm = np.sqrt((cand_scaled*cand_scaled).sum(axis=1))

# Normalización
nuser = user_scaled.shape[0]
ncand = cand_scaled.shape[0]
user = user_scaled/user_norm.reshape(nuser,1)
cand = cand_scaled/cand_norm.reshape(ncand,1)

print(np.sqrt((user*user).sum(axis=1)))
print(np.sqrt((cand*cand).sum(axis=1)))


[1. 1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [91]:
from sklearn.metrics.pairwise import linear_kernel

# Calcular similitudes del coseno entre cada top-20 y cada
# una de las pistas candidatas
cos_sim = linear_kernel(user,cand)
cos_sim.shape

(6, 153)

In [92]:
# Ejemplo: ¿qué tanto se parece una pista  user-input a una candidata?
print(cos_sim[5,120])   
print(cos_sim[3,24])

0.3207066778793185
-0.4542341494079558


In [93]:
# Obtener candidatos para una pista dada

# Dada una pista del user-input (pos = 0, 1, ..., 6) extraer "ncands" candidatos, usando
# "cos_sim" y siempre y cuando superen un umbral de similitud

def obtener_candidatos(pos, cos_sim, ncands, umbral = 0.8):
    # Obtener todas las pistas candidatas por encima de umbral
    idx = np.where(cos_sim[pos,:]>=umbral)[0] # ejm. idx: [27, 82, 135]
    
    # Y organizarlas de forma descendente (por similitudes de mayor a menor)
    idx = idx[np.argsort(cos_sim[pos,idx])[::-1]] # [::-1] porque por defecto argsort organiza de manera ascendente

    # Si hay más de "ncands", retornar únicamente un total de "ncands"
    if len(idx) >= ncands:
        cands = idx[0:ncands]
    else:
        cands = idx
  
    return cands

In [94]:
# Ejemplo de uso
# Todo: Cuantas canciones necesitará el usuario
# para que el sistema le recomiende una canción
# que le guste?

for i in range(6):
    cands = obtener_candidatos(i, cos_sim, 5, 0.6)
    print(f'{i} ==> pistas candidatas: {cands}, similitudes: {cos_sim[i,cands]}')

0 ==> pistas candidatas: [122], similitudes: [0.65254885]
1 ==> pistas candidatas: [ 95  85  97  45 119], similitudes: [0.77909773 0.70347501 0.6554713  0.64904769 0.64530965]
2 ==> pistas candidatas: [ 82  30  49 145  12], similitudes: [0.79828689 0.77597178 0.72431469 0.63320549 0.62669585]
3 ==> pistas candidatas: [104  88], similitudes: [0.73139861 0.61415946]
4 ==> pistas candidatas: [ 73  54 123  92], similitudes: [0.72915057 0.61911345 0.61633274 0.6038117 ]
5 ==> pistas candidatas: [67], similitudes: [0.79351395]


In [95]:
# Para crear la playlist se requieren únicamente los ids
ids_user = []
ids_playlist = []

for i in range(userinput_df.shape[0]):
    print(userinput_df.index[i])   # Nombre de la pista en el top-20
    ids_user.append(userinput_df['id'][i])
    
    # Obtener listado de candidatos para esta pista
    cands = obtener_candidatos(i, cos_sim, 5, umbral=0.6)
    
    # Si hay pistas relacionadas obtener los ids correspondientes
    # e imprimir en pantalla
    if len(cands)==0:
        print('     ***No se encontraron pistas relacionadas***')
    else:
        # Obtener los ids correspondientes e imprimir en pantalla
        for j in cands:
            id_cand = candidatos_df['id'][j]
            ids_playlist.append(id_cand)
            
            # E imprimir en pantalla el candidato
            print(f'   {candidatos_df.index[j]}')

Bye Bye
   Ojos Claros, Labios Rosas
La Pachanga
   Marta tiene un marcapasos
   La Cintura
   La Pachanga
   Andas En Mi Cabeza
   The Thrill
Auto Rojo
   Propuesta Indecente
   Mor
   Lo Que Construimos
   Luz De Dia
   It Ain't Me (with Selena Gomez)
Mojada
   La parte de adelante
   Lobo Hombre en París - Versión Sinfónico [En Vivo]
Fondo Profundo
   Sweet & Sour (feat. Lauv & Tyga)
   Roast Yourself Challenge
   La Botella
   Despertar
Oye Mi Amor
   Oye Mi Amor


In [96]:
# Eliminar candidatos que ya están en el top-20
ids_playlist_dep = [x for x in ids_playlist if x not in ids_user]

# Y eliminar posibles repeticiones
ids_playlist_dep = list(set(ids_playlist_dep))
ids_user = list(set(ids_user))

In [97]:
ids_user.extend(ids_playlist_dep)

In [98]:
ids_user

['2g229Q3Relqxkj5CDnTjeE',
 '2FSYfKAApea3U42phhBOIc',
 '5N5sbaoN8UvXw8ngNR9iUk',
 '5EJ2THuhAapEIeQOtXUQ0x',
 '5PSCWHpXi8I45NXURHyhBA',
 '1x6bkwDyWIOZNFY5blRSs1',
 '56oGoEjA9eTZYgsttEFKY3',
 '6VUJQqyeQhAZHkQbSDqLtI',
 '0fHTVPRRKmWaRuIwrsNTMp',
 '5wWxUdgn2OugIOvLJOdreH',
 '23WI5V2eD4EyGKxSl7Pyeq',
 '3eR23VReFzcdmS7TYCrhCe',
 '1mlnmVqp3mnCIFwwk7uH8C',
 '1XvfncS1t4BNkh37klHeqj',
 '5PycBIeabfvX3n9ILG7Vrv',
 '0HGoUFjnLX2YuFXxcopZdR',
 '29GrGzUHri2cpt1v2TTimI',
 '3dtGHZyEGmKSmQoAPcbvyb',
 '3ZWhRS2EJVL5B0IEzkcvZa',
 '5VZHMNKKYRul576P9Hqj6d',
 '5NoE12bRYiehlnDpOcI7A6',
 '6TqXieeBcZZHyaO14hQpKx']

## Crear la playlist en spotipy

In [99]:
import spotipy
from spotipy.oauth2 import SpotifyOAuth

scope = "playlist-modify-private"

sp = spotipy.Spotify(auth_manager=SpotifyOAuth(scope=scope))

In [100]:
# Crear la playlist en spotify
me = sp.me()
username = me['id']

pl = sp.user_playlist_create(user=username, 
                             name='DJ Spotify',
                             description='Playlist creada por el DJ Spotify',
                             public=False)

In [101]:
sp.playlist_add_items(pl['id'], ids_user)

{'snapshot_id': 'Miw2YjIxNzI2ZWZlZTBjNGI1NTZjYzFhOTNkMzVkMjg1NDU3ZjMyNjNm'}