#### 1. Importamos las librerías necesarias.

In [1]:
import sys
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pyjsonviewer

import os
from dotenv import load_dotenv
load_dotenv()

True

#### 2. Creamos nuestras credenciales para acceder a nuestra lista de reproducción de Spotify.

In [2]:
# The credentials are read from environment variables (populated from a local
# .env file by load_dotenv). Per the original note the .env file is listed in
# .gitignore, which keeps it out of version control; it does not encrypt it.
client_ID = os.environ.get('Client_ID')
client_secret = os.environ.get('Client_Secret')

In [3]:
# Client-credentials auth manager built from client_ID / client_secret.
credenciales = SpotifyClientCredentials(client_id=client_ID, client_secret=client_secret)

In [4]:
# Echo the object to confirm it was created (only its repr is displayed).
credenciales

<spotipy.oauth2.SpotifyClientCredentials at 0x7f78988e5190>

In [5]:
# Spotify API client that authenticates through the credentials manager.
sp = spotipy.Spotify(client_credentials_manager=credenciales)

#### 3. Utilizamos el link de la lista para conseguir su identificador.

In [6]:
link = "https://open.spotify.com/playlist/4PQ9xWUanbBuKwFFTc3482?si=265950455dd74ea8"

# The playlist id is the last path segment of the share link, i.e. whatever
# follows the final "/" and precedes the "?si=..." query string.
iden = link.rsplit("/", 1)[-1].partition("?")[0]
iden

'4PQ9xWUanbBuKwFFTc3482'

#### 4. Llamamos a la API para conseguir todos los datos de nuestras canciones.

In [7]:
# Spotify returns at most 100 playlist items per call, so we request page after
# page and keep every response dictionary in `all_data`.
# The loop stops when the API reports that there is no further page, instead of
# assuming a fixed number of calls (a hard-coded 8 calls silently truncates any
# playlist longer than 800 tracks and wastes a request on shorter ones).
PAGE_SIZE = 100
offset = 0
all_data = []
while True:
    page = sp.playlist_tracks(iden, limit=PAGE_SIZE, offset=offset)
    all_data.append(page)
    # "next" holds the URL of the following page, or None on the last one.
    if not page["items"] or not page.get("next"):
        break
    offset += PAGE_SIZE

In [8]:
# Open every page dictionary in an external viewer window to make it easier to read.
# NOTE(review): this presumably needs a desktop session and may block "Run All"
# until the window is closed — confirm.
pyjsonviewer.view_data(json_data=all_data)

In [9]:
# Keys available on a single playlist item (page index 1, item index 60).
all_data[1]["items"][60].keys()

dict_keys(['added_at', 'added_by', 'is_local', 'primary_color', 'track', 'video_thumbnail'])

#### 5. Comprobamos el funcionamiento de las listas de diccionarios para buscar información.

In [10]:
# when the song was added to the playlist
all_data[1]["items"][9]['added_at']

'2022-11-22T14:26:51Z'

In [11]:
# id of the user who added it
all_data[1]["items"][9]['added_by']['id']

'31p3oc6g3fcxi2hep3pg6e25aifm'

In [12]:
# Keys available on the nested track dictionary of one item.
all_data[1]["items"][9]['track'].keys()

dict_keys(['album', 'artists', 'available_markets', 'disc_number', 'duration_ms', 'episode', 'explicit', 'external_ids', 'external_urls', 'href', 'id', 'is_local', 'name', 'popularity', 'preview_url', 'track', 'track_number', 'type', 'uri'])

In [13]:
# song name
all_data[1]["items"][9]['track']['name']

'Every Breath You Take'

In [14]:
# name of the first listed artist
all_data[1]["items"][9]['track']['artists'][0]['name']

'The Police'

In [15]:
# Keys available on the nested album dictionary of one track.
all_data[1]["items"][9]['track']['album'].keys()

dict_keys(['album_type', 'artists', 'available_markets', 'external_urls', 'href', 'id', 'images', 'name', 'release_date', 'release_date_precision', 'total_tracks', 'type', 'uri'])

In [16]:
# album the song belongs to
all_data[1]["items"][9]['track']['album']['name']

'Synchronicity (Remastered 2003)'

In [17]:
# release date of the album (used here as the song's release date)
all_data[1]["items"][9]['track']['album']['release_date']

'1983-06-17'

In [18]:
# album cover: URL of the first image in the album's image list
all_data[1]["items"][9]['track']['album']['images'][0]['url']

'https://i.scdn.co/image/ab67616d0000b273c8e97cafeb2acb85b21a777e'

In [19]:
# song popularity
all_data[1]["items"][9]['track']['popularity']

87

In [20]:
# duration (the duration_ms field)
all_data[1]["items"][9]['track']['duration_ms']

253920

In [21]:
# explicit flag
all_data[1]["items"][9]['track']['explicit']

False

In [22]:
# Spotify URI that identifies the song
all_data[1]["items"][9]['track']['uri']

'spotify:track:1JSTJqkT5qHq8MDJnJbRE1'

#### 6. Creamos un bucle para iterar por toda la lista de reproducción.

In [23]:
# Flatten every playlist item into a column-oriented dictionary (one list per
# column, all lists the same length) that can be turned into a DataFrame.
basic_info = {
    "song": [],
    "artist": [],
    "album": [],
    "duration": [],
    "date_add": [],
    "date_album": [],
    "explicit": [],
    "popularity": [],
    "user": [],
    "image": [],
    "uri": [],
}

for pagina in all_data:
    for item in pagina["items"]:
        track = item["track"]
        # NOTE(review): the API can apparently return a null track for items
        # that are no longer available; skip those instead of crashing.
        if track is None:
            continue

        album = track["album"]
        portadas = album["images"]
        nombres_artistas = [artista["name"] for artista in track["artists"]]

        basic_info["song"].append(track["name"])
        basic_info["album"].append(album["name"])
        basic_info["duration"].append(track["duration_ms"])
        basic_info["date_add"].append(item["added_at"])
        basic_info["date_album"].append(album["release_date"])
        basic_info["explicit"].append(track["explicit"])
        basic_info["popularity"].append(track["popularity"])
        basic_info["user"].append(item["added_by"]["id"])
        # An album without cover art has an empty image list; store None then.
        basic_info["image"].append(portadas[0]["url"] if portadas else None)
        basic_info["uri"].append(track["uri"])

        # Unchanged contract: a plain string when the track has exactly one
        # artist, otherwise the list of all artist names.
        if len(nombres_artistas) == 1:
            basic_info["artist"].append(nombres_artistas[0])
        else:
            basic_info["artist"].append(nombres_artistas)

In [24]:
# Spot-check: song name stored at position 2.
basic_info['song'][2]

"Hard to Say I'm Sorry"

In [25]:
# Spot-check: artist stored at position 2.
basic_info['artist'][2]

'Chicago'

In [26]:
# Spot-check: id of the user stored at position 2.
basic_info['user'][2]

'laurezurich'

#### 7. Pasamos el diccionario de listas a DataFrame para trabajar con él.

In [27]:
# Sanity check: print the length of every column list; they must all match
# before the dictionary can become a DataFrame.
for columna, valores in basic_info.items():
    print(f"{columna} -- {len(valores)}")

song -- 680
artist -- 680
album -- 680
duration -- 680
date_add -- 680
date_album -- 680
explicit -- 680
popularity -- 680
user -- 680
image -- 680
uri -- 680


In [28]:
# Build the DataFrame from the column-oriented dictionary and preview the first rows.
df = pd.DataFrame(data=basic_info)
df.head(n=5)

Unnamed: 0,song,artist,album,duration,date_add,date_album,explicit,popularity,user,image,uri
0,Lobby Music (Original Soundtrack),Kahoot!,Lobby Music (Original Soundtrack),144008,2022-11-15T12:23:07Z,2013-03-05,False,53,31grj66zzxhioxdjn3nv22jvfpji,https://i.scdn.co/image/ab67616d0000b27363cd4c...,spotify:track:7IEdlE4ZwzPDxnoWFv10aj
1,No te olvides de poner el Where en el Delete From,Jorge Rubira,No te olvides de poner el Where en el Delete From,222057,2023-01-12T07:50:25Z,2021-03-14,False,8,31tlrqq55e5zdgosw6swshtio5za,https://i.scdn.co/image/ab67616d0000b273afceff...,spotify:track:7j4N1Jgf5ee7zLHtjcKpiB
2,Hard to Say I'm Sorry,Chicago,The Collection,232320,2022-11-07T10:05:48Z,2012-10-29,False,65,laurezurich,https://i.scdn.co/image/ab67616d0000b27372e167...,spotify:track:58DXqHKK7TfxUmhCq2ux4E
3,Gold on the Ceiling,The Black Keys,El Camino,224333,2022-11-07T10:07:03Z,2011-12-06,False,71,laurezurich,https://i.scdn.co/image/ab67616d0000b2736a21b9...,spotify:track:5lN1EH25gdiqT1SFALMAq1
4,Vienna,Ultravox,This Is... 1981,278226,2022-11-07T10:08:04Z,2008-08-08,False,0,laurezurich,https://i.scdn.co/image/ab67616d0000b27337aac7...,spotify:track:0cqVKNj1BubZHzp0OiI5IK


#### 8. Añadimos aspectos técnicos a nuestro DataFrame, para realizar un análisis más exhaustivo.

In [29]:
# Distinct track URIs, in order of first appearance, as a plain Python list.
uris = df["uri"].unique().tolist()

In [31]:
# Fetch the audio features of every unique track.
# audio_features accepts a list of tracks, so the URIs are sent in batches
# instead of issuing one HTTP request per song.
# Each result is wrapped in a one-element list so that `features` keeps the
# same shape as before (a list of single-item lists, i.e. one DataFrame column).
AUDIO_FEATURES_BATCH = 100  # maximum number of ids the endpoint takes per request
features = []
for inicio in range(0, len(uris), AUDIO_FEATURES_BATCH):
    lote = uris[inicio:inicio + AUDIO_FEATURES_BATCH]
    features.extend([caracteristica] for caracteristica in sp.audio_features(lote))

In [32]:
# One-column DataFrame whose single column holds each track's feature dictionary.
df_features = pd.DataFrame(features).set_axis(["caracteristicas"], axis="columns")
df_features.head()

Unnamed: 0,caracteristicas
0,"{'danceability': 0.818, 'energy': 0.693, 'key'..."
1,"{'danceability': 0.697, 'energy': 0.924, 'key'..."
2,"{'danceability': 0.543, 'energy': 0.381, 'key'..."
3,"{'danceability': 0.505, 'energy': 0.833, 'key'..."
4,"{'danceability': 0.583, 'energy': 0.614, 'key'..."


In [33]:
# Expand the feature dictionaries into one column per feature.
# The frame is built once from the list of dicts rather than with
# .apply(pd.Series), which creates a separate Series for every row.
# Entries that are not dictionaries (e.g. None) become empty rows.
# NOTE: this cell overwrites df_features, so running it a second time raises
# KeyError because the "caracteristicas" column no longer exists.
filas_caracteristicas = [
    fila if isinstance(fila, dict) else {}
    for fila in df_features["caracteristicas"]
]
df_features = pd.DataFrame(filas_caracteristicas, index=df_features.index)
df_features.head(2)

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.818,0.693,0,-15.818,0,0.117,0.0244,0.89,0.0925,0.968,119.968,audio_features,7IEdlE4ZwzPDxnoWFv10aj,spotify:track:7IEdlE4ZwzPDxnoWFv10aj,https://api.spotify.com/v1/tracks/7IEdlE4ZwzPD...,https://api.spotify.com/v1/audio-analysis/7IEd...,144009,4
1,0.697,0.924,0,-3.13,1,0.124,0.223,0.0,0.345,0.56,124.786,audio_features,7j4N1Jgf5ee7zLHtjcKpiB,spotify:track:7j4N1Jgf5ee7zLHtjcKpiB,https://api.spotify.com/v1/tracks/7j4N1Jgf5ee7...,https://api.spotify.com/v1/audio-analysis/7j4N...,222058,4


In [34]:
# Left-join the audio features onto the playlist rows, matching on the track URI,
# so every playlist row is kept.
final = df.merge(df_features, how="left", on="uri")
final.head()

Unnamed: 0,song,artist,album,duration,date_add,date_album,explicit,popularity,user,image,...,instrumentalness,liveness,valence,tempo,type,id,track_href,analysis_url,duration_ms,time_signature
0,Lobby Music (Original Soundtrack),Kahoot!,Lobby Music (Original Soundtrack),144008,2022-11-15T12:23:07Z,2013-03-05,False,53,31grj66zzxhioxdjn3nv22jvfpji,https://i.scdn.co/image/ab67616d0000b27363cd4c...,...,0.89,0.0925,0.968,119.968,audio_features,7IEdlE4ZwzPDxnoWFv10aj,https://api.spotify.com/v1/tracks/7IEdlE4ZwzPD...,https://api.spotify.com/v1/audio-analysis/7IEd...,144009,4
1,No te olvides de poner el Where en el Delete From,Jorge Rubira,No te olvides de poner el Where en el Delete From,222057,2023-01-12T07:50:25Z,2021-03-14,False,8,31tlrqq55e5zdgosw6swshtio5za,https://i.scdn.co/image/ab67616d0000b273afceff...,...,0.0,0.345,0.56,124.786,audio_features,7j4N1Jgf5ee7zLHtjcKpiB,https://api.spotify.com/v1/tracks/7j4N1Jgf5ee7...,https://api.spotify.com/v1/audio-analysis/7j4N...,222058,4
2,Hard to Say I'm Sorry,Chicago,The Collection,232320,2022-11-07T10:05:48Z,2012-10-29,False,65,laurezurich,https://i.scdn.co/image/ab67616d0000b27372e167...,...,4e-06,0.249,0.188,72.332,audio_features,58DXqHKK7TfxUmhCq2ux4E,https://api.spotify.com/v1/tracks/58DXqHKK7Tfx...,https://api.spotify.com/v1/audio-analysis/58DX...,232320,4
3,Gold on the Ceiling,The Black Keys,El Camino,224333,2022-11-07T10:07:03Z,2011-12-06,False,71,laurezurich,https://i.scdn.co/image/ab67616d0000b2736a21b9...,...,7.9e-05,0.069,0.565,130.121,audio_features,5lN1EH25gdiqT1SFALMAq1,https://api.spotify.com/v1/tracks/5lN1EH25gdiq...,https://api.spotify.com/v1/audio-analysis/5lN1...,224333,4
4,Vienna,Ultravox,This Is... 1981,278226,2022-11-07T10:08:04Z,2008-08-08,False,0,laurezurich,https://i.scdn.co/image/ab67616d0000b27337aac7...,...,0.234,0.109,0.456,80.747,audio_features,0cqVKNj1BubZHzp0OiI5IK,https://api.spotify.com/v1/tracks/0cqVKNj1BubZ...,https://api.spotify.com/v1/audio-analysis/0cqV...,278227,4


In [35]:
# Remove the columns that would only get in the way of the exploratory analysis.
final = final.drop(
    columns=['type', 'id', 'track_href', 'analysis_url', 'duration_ms', 'time_signature']
)

#### 9. Siguiendo con la limpieza, nos damos cuenta de que la mayoría de los ID de los usuarios son irreconocibles, así que los sustituimos por nombres legibles.

In [36]:
# First, list the distinct user ids present in the data.
final["user"].unique()

array(['31grj66zzxhioxdjn3nv22jvfpji', '31tlrqq55e5zdgosw6swshtio5za',
       'laurezurich', '31t72cua5e6riqibjxnqescvkr4i', '11132975782',
       '7kt5yda4woutn5mez6lbe13m0', '31p3oc6g3fcxi2hep3pg6e25aifm',
       'su75lvxn1izr4doamfnwvf61v', 'silviagorvic', 'superarrocito',
       '21rpvd5svxpeaeqtmn63kqy7y', '317x7ztj5ikbtxuymvixaa32s4o4',
       '21uowk5ic3isdrshld6xjizza', '1183037273', '1120757998', 'pfuente',
       'irenepowers', '214nikwxpg7h75l7y4ghuqlhq'], dtype=object)

In [37]:
# Mapping from Spotify user id to the person's display name, used to make the
# "user" column readable.
dicc_users = {
    "31grj66zzxhioxdjn3nv22jvfpji": "Vanna",
    "31tlrqq55e5zdgosw6swshtio5za": "Sonia",
    "laurezurich": "Laura",
    "31t72cua5e6riqibjxnqescvkr4i": "Noemi",
    "11132975782": "Natalia",
    "7kt5yda4woutn5mez6lbe13m0": "Gadea",
    "31p3oc6g3fcxi2hep3pg6e25aifm": "Ana C.",
    "su75lvxn1izr4doamfnwvf61v": "Sandra",
    "silviagorvic": "Silvia",
    "superarrocito": "Sila",
    "21rpvd5svxpeaeqtmn63kqy7y": "Anahi",
    "317x7ztj5ikbtxuymvixaa32s4o4": "Ximena",
    "21uowk5ic3isdrshld6xjizza": "Mai",
    "1183037273": "Ana GG",
    "1120757998": "Lola",
    "pfuente": "Paula",
    "irenepowers": "Irene",
    "214nikwxpg7h75l7y4ghuqlhq": "Naiara",
}

# Series.map turns every value missing from the dictionary into NaN, so fall
# back to the original value: ids of users not listed above are kept as-is,
# and re-running this cell no longer wipes the already translated names.
final["user"] = final["user"].map(dicc_users).fillna(final["user"])

#### 10. Con el DataFrame ya limpio, guardamos nuestros datos.

In [39]:
# Persist the cleaned DataFrame as CSV, at a path relative to the notebook's
# working directory.
# NOTE(review): with pandas' default settings the row index is written as an
# extra first column — confirm that downstream readers expect it.
final.to_csv("../files/spotify_limpio.csv")