# Pré-processar os dados do Spotify e extrair as características

Esse notebook tem o intuito de fazer o pré-processamento dos dados e também extrair as `features` para criação da nossa base de recomendações.

### Bibliotecas

- tqdm
- spotipy
- python-dotenv
- pandas
- multiprocessing

### Instalação e importação de dependências

In [1]:
pip install tqdm spotipy python-dotenv


Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import pandas as pd
import spotipy
import multiprocessing
import concurrent.futures
from tqdm import tqdm
from spotipy.oauth2 import SpotifyClientCredentials
from dotenv import load_dotenv

load_dotenv()


True

In [3]:
# Read the Spotify API credentials from environment variables
# (load_dotenv() was called in the import cell).
client_id = os.environ.get("SPOTIFY_CLIENT_ID")
client_secret = os.environ.get("SPOTIFY_CLIENT_SECRET")

# Build the credentials manager and the API client used by the cells below.
client_credentials_manager = SpotifyClientCredentials(
    client_id=client_id,
    client_secret=client_secret,
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)


### Importação dos dados

Aqui utilizaremos o arquivo `to_process.csv` gerado no notebook `01-get-data-from-spotify.ipynb` para fazer o pré-processamento e extração das características.

In [4]:
# Input CSV and the subset of its columns that this notebook needs.
file_path = "../data/to_process.csv"
fields = [
    "artist_name",
    "track_name",
    "track_url",
    "artist_url",
    "track_popularity",
]

# Load only the selected columns and preview the first rows.
df = pd.read_csv(file_path, usecols=fields)
df.head()


Unnamed: 0,artist_name,artist_url,track_name,track_url,track_popularity
0,AgroPlay,0uGWkZRPp1Urk77XBrPBsZ,Nosso Quadro,4uqJelb9THHmJ3OCohg4ZJ,82
1,Marília Mendonça,1yR65psqiazQpeM79CcGh8,Leão,2K9kZpwD2CzTa6iiSYYOoO,89
2,Simone Mendes,2eK9gcJQ6uqVvJL63dnOM3,Erro Gostoso - Ao Vivo,51pxwIxDVCbFHW8oyJFQXD,87
3,Israel & Rodolffo,41QLxRXlc2NwfJZkHGHKid,Seu Brilho Sumiu - Ao Vivo,3PH1nUysW7ybo3Yu8sqlPN,83
4,Zé Neto & Cristiano,487N2T9nIPEHrlTZLL3SQs,Oi Balde - Ao Vivo,3mzbab4H0iZYkHggSEj0UX,87


### Buscando e incluindo as `features`

Nesta etapa são montadas as funções que utilizaremos para extrair as características das músicas através da API do Spotify.

In [5]:
def merge_track_artists_ids(df):
    """Add a ``track_artist_id`` column combining the track and artist IDs.

    Each value has the form ``"<track_url>_<artist_url>"``, so one string
    carries both identifiers needed by ``process_tracks``.

    Args:
        df: DataFrame containing ``track_url`` and ``artist_url`` columns.

    Returns:
        The same DataFrame object (modified in place) with the new column.
    """
    # Column-wise concatenation instead of a row-wise ``apply(axis=1)``:
    # it is much faster and also works on an empty frame, where ``apply``
    # returns a DataFrame rather than a Series.
    df["track_artist_id"] = (
        df["track_url"].astype(str) + "_" + df["artist_url"].astype(str)
    )
    return df


def process_tracks(ids):
    """Fetch audio features and artist metadata for a batch of tracks.

    Args:
        ids: Iterable of ``"<track_id>_<artist_id>"`` strings, as produced by
            ``merge_track_artists_ids``.

    Returns:
        A list of dicts, one per track for which audio features were
        returned. Each dict is the audio-features payload extended with
        ``artist_popularity`` and ``artist_genres``. Tracks whose
        audio-features entry is ``None`` are skipped individually. If a
        request fails, a warning is emitted and an empty list is returned, so
        one failing batch does not abort the whole run.

    Raises:
        ValueError: If an element of ``ids`` does not split into exactly two
            parts on ``"_"``.
    """
    import warnings  # local import: only needed on the failure path

    tracks_ids = []
    artists_ids = []
    for pair in ids:
        track_id, artist_id = pair.split("_")
        tracks_ids.append(track_id)
        artists_ids.append(artist_id)

    # Nothing to look up: avoid a pointless (and failing) API request.
    if not tracks_ids:
        return []

    # Per-request ID limits of the two endpoints (see Spotify Web API docs).
    features_batch_size = 100
    artists_batch_size = 50

    try:
        # Fetch the audio features, batching so callers may pass any number of IDs.
        audio_features = []
        for start in range(0, len(tracks_ids), features_batch_size):
            batch = tracks_ids[start:start + features_batch_size]
            audio_features += sp.audio_features(batch)

        # Fetch the artist data in batches as well.
        artists = []
        for start in range(0, len(artists_ids), artists_batch_size):
            batch = artists_ids[start:start + artists_batch_size]
            artists += sp.artists(batch)["artists"]

        tracks = []
        for features, artist in zip(audio_features, artists):
            # A missing entry must only drop that track, not the whole batch.
            if features is None:
                continue
            artist = artist or {}
            tracks.append({
                **features,
                "artist_popularity": artist.get("popularity"),
                "artist_genres": artist.get("genres", []),
            })

        return tracks
    except Exception as exc:
        # Best-effort: keep the run going, but make the failure visible
        # instead of silently discarding the batch. ``Exception`` (rather than
        # a bare ``except``) lets KeyboardInterrupt/SystemExit propagate.
        warnings.warn(
            f"process_tracks: skipping batch of {len(tracks_ids)} track(s): {exc!r}"
        )
        return []


In [6]:
# Add the composite IDs, then try the lookup on the first row only.
df = merge_track_artists_ids(df)
process_tracks([df.loc[0, "track_artist_id"]])


[{'danceability': 0.691,
  'energy': 0.708,
  'key': 9,
  'loudness': -4.821,
  'mode': 1,
  'speechiness': 0.195,
  'acousticness': 0.338,
  'instrumentalness': 0,
  'liveness': 0.311,
  'valence': 0.602,
  'tempo': 160.018,
  'type': 'audio_features',
  'id': '4uqJelb9THHmJ3OCohg4ZJ',
  'uri': 'spotify:track:4uqJelb9THHmJ3OCohg4ZJ',
  'track_href': 'https://api.spotify.com/v1/tracks/4uqJelb9THHmJ3OCohg4ZJ',
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/4uqJelb9THHmJ3OCohg4ZJ',
  'duration_ms': 173698,
  'time_signature': 4,
  'artist_popularity': 73,
  'artist_genres': ['agronejo']}]

Para buscar as características, iremos utilizar o `ThreadPoolExecutor` para paralelizar as chamadas de função, visto que em alguns momentos podemos ter um fluxo alto de dados para processar.

Para definir a quantidade máxima de workers, contamos quantas CPUs a máquina possui usando `multiprocessing.cpu_count()` e limitamos esse valor a 12.

In [7]:
# Size the thread pool from the CPU count, capped at 12 workers.
cpu_count = multiprocessing.cpu_count()
max_workers = min(cpu_count, 12)

[cpu_count, max_workers]


[24, 12]

In [8]:
def chunks(data, chunk_size):
    """Yield consecutive slices of ``data`` with at most ``chunk_size`` items.

    Args:
        data: Any sliceable sequence that supports ``len()``.
        chunk_size: Step between slice starts and maximum slice length.

    Yields:
        ``data[start:start + chunk_size]`` for each start offset, in order;
        the last slice may be shorter.
    """
    starts = range(0, len(data), chunk_size)
    yield from (data[start:start + chunk_size] for start in starts)


In [9]:
# Each distinct track/artist pair only needs to be looked up once.
unique_ids = df["track_artist_id"].unique()

# Use a ThreadPoolExecutor to process the tracks in parallel.
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Submit one process_tracks task per chunk of 100 IDs.
    future_results = [
        executor.submit(process_tracks, id_chunk)
        for id_chunk in chunks(unique_ids, 100)
    ]

    # Collect results as tasks finish (completion order, not submission
    # order) while tqdm reports progress.
    features = []
    completed = concurrent.futures.as_completed(future_results)
    for finished in tqdm(completed, total=len(future_results)):
        features += finished.result()


100%|██████████| 1/1 [00:00<00:00,  2.07it/s]


In [10]:
# Build a DataFrame from the collected feature dicts and preview the first rows.
features_df = pd.DataFrame(features)
features_df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,artist_popularity,artist_genres
0,0.691,0.708,9,-4.821,1,0.195,0.338,0.0,0.311,0.602,160.018,audio_features,4uqJelb9THHmJ3OCohg4ZJ,spotify:track:4uqJelb9THHmJ3OCohg4ZJ,https://api.spotify.com/v1/tracks/4uqJelb9THHm...,https://api.spotify.com/v1/audio-analysis/4uqJ...,173698,4,73,[agronejo]
1,0.743,0.865,6,-3.897,1,0.0315,0.449,3e-06,0.297,0.794,129.999,audio_features,2K9kZpwD2CzTa6iiSYYOoO,spotify:track:2K9kZpwD2CzTa6iiSYYOoO,https://api.spotify.com/v1/tracks/2K9kZpwD2CzT...,https://api.spotify.com/v1/audio-analysis/2K9k...,166775,4,80,"[arrocha, sertanejo, sertanejo universitario]"
2,0.588,0.893,6,-2.743,1,0.0866,0.176,0.0,0.803,0.626,153.778,audio_features,51pxwIxDVCbFHW8oyJFQXD,spotify:track:51pxwIxDVCbFHW8oyJFQXD,https://api.spotify.com/v1/tracks/51pxwIxDVCbF...,https://api.spotify.com/v1/audio-analysis/51px...,178812,4,73,[sertanejo]
3,0.625,0.916,6,-3.704,1,0.0461,0.309,0.0,0.906,0.746,153.664,audio_features,3PH1nUysW7ybo3Yu8sqlPN,spotify:track:3PH1nUysW7ybo3Yu8sqlPN,https://api.spotify.com/v1/tracks/3PH1nUysW7yb...,https://api.spotify.com/v1/audio-analysis/3PH1...,168840,4,76,"[agronejo, arrocha, sertanejo, sertanejo unive..."
4,0.67,0.668,2,-6.411,1,0.0459,0.601,0.0,0.803,0.552,107.995,audio_features,3mzbab4H0iZYkHggSEj0UX,spotify:track:3mzbab4H0iZYkHggSEj0UX,https://api.spotify.com/v1/tracks/3mzbab4H0iZY...,https://api.spotify.com/v1/audio-analysis/3mzb...,159158,4,78,"[agronejo, arrocha, sertanejo, sertanejo unive..."


### Finalização e exportação

Agora mesclamos os dataframes, excluímos as duplicatas e salvamos em um `.csv`.

In [11]:
# The composite key was only needed for the API lookups. Drop it without
# mutating in place, and ignore a missing column, so this cell can be re-run
# without raising a KeyError.
df = df.drop(columns=["track_artist_id"], errors="ignore")

# Attach the fetched features to each track row (inner join on the track ID).
# NOTE(review): the join key is the track ID only; if the same track can appear
# with more than one artist, rows may be paired with another artist's
# popularity/genres. Confirm against the input data.
completed_df = pd.merge(df, features_df, left_on="track_url", right_on="id")
completed_df.head()

Unnamed: 0,artist_name,artist_url,track_name,track_url,track_popularity,danceability,energy,key,loudness,mode,...,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,artist_popularity,artist_genres
0,AgroPlay,0uGWkZRPp1Urk77XBrPBsZ,Nosso Quadro,4uqJelb9THHmJ3OCohg4ZJ,82,0.691,0.708,9,-4.821,1,...,160.018,audio_features,4uqJelb9THHmJ3OCohg4ZJ,spotify:track:4uqJelb9THHmJ3OCohg4ZJ,https://api.spotify.com/v1/tracks/4uqJelb9THHm...,https://api.spotify.com/v1/audio-analysis/4uqJ...,173698,4,73,[agronejo]
1,Marília Mendonça,1yR65psqiazQpeM79CcGh8,Leão,2K9kZpwD2CzTa6iiSYYOoO,89,0.743,0.865,6,-3.897,1,...,129.999,audio_features,2K9kZpwD2CzTa6iiSYYOoO,spotify:track:2K9kZpwD2CzTa6iiSYYOoO,https://api.spotify.com/v1/tracks/2K9kZpwD2CzT...,https://api.spotify.com/v1/audio-analysis/2K9k...,166775,4,80,"[arrocha, sertanejo, sertanejo universitario]"
2,Simone Mendes,2eK9gcJQ6uqVvJL63dnOM3,Erro Gostoso - Ao Vivo,51pxwIxDVCbFHW8oyJFQXD,87,0.588,0.893,6,-2.743,1,...,153.778,audio_features,51pxwIxDVCbFHW8oyJFQXD,spotify:track:51pxwIxDVCbFHW8oyJFQXD,https://api.spotify.com/v1/tracks/51pxwIxDVCbF...,https://api.spotify.com/v1/audio-analysis/51px...,178812,4,73,[sertanejo]
3,Israel & Rodolffo,41QLxRXlc2NwfJZkHGHKid,Seu Brilho Sumiu - Ao Vivo,3PH1nUysW7ybo3Yu8sqlPN,83,0.625,0.916,6,-3.704,1,...,153.664,audio_features,3PH1nUysW7ybo3Yu8sqlPN,spotify:track:3PH1nUysW7ybo3Yu8sqlPN,https://api.spotify.com/v1/tracks/3PH1nUysW7yb...,https://api.spotify.com/v1/audio-analysis/3PH1...,168840,4,76,"[agronejo, arrocha, sertanejo, sertanejo unive..."
4,Zé Neto & Cristiano,487N2T9nIPEHrlTZLL3SQs,Oi Balde - Ao Vivo,3mzbab4H0iZYkHggSEj0UX,87,0.67,0.668,2,-6.411,1,...,107.995,audio_features,3mzbab4H0iZYkHggSEj0UX,spotify:track:3mzbab4H0iZYkHggSEj0UX,https://api.spotify.com/v1/tracks/3mzbab4H0iZY...,https://api.spotify.com/v1/audio-analysis/3mzb...,159158,4,78,"[agronejo, arrocha, sertanejo, sertanejo unive..."


In [12]:
def drop_duplicates(df):
    """Return ``df`` with a single row per (artist_name, track_name) pair.

    The first occurrence of each pair is kept. The input frame is not
    modified. A one-line uniqueness check is printed.

    Args:
        df: DataFrame containing ``artist_name`` and ``track_name`` columns.

    Returns:
        A new DataFrame without duplicated songs (original index preserved).
    """
    key_columns = ["artist_name", "track_name"]
    # Compare the two columns as a pair rather than as one concatenated
    # string, so ("AB", "C") and ("A", "BC") are not treated as the same song.
    deduped = df.drop_duplicates(subset=key_columns)
    all_unique = not deduped.duplicated(subset=key_columns).any()
    print("Are all songs unique:", all_unique)
    return deduped


# Rebind completed_df to the value returned by drop_duplicates.
completed_df = drop_duplicates(completed_df)


Are all songs unique: True


In [None]:
# Export the merged frame (track data plus features), not the raw input `df`,
# which does not contain the extracted features.
if completed_df is None:
    raise ValueError(
        "completed_df is None: drop_duplicates() must return the de-duplicated DataFrame"
    )

# For a new processed_data.csv
completed_df.to_csv("processed_data.csv", index=False)
# To append to an already existing processed_data.csv
# completed_df.to_csv("processed_data.csv", mode="a", index=False, header=False)
