Each song stored in Spotify has some information built by Spotify to classify the music. This script downloads all the relevant information associated with the sound of the songs.

In [1]:
from secrets import * # Import the ClientID and clientSecret
import os
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials 
from datetime import date

In [2]:
# Derive the data folder from the current working directory by swapping
# "Scripts" for "Data". Note that str.replace rewrites every occurrence of
# "Scripts" in the path, not only the last path component.
path_scritps = os.getcwd()
path_data = path_scritps.replace("Scripts", "Data")

In [3]:
# Show the contents of the data folder (displayed as the cell output)
os.listdir(path_data)

['consolidate_raw_2021-09-19.csv',
 'consolidate_raw_2021_37.csv',
 'consolidate_raw_audio_features_2021_37.csv',
 'consolidate_raw_audio_features_2021_37.tex',
 'novedades_viernes_2021-09-19.csv',
 'novedades_viernes_2021_33.csv',
 'novedades_viernes_2021_34.csv',
 'novedades_viernes_2021_36.csv',
 'novedades_viernes_2021_37.csv',
 'popeton_2021-09-19.csv',
 'popeton_2021_33.csv',
 'popeton_2021_34.csv',
 'popeton_2021_36.csv',
 'popeton_2021_37.csv',
 'songs',
 'top_colombia_2021-09-19.csv',
 'top_colombia_2021_33.csv',
 'top_colombia_2021_34.csv',
 'top_colombia_2021_36.csv',
 'top_colombia_2021_37.csv',
 'viral_2021-09-19.csv',
 'viral_2021_33.csv',
 'viral_2021_34.csv',
 'viral_2021_36.csv',
 'viral_2021_37.csv']

In [4]:
# Load the consolidated raw file (semicolon-separated; the first column is the index)
data = pd.read_csv(
    f"{path_data}/consolidate_raw_2021-09-19.csv",
    sep=";",
    index_col=0,
)

In [6]:
# Build the credentials manager from the client id / secret brought in by the
# `secrets` star-import, then create the Spotify object used to access the API.
client_credentials_manager = SpotifyClientCredentials(
    client_id=clientId,
    client_secret=clientSecret,
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [7]:
# Extracts the song analysis performed by Spotify.
# The ids are sent in batches rather than one request per song, and the
# returned rows are collected in a list and converted to a DataFrame once:
# DataFrame.append copied the whole frame on every iteration and was removed
# in pandas 2.0.
# NOTE: this still re-downloads features for songs that were already fetched in
# earlier runs; only querying ids that are not yet known would avoid that.
songs = data["song_id"].drop_duplicates().values
BATCH_SIZE = 100  # maximum number of track ids per audio-features request
feature_rows = []
for start in range(0, len(songs), BATCH_SIZE):
    batch = songs[start:start + BATCH_SIZE].tolist()
    # Tracks without an analysis come back as None; skip them so they do not
    # end up as junk rows (they simply get NaN features in the later left join).
    feature_rows.extend(
        row for row in sp.audio_features(batch) if row is not None
    )
features = pd.DataFrame(feature_rows)

In [8]:
# Discard the columns that are not needed for the analysis
features = features.drop(["type", "duration_ms"], axis=1)

In [9]:
# Give the id column the same name as the join key used in `data`
features = features.rename({"id": "song_id"}, axis="columns")

In [10]:
# Left-join the audio features onto the base table, keeping every row of `data`
data = pd.merge(data, features, how="left", on="song_id")

In [11]:
# Date stamp (YYYY-MM-DD) used in the name of the saved results file
today = date.today().isoformat()

In [12]:
# Write the enriched table to the data folder, stamped with today's date
data.to_csv(f"{path_data}/consolidate_raw_audio_features_{today}.csv", sep=";")

In [14]:
# Reload the enriched table from disk. The file name is built from the same
# `today` stamp that was used when saving, so the notebook reads back what it
# just wrote instead of a snapshot pinned to one hard-coded date.
data = pd.read_csv(
    path_data + "/consolidate_raw_audio_features_" + today + ".csv",
    sep=";",
    index_col=0,
)

In [16]:
# Descriptive statistics of the new variables, computed over one row per song
# and transposed so that each variable becomes a row of the summary.
unique_songs = data.drop_duplicates(subset="song_id").reset_index(drop=True)
stat = unique_songs.describe().transpose()

In [17]:
# Presentation labels for the eight summary columns, in their existing order
summary_labels = ["N", "Mean", "SD", "Min", "P(25)", "P50", "P(75)", "Max"]
stat.columns = summary_labels

def f1(x):
    """Format a number as fixed-point text with no decimal places."""
    return format(x, "1.0f")

def f2(x):
    """Format a number as fixed-point text with two decimal places."""
    return format(x, "1.2f")

In [18]:
# Export the summary as a LaTeX longtable: the first column goes through f1
# (no decimals) and the remaining seven through f2 (two decimals).
column_formatters = [f1] + [f2] * 7
stat.to_latex(
    f"{path_data}/consolidate_raw_audio_features_2021_37.tex",
    longtable=True,
    formatters=column_formatters,
)