In [1]:
import pandas as pd
import time
import json

In [2]:
import spotipy

from spotipy.oauth2 import SpotifyClientCredentials

In [3]:
# Load the filtered MUSIC4ALL table; the first CSV column becomes the index.
data = pd.read_csv(
    filepath_or_buffer="data/MUSIC4ALL/processed/filtered_music4all_v3.csv",
    index_col=0,
)
# Preview the first rows as the cell output.
data.head()

Unnamed: 0,lyrics,artist,song,album_name,genres,spotify_id,popularity,release,danceability,energy,key,mode,valence,tempo,duration_ms
12,"De tanto me perder, de andar sem sono\nPor ess...",Adriana Calcanhotto,Um Dia Desses,Maré,mpb,30Iofj66ashW2QYbROuIC2,27.0,2008,0.724,0.339,0.0,1.0,0.48,119.902,151013
16,No meio dessa confusão\nCê me deixou na mão\nC...,Um44k,Nossa música,Nossa música,"pop,rap",4hpcc2wq84pilUgDbIPkDN,62.0,2018,0.592,0.387,2.0,1.0,0.7,125.638,239643
18,Radio bemba 00.0 Super Sónica\nAtento\nO bixo ...,Manu Chao,Bixo,Próxima Estación: Esperanza,"latin,reggae,world",3JJIdDXjN8oH6ZfPv3pApB,38.0,2001,0.658,0.452,5.0,0.0,0.971,91.321,112040
19,Gostei da sua ideia\nEntrei na sua selva\nVocê...,Selvagens À Procura de Lei,Enquanto Eu Passar Na Sua Rua,Selvagens à Procura de Lei,"indie rock,rock",0BMowyr7gejwveFKRkhj0l,38.0,2013,0.526,0.771,6.0,1.0,0.681,102.007,203987
47,"Já tive mulheres (salve Martinho), de todos os...",Motirô,Senhorita,Um Passo à Frente,hip hop,5se489TZrnQLUXZq8vARkY,34.0,2006,0.846,0.591,1.0,0.0,0.829,94.034,255987


# Duplicate removal by artist and title

In [4]:
# Show rows that repeat an earlier (artist, song) pair; the first occurrence is not flagged.
data.loc[data.duplicated(subset=["artist", "song"])]

Unnamed: 0,lyrics,artist,song,album_name,genres,spotify_id,popularity,release,danceability,energy,key,mode,valence,tempo,duration_ms


# Duplicate removal by spotify_id

In [5]:
# keep=False flags *every* row whose spotify_id occurs more than once,
# not just the second and later occurrences.
duplicated_id_mask = data.duplicated(subset=["spotify_id"], keep=False)
duplicated = data.loc[duplicated_id_mask]
# Number of rows that share a spotify_id with at least one other row.
duplicated.shape[0]

630

In [20]:
# Persist the rows with repeated spotify_id values, keeping the original index column.
duplicated.to_csv(
    path_or_buf="data/MUSIC4ALL/processed/duplicated_ids.csv",
    index=True,
)

In [6]:
# Keep only the rows whose spotify_id appears exactly once in `data`.
unique_data = data.loc[~duplicated_id_mask]
# Number of remaining rows.
unique_data.shape[0]

6398

In [18]:
# Persist the rows with a unique spotify_id, keeping the original index column.
unique_data.to_csv(
    path_or_buf="data/MUSIC4ALL/processed/filtered_music4all_v4_part1.csv",
    index=True,
)

# Crawling (Unique Data)

In [7]:
# Plain Python list of the spotify_id column, in row order, used to batch the API calls.
unique_spotify_ids = unique_data["spotify_id"].tolist()

In [8]:
# Number of IDs sent per `sp.tracks` request (batch size of the track crawl loop).
# NOTE(review): presumably the per-request cap of the Spotify endpoint — confirm against the API docs.
max_tracks_per_call = 50
# Number of IDs sent per `sp.audio_features` request (batch size of the audio-features crawl loop).
# NOTE(review): presumably the per-request cap of the Spotify endpoint — confirm against the API docs.
max_feats_per_call = 100
# Pause, in seconds, passed to `time.sleep` after every request in both crawl loops.
min_wait_time_per_call = 0.3

In [9]:
# Client-credentials auth manager built without explicit arguments.
# NOTE(review): presumably the client id/secret are read from the environment
# (SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET) — confirm before running on a fresh machine.
credentials = SpotifyClientCredentials()

In [10]:
# Spotify API client used by the crawl loops below, authenticated through `credentials`.
sp = spotipy.Spotify(auth_manager=credentials)

In [13]:
# Download track metadata in batches of `max_tracks_per_call` IDs and store each
# raw API response in its own JSON file under data/MUSIC4ALL/crawled/.
for start in range(0, len(unique_spotify_ids), max_tracks_per_call):
    # Nominal end of the batch. The slice below is truncated automatically for
    # the last (possibly shorter) batch; the nominal value is kept in the file
    # name and log messages so that file names stay stable.
    end = start + max_tracks_per_call
    track_results = sp.tracks(unique_spotify_ids[start:end])

    # Check for a missing/empty response first: `'tracks' in None` would raise
    # a TypeError and abort the whole crawl instead of reporting the batch.
    if track_results and 'tracks' in track_results:
        with open(f'data/MUSIC4ALL/crawled/track_results_{start}_{end-1}.json', 'w') as f:
            json.dump(track_results, f)
        # Report the batch range, mirroring the error branch, so the log shows
        # which batches were actually written.
        print(f"Sucess for indices {start} to {end-1}")
    else:
        print(f"Error for indices {start} to {end-1}")

    # Pause between requests to throttle the call rate.
    time.sleep(min_wait_time_per_call)

Sucess for indices 0 to 49
Sucess for indices 50 to 99
Sucess for indices 100 to 149
Sucess for indices 150 to 199
Sucess for indices 200 to 249
Sucess for indices 250 to 299
Sucess for indices 300 to 349
Sucess for indices 350 to 399
Sucess for indices 400 to 449
Sucess for indices 450 to 499
Sucess for indices 500 to 549
Sucess for indices 550 to 599
Sucess for indices 600 to 649
Sucess for indices 650 to 699
Sucess for indices 700 to 749
Sucess for indices 750 to 799
Sucess for indices 800 to 849
Sucess for indices 850 to 899
Sucess for indices 900 to 949
Sucess for indices 950 to 999
Sucess for indices 1000 to 1049
Sucess for indices 1050 to 1099
Sucess for indices 1100 to 1149
Sucess for indices 1150 to 1199
Sucess for indices 1200 to 1249
Sucess for indices 1250 to 1299
Sucess for indices 1300 to 1349
Sucess for indices 1350 to 1399
Sucess for indices 1400 to 1449
Sucess for indices 1450 to 1499
Sucess for indices 1500 to 1549
Sucess for indices 1550 to 1599
Sucess for indices 16

In [19]:
# Download audio features in batches of `max_feats_per_call` IDs and store each
# raw API response in its own JSON file under data/MUSIC4ALL/crawled/.
for start in range(0, len(unique_spotify_ids), max_feats_per_call):
    # Nominal end of the batch. The slice below is truncated automatically for
    # the last (possibly shorter) batch; the nominal value is kept in the file
    # name and log messages so that file names stay stable.
    end = start + max_feats_per_call
    audio_feat_results = sp.audio_features(unique_spotify_ids[start:end])

    # Truthiness check instead of `len(...) > 0`: it treats an empty result the
    # same way but does not raise a TypeError if the client returns None.
    if audio_feat_results:
        with open(f'data/MUSIC4ALL/crawled/audio_feat_results_{start}_{end-1}.json', 'w') as f:
            json.dump(audio_feat_results, f)
        # Report the batch range, mirroring the error branch, so the log shows
        # which batches were actually written.
        print(f"Sucess for indices {start} to {end-1}")
    else:
        print(f"Error for indices {start} to {end-1}")

    # Pause between requests to throttle the call rate.
    time.sleep(min_wait_time_per_call)

Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
Sucess
