In [1]:
import os
import json
import pandas as pd
import numpy as np
from IPython.display import display, HTML

# path of the dataset
# please change it accordingly
# Results will be saved in the folder
path = '/Users/jjsauma/gdrive/smc/amp/tareas/acousticbrainz/ab-duplicates1000-2016-03-02/' 
preProcessedFile = 'duplicateanalysis.csv'
parsedFile = os.path.join(path,preProcessedFile)
resultsFile = 'results.csv'

In [2]:
# found this function at: https://www.haykranen.nl/2016/02/13/handling-complex-nested-dicts-in-python/
class DictQuery(dict):
    def get(self, path, default = None):
        keys = path.split("/")
        val = None

        for key in keys:
            if val:
                if isinstance(val, list):
                    val = [ v.get(key, default) if v else None for v in val]
                else:
                    val = val.get(key, default)
            else:
                val = dict.get(self, key, default)

            if not val:
                break;

        return val


In [10]:
# Delete parsed and results files if exist
try:
    os.remove(parsedFile)
except OSError:
    pass

try:
    os.remove(resultsFile)
except OSError:
    pass

# Load data into Pandas Dataframes
# There is no need to run this if you have the preProcessedFile. If you do, you can start from the next cell

dictValues = {}
dictLists = {}
dictMislabeled = {}
i = 0
dfv = pd.DataFrame(dictValues, columns = ['id', 'folder', 'filename' ,'title', 'lenght', 'bpm', 'loudness', 'onset_rate', 'key_key', 'key_scale', 'replay_gain', 'tuning_frequency'] )
dfl = pd.DataFrame(dictValues, columns = ['id', 'filename' ,'title', 'beats_position', 'chords_histogram', 'hpcp_mean'])
#dfl = pd.DataFrame(dictLists, columns = ['id', 'filename' ,'title', 'beats_position', 'chords_histogram', 'hpcp_mean'])

parsedFile = os.path.join(path,preProcessedFile)

print 'Reading folders in: ' + path + '...'


subFolders = os.listdir(path) #for each subfolder
for subFolder in subFolders:
    #print(subFolder)
    if subFolder in ['.DS_Store', preProcessedFile, resultsFile]: 
        pass
    else:
        files = os.listdir(os.path.join(path,subFolder))
        
        # Add all duplicates to the same dataframe
        for fileName in files:
            if subFolder in ['.DS_Store', preProcessedFile, resultsFile]: 
                pass
            else:
                #print fileName
                jsonFile = open (os.path.join(path, subFolder, fileName),"r")
                jsonToPython = json.loads(jsonFile.read())
            
                dictValues['id'] = DictQuery(jsonToPython).get("metadata/tags/musicbrainz_recordingid")
                dictValues['folder'] = subFolder
                dictValues['filename'] = fileName
                dictValues['title'] = DictQuery(jsonToPython).get("metadata/tags/title")
                dictValues['lenght'] = DictQuery(jsonToPython).get("metadata/audio_properties/length")
                dictValues['bpm'] = DictQuery(jsonToPython).get("rhythm/bpm")
                dictValues['loudness'] = DictQuery(jsonToPython).get("lowlevel/average_loudness")
                dictValues['onset_rate'] = DictQuery(jsonToPython).get("rhythm/onset_rate")
                dictValues['key_key'] = DictQuery(jsonToPython).get("tonal/key_key")
                dictValues['key_scale'] = DictQuery(jsonToPython).get("tonal/key_scale")
                dictValues['replay_gain'] = DictQuery(jsonToPython).get("metadata/audio_properties/replay_gain")
                dictValues['tuning_frequency'] = DictQuery(jsonToPython).get("tonal/tuning_frequency")
                dfv.loc[i]= (dictValues) # Dataframe for scalar values
                
                #dictLists['id'] = DictQuery(jsonToPython).get("metadata/tags/musicbrainz_recordingid")
                #dictLists['filename'] = fileName
                #dictLists['title'] = DictQuery(jsonToPython).get("metadata/tags/title")
                #dictLists['beats_position'] = DictQuery(jsonToPython).get("rhythm/beats_position") # list
                #dictLists['chords_histogram'] = DictQuery(jsonToPython).get("tonal/chords_histogram") # list
                #dictLists['hpcp_mean'] = DictQuery(jsonToPython).get("tonal/hpcp/mean")  # list
                #dfl.loc[i]= (dictLists) # Dataframe for vector values
                i = i + 1

# Write parsed scalars to parsed file
dfv.to_csv(parsedFile)
print 'Parsed file written to: ' + parsedFile

Reading folders in: /Users/jjsauma/gdrive/smc/amp/tareas/acousticbrainz/ab-duplicates1000-2016-03-02/...
Parsed file written to: /Users/jjsauma/gdrive/smc/amp/tareas/acousticbrainz/ab-duplicates1000-2016-03-02/duplicateanalysis.csv


In [38]:
# Reading from pre parsed file #### Start from here if parsedFile supplied or already generated
dfv = pd.read_csv(parsedFile)

In [39]:
# For each song folder do the actual analysis
for gFolder in dfv.groupby(['id']):
    
    # Label files where lenght is between 30 seconds range from mean 
    dfv['Length_Ok'] = np.where(abs(dfv.lenght - dfv.lenght.median()) < 30, True, False)
    dfv['Lengh_Mean_Dist'] = abs(dfv.lenght - dfv.lenght.median()) # Computing distance from lenght Mean

    # Label files where bpm is in a 5 beat range from median bpm
    dfv['BPM_Ok'] = np.where(abs(dfv.bpm - dfv.bpm.median()) < 5, True, False)
    dfv['BPM_Mean_Dist'] = abs(dfv.bpm - dfv.bpm.median()) #Computing distance from bpm mean
    
    # Label files where loudness is in the range of first stanard deviation
    dfv['Loudness_Ok'] = np.where((dfv.loudness < abs(dfv.loudness.std() - dfv.loudness)), True, False)
    dfv['Loudness_1st_STD_Dist'] = abs(dfv.loudness.std() - dfv.loudness)
                               
    # Label files where key_key is not the mode
    dfv['Key_Ok'] = np.where(dfv.key_key == dfv.key_key.mode()[0], True, False)

In [40]:
# Export mislabeled entries to a csv file

try:
    os.remove(os.path.join(path,resultsFile))
except OSError:
    pass

mislabeled = dfv[(dfv['Length_Ok'] == False) & (dfv['BPM_Ok'] == False) & (dfv['Key_Ok'] == False)] # & (dfv['Loudness_Ok'] == False)] 
mislabeled.to_csv(os.path.join(path,resultsFile))

In [41]:

dfv.groupby(['id','Length_Ok', 'BPM_Ok', 'Key_Ok', 'Loudness_Ok']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 0,folder,filename,title,lenght,bpm,loudness,onset_rate,key_key,key_scale,replay_gain,tuning_frequency,Lengh_Mean_Dist,BPM_Mean_Dist,Loudness_1st_STD_Dist
id,Length_Ok,BPM_Ok,Key_Ok,Loudness_Ok,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
[u'00062658-acfc-4bdf-806f-aa6ec85e8ddd'],True,False,False,False,26,26,26,26,26,26,26,26,26,26,26,26,26,26,26
[u'00062658-acfc-4bdf-806f-aa6ec85e8ddd'],True,True,True,False,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
[u'003304eb-5bb9-498e-8185-c6b220e75692'],False,False,False,False,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
[u'003304eb-5bb9-498e-8185-c6b220e75692'],True,False,False,False,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31
[u'0035d310-3bfc-46c7-9cb7-93e77dd5e171'],True,False,False,False,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40
[u'0035d310-3bfc-46c7-9cb7-93e77dd5e171'],True,False,True,False,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
[u'0069c0f7-9274-4b4b-a10e-110b90a4bcc0'],True,True,False,False,36,36,36,36,36,36,36,36,36,36,36,36,36,36,36
[u'00c47ea6-3a10-4a32-b1f1-990ac756c6a0'],False,True,False,False,82,82,82,82,82,82,82,82,82,82,82,82,82,82,82
[u'00c47ea6-3a10-4a32-b1f1-990ac756c6a0'],False,True,True,False,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12
[u'00e37446-2e4c-409a-a8a1-ed94f1b01a57'],False,False,False,False,43,43,43,43,43,43,43,43,43,43,43,43,43,43,43


In [None]:
# print the std bell curve