In [1]:
import pandas as pd
import numpy as np
import json
import os
import glob
from essentia.standard import MusicExtractor
from essentia.standard import YamlOutput

In [9]:
import essentia
print(essentia.__version__)

2.1-beta6-dev


# Extract essentia audio features

In [2]:
extractor = MusicExtractor(
    # extract mean and standard deviation (if applicable) for each feature
    lowlevelStats=['mean','stdev'],
    rhythmStats=['mean','stdev'],
    tonalStats=['mean','stdev']
)

def extract_audio_features(input_dir, output_dir):
    """
    Extract audio features from a directory of pre-downloaded tracks (.mp3)
    The extracted feature for each song is saved as a json file in a predefined directory
    """
    os.makedirs(output_dir, exist_ok=True)

    files = glob.glob(input_dir + "*.mp3")
    print(files)

    for file in files:
        features, feature_names = extractor(file)
        base_name = os.path.splitext(os.path.basename(file))[0]
        results_file = os.path.join(output_dir, f"{base_name}.json")
        YamlOutput(filename=results_file, format="json")(features)
        print(f"Saved: {results_file}")

In [4]:
'''
NOTE: Running this block may take hours

DOWNLOADED_SONGS = os.path.join("..", "data", "downloaded_tracks")
OUTPUT_DIR = os.path.join("..", "data", "extracted_audio_features")
extract_audio_features(input_dir=DOWNLOADED_SONGS, ouput_dir=OUTPUT_DIR)
'''

'\nNOTE: Running this block may take hours\n\nDOWNLOADED_SONGS = os.path.join("..", "data", "downloaded_tracks")\nOUTPUT_DIR = os.path.join("..", "data", "extracted_audio_features")\nextract_audio_features(input_dir=DOWNLOADED_SONGS, ouput_dir=OUTPUT_DIR)\n'

# Compile extracted features into a DataFrame

In [3]:
def filter_metadata(metadata_dict):
    '''
    Extracted intested metadata items
    '''
    interested_keys = {"file_name", "artist", "audiosourcewebpage", "title"}
    base_dict = metadata_dict['tags']
    filtered_dict = {key: base_dict[key] for key in base_dict if key in interested_keys}
    return filtered_dict


def parse_nested_dict(d, root_k, flat_dict):
    '''
    Helper function for parse_dict'''
    for k, v in d.items():
        if isinstance(v, (float, list, str)): 
            flat_dict[f"{root_k}.{k}"] = v
        elif isinstance(v, dict):
            parse_nested_dict(v, f"{root_k}.{k}", flat_dict)
    return flat_dict

def parse_dict(d):
    '''
    Flatten all items within nested dictionaries
    '''
    flat_dict = {}

    if isinstance(d.get("metadata"), dict):
        flat_dict.update(filter_metadata(d["metadata"]))

    for section in ("lowlevel", "tonal", "rhythm"):
        if isinstance(d.get(section), dict):
            parse_nested_dict(d[section], section, flat_dict)

    for k, v in d.items():
        if k in {"metadata", "lowlevel", "tonal", "rhythm"}:
            continue
        if isinstance(v, (float, list, str)):
            flat_dict[k] = v
        elif isinstance(v, (dict)):
            parse_nested_dict(v, k, flat_dict)

    return flat_dict

def compile_json_to_df(json_files):
    rows = []

    for file in json_files:
        with open(file) as f:
            d = json.load(f)
        flat_dict = parse_dict(d)
        rows.append(flat_dict)
    
    df = pd.DataFrame(rows)
    return df

def remove_list(df, col):
    '''
    Strip lists around feature values
    '''
    df[col] = df[col].apply(
    lambda x: ", ".join(map(str, x)) if isinstance(x, list) else x)
    return df


In [10]:
ext_features_files = glob.glob("../data/extracted_audio_features/*.json")
df_audio_features = compile_json_to_df(ext_features_files)

df_audio_features = remove_list(df_audio_features, "artist")
df_audio_features = remove_list(df_audio_features, "audiosourcewebpage")
df_audio_features = remove_list(df_audio_features, "title")

In [11]:
df_audio_features.head()

Unnamed: 0,file_name,artist,audiosourcewebpage,title,lowlevel.average_loudness,lowlevel.barkbands_crest.mean,lowlevel.barkbands_crest.stdev,lowlevel.barkbands_flatness_db.mean,lowlevel.barkbands_flatness_db.stdev,lowlevel.barkbands_kurtosis.mean,...,rhythm.bpm_histogram_second_peak_weight,rhythm.danceability,rhythm.onset_rate,rhythm.beats_loudness_band_ratio.mean,rhythm.beats_loudness_band_ratio.stdev,rhythm.beats_position,rhythm.bpm_histogram,rhythm.bpm_histogram_second_peak_spread,lowlevel.silence_rate_20dB.mean,lowlevel.silence_rate_20dB.stdev
0,Deli Girls - I'd Rather Die.mp3,Deli Girls,https://open.spotify.com/track/4Lbktkg0Ib1bFDu...,I'd Rather Die,0.929919,10.570652,4.896773,0.179815,0.093503,8.28299,...,0.036424,1.197739,2.885155,"[0.525498270988, 0.15839292109, 0.164387211204...","[0.345377117395, 0.169016778469, 0.18099318444...","[0.684988677502, 1.38158726692, 2.07818579674,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,,
1,Linkin Park - One More Light.mp3,Linkin Park,https://open.spotify.com/track/3xXBsjrbG1xQIm1...,One More Light,0.635731,12.869395,4.547413,0.274933,0.09196,8.548099,...,0.17762,1.007964,2.740287,"[0.467880964279, 0.250412672758, 0.29002258181...","[0.243939742446, 0.222770571709, 0.21235983073...","[0.731428563595, 1.46285712719, 2.18267560005,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,,
2,Taylor Swift - Fearless.mp3,Taylor Swift,https://open.spotify.com/track/2okho7vU7Nsq1UZ...,Fearless,0.923186,9.609028,4.686822,0.122646,0.069233,6.100173,...,0.024752,1.049551,3.574061,"[0.386991381645, 0.199895650148, 0.18542943894...","[0.240712150931, 0.151186391711, 0.14939081668...","[0.568888902664, 1.16099774837, 1.764716506, 2...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.411765,,
3,Mega Mango - You Spent All Your Love - Anniver...,Mega Mango,https://open.spotify.com/track/58i5wpN3tZUjHv5...,You Spent All Your Love - Anniversary Edition,0.83784,12.142623,5.341847,0.170481,0.087576,15.025992,...,0.061041,0.946623,2.438527,"[0.533605277538, 0.191193535924, 0.15869387984...","[0.266846597195, 0.196636393666, 0.16053113341...","[0.66176867485, 1.31192743778, 1.97369611263, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.227273,,
4,Siouxsie and the Banshees - Spellbound.mp3,Siouxsie and the Banshees,https://open.spotify.com/track/5Ng6UbryNd3eds2...,Spellbound,0.918951,10.099307,4.077002,0.140351,0.052854,3.780532,...,0.02686,1.284117,4.06531,"[0.559505045414, 0.148561567068, 0.12635271251...","[0.191182538867, 0.111118733883, 0.12651880085...","[0.394739210606, 0.801088392735, 1.20743763447...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.48,,


In [21]:
len(df_audio_features.iloc[:, 4:].columns)

141

In [8]:
df_audio_features.to_csv("../outputs/df_audio_features.csv")