# Feature Extraction - package essentia

In this notebook, audio features are extracted with the essentia package (using its `MusicExtractor` algorithm). More details about essentia's algorithms, including Melodia, can be found in this [tutorial](https://essentia.upf.edu/essentia_python_examples.html). 

## Packages

In [1]:
# general
import numpy as np
import pandas as pd
import os
import re
from tqdm import tqdm
import warnings

# essentia
import essentia
from essentia.standard import Vibrato
import essentia.standard as es

# plots
import matplotlib.pyplot as plt

[   INFO   ] MusicExtractorSVM: no classifier models were configured by default


## Functions

In [2]:
def extract_features_essentia(audio_file):
    """Return the scalar essentia MusicExtractor descriptors of one audio file.

    Runs ``es.MusicExtractor`` with the 'mean' and 'stdev' statistics for the
    low-level, rhythm and tonal descriptor groups, then keeps only the
    descriptors whose name matches none of 'metadata', 'tonal.key' or
    'tonal.chords' and whose value is a scalar according to ``np.isscalar``.

    Parameters
    ----------
    audio_file : str
        Path of the audio file handed to the extractor.

    Returns
    -------
    pandas.DataFrame
        The retained values laid out along the columns (a single row when at
        least one descriptor is kept). Each column is named after its
        descriptor with the suffix '_essentia'.
    """
    stats = ['mean', 'stdev']
    extractor = es.MusicExtractor(lowlevelStats=stats,
                                  rhythmStats=stats,
                                  tonalStats=stats)
    # The second return value is not needed here.
    pool, _ = extractor(audio_file)

    # Names are tested with re.search, so each '.' below matches any character.
    excluded_patterns = ('metadata', 'tonal.key', 'tonal.chords')

    kept_names = []
    for name in pool.descriptorNames():
        if any(re.search(pattern, name) for pattern in excluded_patterns):
            continue
        # delete non-scalar features
        if np.isscalar(pool[name]):
            kept_names.append(name)

    values = [pool[name] for name in kept_names]

    df = pd.DataFrame(values).T
    df.columns = [name + '_essentia' for name in kept_names]

    return df

In [3]:
def extract_features(file_paths):
    """Extract essentia features for every audio file and stack them in one table.

    Parameters
    ----------
    file_paths : iterable of str
        Paths of the audio files to process.

    Returns
    -------
    pandas.DataFrame
        The rows returned by ``extract_features_essentia`` for each file, in
        input order and re-indexed from 0, with two extra columns:
        'file_path' (the path as given) and 'song_id' (the first run of digits
        found in the file name, kept as a string). An empty DataFrame is
        returned when ``file_paths`` is empty.

    Raises
    ------
    IndexError
        If a path contains no digits at all.
    """
    frames = []

    for file_path in tqdm(file_paths):
        df_tmp = extract_features_essentia(file_path)
        df_tmp['file_path'] = file_path
        # Search the file name first so that digits in a directory name
        # (e.g. 'set2/007.mp3') are not mistaken for the song id; fall back to
        # the whole path when the file name itself holds no digits.
        digit_runs = (re.findall('[0-9]+', os.path.basename(file_path))
                      or re.findall('[0-9]+', file_path))
        df_tmp['song_id'] = digit_runs[0]
        frames.append(df_tmp)

    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()

    # Concatenate once: growing a frame inside the loop copies all previous
    # rows on every iteration and concatenates onto an empty DataFrame.
    return pd.concat(frames, ignore_index=True)

## List of files

In [None]:
# Folder holding the audio files, given relative to the current working directory.
audio = 'mirex/dataset/Audio'
# Names of the entries in that folder; os.listdir guarantees no particular order
# (the resulting paths are sorted numerically in a later cell).
music_directory = os.listdir(audio)

In [5]:
# Prefix every directory entry with the audio folder to obtain its full path
file_paths = [os.path.join(audio, file_name) for file_name in music_directory]
    
    
def extract_numeric_part(file_path):
    """Return the first all-digit, dot-separated piece of the file name as an int.

    Only the base name of ``file_path`` is inspected; it is split on '.' and
    every piece made up solely of digits is converted with ``int``.

    Raises
    ------
    IndexError
        If no piece of the file name consists only of digits.
    """
    name_pieces = os.path.basename(file_path).split('.')
    numbers = [int(piece) for piece in name_pieces if piece.isdigit()]
    return numbers[0]

# Order the paths by the number in each file name, then drop every entry
# whose path contains 'Zone.Identifier'
file_paths = [
    path
    for path in sorted(file_paths, key=extract_numeric_part)
    if 'Zone.Identifier' not in path
]

## Extract features and save

In [6]:
# Silence the noisy pandas FutureWarning raised when concatenating onto an empty DataFrame.
# Note: this filter ignores every FutureWarning from this point on, not only that one.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [7]:
# Run the essentia extraction over every listed audio file. This is slow: the
# recorded progress bar shows 903 files at roughly 1.5-2 s per file.
df = extract_features(file_paths)

  0%|          | 0/903 [00:00<?, ?it/s]

[   INFO   ] MusicExtractor: Read metadata
[   INFO   ] MusicExtractor: Compute md5 audio hash, codec, length, and EBU 128 loudness
[   INFO   ] MusicExtractor: Replay gain
[   INFO   ] MusicExtractor: Compute audio features
[   INFO   ] MusicExtractor: Compute aggregation
[   INFO   ] All done
  0%|          | 1/903 [00:01<23:21,  1.55s/it][   INFO   ] MusicExtractor: Read metadata
  0%|          | 2/903 [00:04<32:15,  2.15s/it][   INFO   ] MusicExtractor: Compute md5 audio hash, codec, length, and EBU 128 loudness
[   INFO   ] MusicExtractor: Replay gain
[   INFO   ] MusicExtractor: Compute audio features
[   INFO   ] MusicExtractor: Compute aggregation
[   INFO   ] All done
[   INFO   ] MusicExtractor: Read metadata
[   INFO   ] MusicExtractor: Compute md5 audio hash, codec, length, and EBU 128 loudness
[   INFO   ] MusicExtractor: Replay gain
[   INFO   ] MusicExtractor: Compute audio features
[   INFO   ] MusicExtractor: Compute aggregation
[   INFO   ] All done
  0%|          | 3

In [10]:
# Save the feature table. With to_csv's default index=True, the row index is
# also written as an unnamed first column.
df.to_csv('data/audio/preprocessed_EF.csv')