# Load Relevant Packages / Modules

In [22]:
%matplotlib inline
from pathlib import Path
import numpy as np, scipy, matplotlib.pyplot as plt, IPython.display as ipd
import librosa, librosa.display

import warnings

In [2]:
cd ../libs

/home/jeb/Documents/columbia_data_science/big_data/project/music_emotion_recognition/libs


In [3]:
from rp_extract.rp_extract import rp_extract
from rp_extract.audiofile_read import *
from rp_extract.rp_plot import *

In [4]:
cd ../notebooks

/home/jeb/Documents/columbia_data_science/big_data/project/music_emotion_recognition/notebooks


In [142]:
def temporal_centroid(envelope):
    """computes the temporal centroid of an onset envelope"""
    D = np.abs(librosa.stft(envelope))
    times = librosa.times_like(D)

    onset_strength = librosa.onset.onset_strength(y=envelope, sr=sr)
    
    try:
        temporal_centroid = sum(onset_strength * times) / sum(onset_strength)
    except RuntimeWarning:
        temporal_centroid = np.nan
    
    return temporal_centroid


def log_attack_time(envelope, sr, thresh_percent):
    D = np.abs(librosa.stft(envelope))
    times = librosa.times_like(D)
    onset_strength = librosa.onset.onset_strength(y=envelope, sr=sr)
    
    
    stop_attack_index = np.argmax(onset_strength)
    stop_attack_value = envelope[stop_attack_index]
    thresh = stop_attack_value * thresh_percent / 100
    
    try:
        start_attack_index = [x > thresh for x in onset_strength].index(True)
    except ValueError:
        return np.nan
    
    if start_attack_index == stop_attack_index:
        start_attack_index -= 1

    log_attack_time =  np.log10(times[stop_attack_index] - times[start_attack_index])
    
    return log_attack_time


def extract_features(signal, sr):
    """Given and a signal and its sampling rate, compute all the features"""
    
    # Temporal Features
    onset_samples = np.unique(librosa.onset.onset_detect(signal, sr=sr, backtrack=True, units='samples'))
    all_envelopes = np.split(signal, onset_samples)
    
    zero_crossings = np.array([sum(librosa.zero_crossings(x, pad=False)) for x in all_envelopes])
    zero_features = np.array([np.mean(zero_crossings), np.std(zero_crossings)])
    
    temporal_centroids = np.array([temporal_centroid(x) for x in all_envelopes])
    temporal_centroids = temporal_centroids[~np.isnan(temporal_centroids)]
    temporal_cen_features = np.array([np.mean(temporal_centroids), np.std(temporal_centroids)])

    log_attacks = np.array([log_attack_time(x, sr, 50) for x in all_envelopes])
    log_attacks = log_attacks[~np.isnan(log_attacks)]
    log_attack_features = np.array([np.mean(log_attacks), np.std(log_attacks)])
     
    # Rhythmic Feautres (without dimension reduction for now)
    rhythm = rp_extract(signal, sr, extract_rh=True, transform_db=True, transform_phon=True, transform_sone=True,          
        fluctuation_strength_weighting=True, 
        skip_leadin_fadeout=1,             
        step_width=1)
    rhythm_hist = rhythm['rh']
    rhythm_mean = np.array([np.mean(rhythm_hist)])
    
    all_features = np.concatenate([zero_features, temporal_cen_features, log_attack_features, rhythm_hist, rhythm_mean])

    return all_features

# Construct "Dictionary of Audio Features of the Training Pieces"

To start we'll train on all but one song and use that song as our test (one hold out cv).

Our "dictionary" will be PXQ where P = Number of features (67 in this example) and Q = Number of training samples (743 in this example)

In [143]:
# Read all of our music data and create features
warnings.filterwarnings('ignore')

audio_path = Path.cwd().parent / 'data' / 'raw' / 'clips_45seconds'
all_mp3_paths = list(audio_path.glob('**/*.mp3'))

audio_train = [] 
for path in all_mp3_paths:
    if len(audio_train) < 25: #we'll start off with only 25 training pieces to save time
        signal, sr = librosa.load(str(path))
        try:
            song_features = extract_features(signal, sr)
            audio_train.append(song_features)
        except ValueError as e:
            print(path)
            continue
    else:
        break

audio_train = np.array(audio_train).T

# Emotion Mapping Factors Learning

We'll use the KNN algorithm with k=4. **I have to go back and scale the data**

In [144]:
from sklearn.neighbors import NearestNeighbors

In [145]:
# Take our training and testing, and reshape for the KNN algorithm

y = audio_train[0, :]
X = audio_train[1:, :]

In [146]:
def map_factors(train, test):
    """ Returns mapping factors vector based on 4-NN algorithm
    train - a 2d np array of training data
    test - a single test vector of data to
    returns: a np array of the mapping factors of test
    """
    nbrs = NearestNeighbors(n_neighbors=4, algorithm='ball_tree').fit(train)
    _, indices = nbrs.kneighbors(test.reshape(-1, 1).T)
    
    map_fac = np.zeros(train.shape[0])
    map_fac[indices] = 1
    
    return map_facnormalize

In [147]:
map_factors_test = map_factors(X, y)

print("Passes sum test: {}".format(sum(map_factors_test) == 4.0))
print("Passes shape test: {}".format(map_factors_test.shape == (24,)))

Passes sum test: True
Passes shape test: False


# Emotion Space Mapping