In [1]:
%load_ext autoreload

In [16]:
%autoreload 2
import librosa
import os, json, glob
import matplotlib.pyplot as plt
import librosa, mir_eval
import numpy as np
import scipy
from f0_analysis_utils import *

## Prepare data for training PT classifier
According to the annotations, split the mono audio segments into note events. 

Without candidate selection, directly classify the note events into:
* normal
* bend/release/prebend... (this will be further analyzed using signal processing)
* vibrato

directly classify the transitions (100 ms around the offset of note events) into:
* normal
* hammer/pull... (this will be further analyzed using signal processing)
* slide

Two classifiers are trained, each distinguishing between three classes.

### The code for generating the data matrix

In [None]:
def extract_features(note_audio):
    """Placeholder feature extractor for a note event.

    The input is ignored; a vector of 10 uniform random values is returned so
    that the data-matrix code can be exercised. This definition is replaced by
    the full `extract_features` defined later in the notebook.

    Args:
        note_audio (array): The note event audio signal (unused).

    Returns:
        array: A random feature vector, of the shape (10,)
    """
    placeholder_shape = (10,)
    return np.random.random(placeholder_shape)
def extract_tran_features(tran_audio):
    """Placeholder feature extractor for a transition between note events.

    The input is ignored; a vector of 10 uniform random values is returned so
    that the data-matrix code can be exercised.

    Args:
        tran_audio (array): The transition audio signal (unused).

    Returns:
        array: A random feature vector, of the shape (10,)
    """
    placeholder_shape = (10,)
    return np.random.random(placeholder_shape)

In [24]:
# Each row is a feature vector with its class label appended as the last element.
notes_data = [] # rows for note events (0: normal, 1: bend/release, 2: vibrato)
trans_data = [] # rows for transitions (0: normal, 1: hammer/pull, 2: slide)

file = "Guns N' Roses - Welcome To The Jungle (ver 3)_Solo Guitar_0_0s.wav"
# FILTERED_AUDIO_DIR and find_anno are not defined in this notebook; presumably they
# come from the `f0_analysis_utils` star import -- TODO confirm
y, sr = librosa.load(os.path.join(FILTERED_AUDIO_DIR, file), sr=None)

anno_file = find_anno(file)
with open(anno_file) as anno:
    annotation = json.load(anno)

# half-width of the transition window (50 ms on each side of a note offset), in samples;
# computed once because it does not change inside the loop
tran_half_sp = librosa.time_to_samples(0.05, sr=sr)

for i, note_event in enumerate(annotation):
    onset = note_event["time"]["start"]
    offset = onset + note_event["time"]["dur"]
    onset_sp = librosa.time_to_samples(onset, sr=sr)
    offset_sp = librosa.time_to_samples(offset, sr=sr)
    note_audio = y[onset_sp : offset_sp]
    feature = extract_features(note_audio)
    # bend takes precedence over vibrato when both are annotated
    if note_event["effects"]["bend"]:
        label = 1
    elif note_event["effects"]["vibrato"]:
        label = 2
    else:
        label = 0
    feature_w_label = np.append(feature, label)
    notes_data.append(feature_w_label)

    # if the note event is the last one in the file, ignore its transitions 
    # because the transition won't be captured in the audio
    if i == len(annotation) - 1:
        break
    # clamp the window start at 0: a negative start index would wrap around to the
    # end of `y` and silently produce an empty (or wrong) slice
    tran_onset_sp = max(0, offset_sp - tran_half_sp)
    tran_offset_sp = offset_sp + tran_half_sp
    tran_audio = y[tran_onset_sp : tran_offset_sp]
    tran_feature = extract_tran_features(tran_audio)
    # hammer takes precedence over slide when both are annotated
    if note_event["effects"]["hammer"]:
        tran_label = 1
    elif note_event["effects"]["slide"]:
        tran_label = 2
    else:
        tran_label = 0
    tran_feature_w_label = np.append(tran_feature, tran_label)
    trans_data.append(tran_feature_w_label)

# stack the per-event rows into 2D matrices (one row per event, label in the last column)
notes_data = np.array(notes_data)
trans_data = np.array(trans_data)

### Computing audio features
Frame-level features are aggregated over the duration of each note event. 

Statistics: mean, std, max, min, skewness, kurtosis

All stats of spectral centroid, brightness?, spread, skewness, kurtosis, flux, roll-off, entropy, irregularity, roughness, inharmonicity, zero-crossing, low-energy ratio and their 1st order difference = 156

All stats of pitch and 1st order difference = 12

Mean and std of 40-MFCC and 1st order difference = 160

In [73]:
# Temporary sample locations used while developing the feature extraction:
# AUDIO_DIR holds the audio clips, ANNO_DIR the JSON annotation files.
AUDIO_DIR = "samples/audio"
ANNO_DIR = "samples/anno"

In [74]:
def get_all_stats(a):
    """Given a 1D time series, compute the 6 statistics.

    The statistics are returned in this order: mean, standard deviation,
    maximum, minimum, skewness and kurtosis (``scipy.stats.kurtosis`` with its
    default settings).

    Args:
        a (array): The input time series, of the shape (n,) with n >= 1.
            Anything ``np.asarray`` can convert (e.g. a list) is accepted.

    Returns:
        array: An array containing the statistics, of the shape (6,)

    Raises:
        AssertionError: If the input is not one-dimensional.
        ValueError: If the input is empty, or if it contains NaN (skewness and
            kurtosis are computed with ``nan_policy="raise"``).
    """
    # Import the submodule explicitly: a bare `import scipy` does not load
    # `scipy.stats` on every SciPy version.
    from scipy import stats as sp_stats

    a = np.asarray(a)
    assert a.ndim == 1
    if a.size == 0:
        # fail early with a clear message instead of numpy's reduction error
        raise ValueError("cannot compute statistics of an empty time series")

    # Values are built in place so no local name shadows the built-in max()/min().
    stats = np.array([
        np.mean(a),
        np.std(a),
        np.max(a),
        np.min(a),
        sp_stats.skew(a, nan_policy="raise"),
        sp_stats.kurtosis(a, nan_policy="raise"),
    ])
    assert stats.shape == (6,)
    return stats


def extract_mfccs(note_audio):
    """Given the audio signal of a note event, compute the MFCC features.
    
    The MFCC features include the mean and std of the MFCCs aggregated over the note event, 
    and the mean and std of the 1st order differences aggregated over the note event.
    The output is laid out as [MFCC means, diff means, MFCC stds, diff stds], with
    20 coefficients in each group.

    Note:
        The sampling rate is read from the notebook-level variable ``sr``, so the
        audio must have been loaded (which sets ``sr``) before this is called.

    Args:
        note_audio (array): The note event audio signal

    Returns:
        array: The MFCC feature vector, of the shape (80,)
    """
    # number of coefficients; stated explicitly because the shape checks below rely on it
    n_mfcc = 20
    # audio is passed by keyword: librosa >= 0.10 rejects positional audio arguments here
    mfccs = librosa.feature.mfcc(y=note_audio, sr=sr, n_mfcc=n_mfcc)
    # frame-to-frame difference along the last (time) axis
    mfccs_diff = np.diff(mfccs, n=1, axis=-1)
    assert mfccs.shape[1] == mfccs_diff.shape[1] + 1

    # aggregate every coefficient over time; the order here fixes the output layout
    per_coeff_stats = (
        np.mean(mfccs, axis=1),
        np.mean(mfccs_diff, axis=1),
        np.std(mfccs, axis=1),
        np.std(mfccs_diff, axis=1),
    )
    for stat in per_coeff_stats:
        assert stat.shape == (n_mfcc,)

    mfcc_feature = np.concatenate(per_coeff_stats, axis=0)
    assert mfcc_feature.shape == (4 * n_mfcc,)
    return mfcc_feature


def extract_features(note_audio):
    """The one function that calculates all the features.

    The returned 1D feature vector is laid out as:

    * the 6 statistics of the F0 track, then of its 1st order difference (12 values)
    * the 6 statistics of each spectral/timbral track (centroid, bandwidth, flatness,
      roll-off, zero-crossing rate, onset strength), then the 6 statistics of each
      track's 1st order difference (72 values)
    * the MFCC features returned by ``extract_mfccs`` (80 values)

    Note:
        The sampling rate is read from the notebook-level variable ``sr``, so the
        audio must have been loaded (which sets ``sr``) before this is called.

    Args:
        note_audio (array): The audio signal of a note event.

    Returns:
        array: The final feature vector for the input note event, of the shape (164,)
    """
    # get pitch features
    # fill_na=None makes pyin give a best guess for unvoiced frames, so there's no NaN in f0
    # (the voiced flags and probabilities are not used)
    f0, _, _ = librosa.pyin(
        note_audio,
        fmin=librosa.note_to_hz("C2"),
        fmax=librosa.note_to_hz("G6"),
        sr=sr,
        fill_na=None,
    )
    pitch = f0
    pitch_diff = np.diff(pitch, n=1)

    assert pitch.ndim == 1
    assert pitch_diff.ndim == 1
    assert pitch.shape[0] == pitch_diff.shape[0] + 1

    # get spectral/timbral features
    # audio and sampling rate are passed by keyword: librosa >= 0.10 rejects
    # positional arguments for these functions
    centroid = np.squeeze(librosa.feature.spectral_centroid(y=note_audio, sr=sr))
    bandwidth = np.squeeze(librosa.feature.spectral_bandwidth(y=note_audio, sr=sr))
    flatness = np.squeeze(librosa.feature.spectral_flatness(y=note_audio))
    rolloff = np.squeeze(librosa.feature.spectral_rolloff(y=note_audio, sr=sr))
    zero_crossing = np.squeeze(librosa.feature.zero_crossing_rate(y=note_audio))
    # the onset strength envelope is used as the spectral flux track
    flux = librosa.onset.onset_strength(y=note_audio, sr=sr)
    # one row per track; stacking into a 2D array requires equal frame counts
    specs = np.array([centroid, bandwidth, flatness, rolloff, zero_crossing, flux])
    specs_diff = np.diff(specs, n=1)

    assert specs.ndim == 2
    assert specs_diff.ndim == 2
    assert specs.shape[1] == specs_diff.shape[1] + 1

    # collect the per-track statistics in output order and concatenate once at the end
    parts = [get_all_stats(pitch), get_all_stats(pitch_diff)]
    for track in specs:
        assert track.ndim == 1
        parts.append(get_all_stats(track))
    for track_diff in specs_diff:
        assert track_diff.ndim == 1
        parts.append(get_all_stats(track_diff))
    for part in parts:
        assert part.shape == (6,)

    feature = np.concatenate(parts, axis=0)
    assert feature.shape == (84,)

    mfcc_feature = extract_mfccs(note_audio)
    feature = np.concatenate((feature, mfcc_feature), axis=0)
    assert feature.shape == (164,)

    return feature


In [76]:
# Each row is a feature vector with its class label appended as the last element.
notes_data = [] # rows for note events (0: normal, 1: bend/release, 2: vibrato)
trans_data = [] # rows for transitions (0: normal, 1: hammer/pull, 2: slide)

file = "ACDC - Back In Black (ver 4 by GuitarManiac09)_Angus Young_6_13s.wav"
y, sr = librosa.load(os.path.join(AUDIO_DIR, file), sr=None)

# built from ANNO_DIR so the annotation location is configured in one place
# NOTE(review): the annotation times are used directly as positions in this clip;
# confirm they are relative to the start of the clip
anno_file = os.path.join(ANNO_DIR, "ACDC - Back In Black (ver 4 by GuitarManiac09)_Angus Young_6.json")

with open(anno_file) as anno:
    annotation = json.load(anno)

# half-width of the transition window (50 ms on each side of a note offset), in samples;
# computed once because it does not change inside the loop
tran_half_sp = librosa.time_to_samples(0.05, sr=sr)

for i, note_event in enumerate(annotation):
    onset = note_event["time"]["start"]
    offset = onset + note_event["time"]["dur"]
    onset_sp = librosa.time_to_samples(onset, sr=sr)
    offset_sp = librosa.time_to_samples(offset, sr=sr)
    note_audio = y[onset_sp : offset_sp]
    feature = extract_features(note_audio)
    # bend takes precedence over vibrato when both are annotated
    if note_event["effects"]["bend"]:
        label = 1
    elif note_event["effects"]["vibrato"]:
        label = 2
    else:
        label = 0
    feature_w_label = np.append(feature, label)
    notes_data.append(feature_w_label)

    # if the note event is the last one in the file, ignore its transitions 
    # because the transition won't be captured in the audio
    if i == len(annotation) - 1:
        break
    # clamp the window start at 0: a negative start index would wrap around to the
    # end of `y` and silently produce an empty (or wrong) slice
    tran_onset_sp = max(0, offset_sp - tran_half_sp)
    tran_offset_sp = offset_sp + tran_half_sp
    tran_audio = y[tran_onset_sp : tran_offset_sp]
    # transitions go through the same feature extractor as note events
    tran_feature = extract_features(tran_audio)
    # hammer takes precedence over slide when both are annotated
    if note_event["effects"]["hammer"]:
        tran_label = 1
    elif note_event["effects"]["slide"]:
        tran_label = 2
    else:
        tran_label = 0
    tran_feature_w_label = np.append(tran_feature, tran_label)
    trans_data.append(tran_feature_w_label)

# stack the per-event rows into 2D matrices (one row per event, label in the last column)
notes_data = np.array(notes_data)
trans_data = np.array(trans_data)

In [79]:
# last column = the class label appended to every note-event feature vector
notes_data[:, -1]

array([0., 0., 0., 0., 0., 0., 0., 0.])

In [80]:
# last column = the class label appended to every transition feature vector
trans_data[:, -1]

array([0., 0., 0., 0., 0., 0., 0.])