In [None]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
from IPython.display import Audio
import numpy as np

In [None]:
from functools import partial

In [None]:
import essentia
import essentia.standard as es
import librosa

In [None]:
# Path of the recording analysed by this notebook. Both paths below are
# absolute and machine-specific; point `audio_file` at a local file before
# running the notebook elsewhere.
# Alternative input, currently disabled:
# audio_file = '/media/Samples/SOL_0.9_HQ/Winds/Flute/ordinario/Fl-ord-C5-mf.wav'
audio_file = "/home/kureta/Music/chorales/01-[Vierstimmige Chorgesänge]-=Hilf,Gott,dass mir's gelinge=,BWV 343.mp3"

In [None]:
# --- Analysis configuration ---------------------------------------------
sample_rate = 44100   # rate handed to the loader and the analysis algorithms
frame_length = 1024   # analysis frame size, in samples
hop_length = 512      # distance between consecutive frames, in samples
lowest_note = 'c2'
highest_note = 'e6'
max_freq = librosa.note_to_hz(highest_note)
min_freq = librosa.note_to_hz(lowest_note)
# Number of MIDI notes from lowest_note to highest_note, both included.
pitch_range = librosa.note_to_midi(highest_note) - librosa.note_to_midi(lowest_note) + 1

# Derived helpers: built from the constants above (rather than repeating the
# literal values) so that changing a constant keeps the axis labels in sync.
fft_frequencies = librosa.fft_frequencies(sr=sample_rate, n_fft=frame_length)


def fft_bin_to_freq(x):
    """Return the entry of ``fft_frequencies`` at index ``x``.

    ``x`` is used directly as an index, so anything that can index a NumPy
    array (integer, slice, integer array) is accepted.
    """
    return fft_frequencies[x]


# Frame index -> time, with the frame geometry bound once.
frames_to_duration = partial(librosa.frames_to_time, sr=sample_rate, hop_length=hop_length, n_fft=frame_length)
silence_threshold = 36  # dB, passed as `top_db` when trimming the recording

In [None]:
# Load the file as mono at `sample_rate`, normalize it, then trim the leading
# and trailing parts that fall below the silence threshold. The second value
# returned by the trim call is not used.
loader = es.MonoLoader(filename=audio_file, sampleRate=sample_rate)
audio, _ = librosa.effects.trim(
    librosa.util.normalize(loader()),
    top_db=silence_threshold,
    frame_length=frame_length,
    hop_length=hop_length,
)

In [None]:
def plot_signal(signal, rate=None):
    """Plot a 1-D sequence with the x axis labelled in seconds.

    Parameters
    ----------
    signal : sequence or 1-D array
        Values to draw, in order.
    rate : float, optional
        Number of elements of ``signal`` per second; only used to convert
        x positions into the tick labels. Defaults to the notebook-level
        ``sample_rate``. Pass a frame rate when plotting frame-wise data.

    Returns
    -------
    None
    """
    # Global matplotlib setting, retained so figure sizes stay as before.
    plt.rcParams['figure.figsize'] = (19, 6)

    if rate is None:
        rate = sample_rate
    length = len(signal)

    fig, ax1 = plt.subplots(1, 1)
    # Eleven evenly spaced ticks from 0 to the signal length. Unlike a
    # float-step arange this never runs past the data for short inputs and
    # does not fail on an empty one.
    ax1.set_xticks(np.linspace(0, length, 11))
    ax1.xaxis.set_major_formatter(mpl.ticker.FuncFormatter(lambda x, _: f'{x/rate:.2f}'))
    ax1.plot(signal)

In [None]:
# Waveform of the loaded, normalized and trimmed recording.
plot_signal(audio)

In [None]:
# Inline player for the preprocessed recording.
Audio(data=audio, rate=sample_rate)

In [None]:
import pyworld as pw

In [None]:
# WORLD analysis through pyworld's convenience wrapper. The samples are cast
# to double precision first. The three results (per pyworld's documentation:
# fundamental frequency, spectral envelope, aperiodicity) are reused by the
# synthesis cell further down.
f0, sp, ap = pw.wav2world(audio.astype(np.double), sample_rate)    # use default options

In [None]:
# Discard the analysed aperiodicity: replace it with zeros of the same shape
# and dtype.
# NOTE(review): presumably this makes the resynthesis purely periodic — confirm.
ap = np.zeros_like(ap)

In [None]:
# f0 track as returned by the analysis.
# NOTE(review): plot_signal labels the x axis as index / sample_rate, while f0
# presumably holds one value per analysis frame, so these tick labels are
# likely not seconds — confirm.
plot_signal(f0)

In [None]:
from scipy.signal import savgol_filter

In [None]:
# Savitzky-Golay smoothing of the f0 track: 51-point window, cubic polynomial.
f0_hat = savgol_filter(f0, window_length=51, polyorder=3)

In [None]:
# Overwrite every element in place with the constant 110 (presumably Hz),
# which replaces the smoothed values with a flat contour.
f0_hat.fill(110.)

In [None]:
# Contour that is passed to the synthesis step.
plot_signal(f0_hat)

In [None]:
# Resynthesize a waveform from the modified f0 contour together with the
# current sp and ap arrays.
y = pw.synthesize(f0_hat, sp, ap, sample_rate)

In [None]:
# Waveform of the resynthesized signal.
plot_signal(y)

In [None]:
# Inline player for the resynthesized signal.
Audio(data=y, rate=sample_rate)

In [None]:
# Frame-wise analysis with essentia: one spectrum and one loudness value per
# Hann-windowed frame, plus a predominant-pitch track for the whole signal.
window = es.Windowing(type='hann')
get_spectrum = es.Spectrum()
# The sample rate is passed explicitly so the pitch tracker follows
# `sample_rate` instead of silently relying on the algorithm's own default.
get_melodia_pitches = es.PredominantPitchMelodia(
    sampleRate=sample_rate,
    maxFrequency=max_freq, minFrequency=min_freq,
    frameSize=frame_length, hopSize=hop_length,
    guessUnvoiced=True,
)
pitch_filter = es.PitchFilter(useAbsolutePitchConfidence=True)
eq_loudness = es.EqualLoudness(sampleRate=sample_rate)
get_loudness = es.Loudness()

# Per-frame results are collected in lists first; the array names are bound
# only once, to the final arrays.
frame_spectra = []
frame_loudnesses = []

for frame in es.FrameGenerator(audio, frameSize=frame_length, hopSize=hop_length, startFromZero=True):
    windowed_frame = window(frame)
    frame_spectra.append(get_spectrum(windowed_frame))
    frame_loudnesses.append(get_loudness(windowed_frame))

spectra = essentia.array(frame_spectra)
loudnesses = essentia.array(frame_loudnesses)

# The pitch tracker runs on the equal-loudness filtered signal; its pitch and
# confidence outputs are then passed through the pitch filter.
pitches, confidences = get_melodia_pitches(eq_loudness(audio))
filtered_pitches = pitch_filter(pitches, confidences)

In [None]:
def plot_spectrum(spect):
    """Show a 2-D array as an image with time on x and frequency on y.

    Parameters
    ----------
    spect : 2-D array
        Axis 0 is treated as the frame index (labelled through
        ``frames_to_duration``), axis 1 as the FFT bin index (labelled
        through ``fft_bin_to_freq``).

    Returns
    -------
    None
    """
    # Global matplotlib setting, retained so figure sizes stay as before.
    plt.rcParams['figure.figsize'] = (19, 6)

    n_frames, n_bins = spect.shape[0], spect.shape[1]

    def _bin_label(x, _):
        # Tick positions can arrive as floats; a bin has to be addressed
        # with an integer index.
        return f'{fft_bin_to_freq(int(round(x))):.2f}'

    fig, ax1 = plt.subplots(1, 1)
    # Eleven evenly spaced ticks across the frames; never overshoots for
    # short inputs.
    ax1.set_xticks(np.linspace(0, n_frames, 11))
    ax1.xaxis.set_major_formatter(mpl.ticker.FuncFormatter(lambda x, _: f'{frames_to_duration(x):.2f}'))
    # The step is clamped to at least 1 so fewer than 10 bins does not give
    # a zero step.
    ax1.set_yticks(np.arange(0, n_bins, max(1, n_bins // 10)))
    ax1.yaxis.set_major_formatter(mpl.ticker.FuncFormatter(_bin_label))
    # Transposed so that frames run along x; origin='lower' puts bin 0 at the bottom.
    ax1.imshow(spect.T, aspect='auto', interpolation='none', origin='lower')

In [None]:
# Image of the per-frame spectra computed in the analysis cell.
plot_spectrum(spectra)

In [None]:
def plot_feature(feature):
    """Plot a frame-wise 1-D feature with the x axis labelled in seconds.

    Parameters
    ----------
    feature : sequence or 1-D array
        One value per frame; frame indices are converted to time labels
        through ``frames_to_duration``.

    Returns
    -------
    None
    """
    # Global matplotlib setting, retained so figure sizes stay as before.
    plt.rcParams['figure.figsize'] = (19, 6)

    n_frames = len(feature)

    fig, ax1 = plt.subplots(1, 1)
    # Eleven evenly spaced ticks from 0 to the number of frames. Unlike a
    # float-step arange this never runs past the data for short inputs and
    # does not fail on an empty one.
    ax1.set_xticks(np.linspace(0, n_frames, 11))
    ax1.xaxis.set_major_formatter(mpl.ticker.FuncFormatter(lambda x, _: f'{frames_to_duration(x):.2f}'))
    ax1.plot(feature)

In [None]:
# One figure per frame-wise feature: filtered pitch converted to MIDI note
# numbers, absolute confidence, and loudness. Pitch and confidence drop their
# first and last element.
# Unfiltered variant, disabled: plot_feature(librosa.hz_to_midi(pitches))
for feature in (
    librosa.hz_to_midi(filtered_pitches[1:-1]),
    np.abs(confidences[1:-1]),
    loudnesses,
):
    plot_feature(feature)

In [None]:
# Inline player for the preprocessed recording (same call as the earlier
# player cell).
Audio(data=audio, rate=sample_rate)

In [None]:
# Shapes of the three frame-wise features, with pitch and confidence trimmed
# by one element at each end as in the plots.
# NOTE(review): presumably a check that the frame counts line up — confirm.
filtered_pitches[1:-1].shape, loudnesses.shape, confidences[1:-1].shape