In [None]:
"""source: https://github.com/dsholes/python-warpdrive/blob/master/warpdrive.py"""


import os
import librosa
import numpy as np
import soundfile as sf
import json
import sys
from pathlib import Path
from scipy.io import wavfile

In [None]:
# The project root is taken to be the parent of the current working
# directory, so this resolves relative to wherever the kernel was started.
ROOT_DIR = os.path.split(os.getcwd())[0]
# All audio fixtures used below are read from <ROOT_DIR>/data.
DATA_FOLDER = os.path.join(ROOT_DIR, "data")

In [None]:
# Paths to the test recordings, one left/right file pair per take (A, B, C).
# Every file is expected to live directly inside DATA_FOLDER.
audio_keparoicamL_A_path, audio_keparoicamR_A_path = (
    os.path.join(DATA_FOLDER, "keparoicam_left_test_A.wav"),
    os.path.join(DATA_FOLDER, "keparoicam_right_test_A.wav"),
)

audio_keparoicamL_B_path, audio_keparoicamR_B_path = (
    os.path.join(DATA_FOLDER, "keparoicam_left_test_B.wav"),
    os.path.join(DATA_FOLDER, "keparoicam_right_test_B.wav"),
)

audio_keparoicamL_C_path, audio_keparoicamR_C_path = (
    os.path.join(DATA_FOLDER, "keparoicam_left_test_C.wav"),
    os.path.join(DATA_FOLDER, "keparoicam_right_test_C.wav"),
)

In [None]:
# Pair of recordings whose relative delay is measured further down.
# Currently take B; point these at the A or C paths to analyse another take.
target_left, target_right = audio_keparoicamL_B_path, audio_keparoicamR_B_path

In [None]:
def dtw_shift_param(sig1, sig2, sr):
    """
    Align two signals with Dynamic Time Warping (DTW) on chroma features.

    Each signal is turned into a chroma representation with
    `librosa.feature.chroma_stft`, and the two feature sequences are then
    aligned with `librosa.sequence.dtw`.

    Parameters
    ----------
    sig1, sig2
        Audio signals, forwarded as `y` to `chroma_stft`.
    sr
        Sample rate shared by both signals.

    Returns
    -------
    tuple
        `(wp, hop_size)`: the warping path returned by DTW and the hop
        length (in samples) used for the chroma frames.
    """
    # Adapted from the librosa documentation. The DTW metric was switched
    # to 'euclidean' because it proved much more robust; the reason for
    # that is still an open question.

    # Analysis window of sr/10*2 samples (0.2 s of audio), hop of half that.
    n_fft = int((sr / 10.) * 2.)
    hop_size = int(n_fft / 2.)

    # Both signals must be analysed with exactly the same settings.
    chroma_settings = dict(sr=sr, tuning=0, norm=2,
                           hop_length=hop_size, n_fft=n_fft)
    chroma_1, chroma_2 = (
        librosa.feature.chroma_stft(y=signal, **chroma_settings)
        for signal in (sig1, sig2)
    )

    # Only the warping path is needed; the accumulated cost matrix is dropped.
    _, warp_path = librosa.sequence.dtw(X=chroma_1, Y=chroma_2,
                                        metric='euclidean')
    return (warp_path, hop_size)

In [None]:
def pseudo_hist_time_shift(wp, sr, hop_size):
    """
    Tabulate how often each time shift occurs along a warping path.

    The shift of a path step is column 0 minus column 1 of `wp`, measured
    in frames. The most frequent shift is meant to be treated as the
    actual time shift by the caller.

    A proper test to determine confidence in the result is still needed.

    Parameters
    ----------
    wp
        2-D array of path steps with (at least) two columns.
    sr
        Sample rate, used to convert to seconds.
    hop_size
        Number of samples per frame step.

    Returns
    -------
    tuple
        `(tdiff_sec, tdiff_count)`: the distinct shifts converted to
        seconds, and the number of path steps showing each shift.
    """
    frame_shift = wp[:, 0] - wp[:, 1]
    distinct_shift, occurrences = np.unique(frame_shift, return_counts=True)

    # frames -> samples -> seconds
    shift_seconds = distinct_shift * hop_size / sr
    return (shift_seconds, occurrences)

In [None]:
def find_delay_sec(sig1, sig2, sr):
    """
    Estimate the time shift between two signals, in seconds.

    Both signals must have the same sample rate.

    The shift is found by aligning the signals with Dynamic Time Warping,
    building a pseudo histogram of the per-step shifts, and taking the
    count-weighted average of the most frequent shift and its immediate
    neighbours in that histogram.

    Parameters
    ----------
    sig1, sig2
        Audio signals to compare.
    sr
        Sample rate shared by both signals.

    Returns
    -------
    tuple
        `(time_shift, confidence)`. `time_shift` is in seconds.
        `confidence` is the fraction of all warping-path steps that fall
        in the peak bin or the bins directly next to it.
    """
    # Use Dynamic Time Warping (DTW)
    wp, hop_size = dtw_shift_param(sig1, sig2, sr)

    # Build Pseudo Histogram of time shift "guesses"
    tdiff_sec, tdiff_count = pseudo_hist_time_shift(wp, sr, hop_size)

    # Need a better confidence metric...
    peak = int(tdiff_count.argmax())

    # Clamp the neighbourhood to the histogram bounds. Without this, a peak
    # in the first bin would pick up the *last* bin through index -1, and a
    # peak in the last bin (or a single-bin histogram) would raise IndexError.
    first = max(peak - 1, 0)
    stop = min(peak + 2, len(tdiff_count))
    nearest_counts = tdiff_count[first:stop]
    nearest_tdiff = tdiff_sec[first:stop]
    confidence = nearest_counts.sum()/tdiff_count.sum()

    # Weighted average of the peak and its (up to 2) nearest neighbours.
    # NOTE(review): "neighbours" are adjacent entries of the histogram, i.e.
    # the closest shifts that were actually observed; they are not guaranteed
    # to be exactly one frame away from the peak.
    time_shift = (nearest_tdiff*nearest_counts).sum()/nearest_counts.sum()
    return (time_shift, confidence)

In [None]:
# Load both target recordings; sr=None is passed so that librosa keeps each
# file's native sample rate instead of resampling.
(audio1, samplerate1), (audio2, samplerate2) = (
    librosa.load(audio_path, sr=None)
    for audio_path in (target_left, target_right)
)

# find_delay_sec requires both signals to share one sample rate.
assert samplerate1 == samplerate2

# `delay` is the (time_shift_seconds, confidence) tuple from find_delay_sec.
delay = find_delay_sec(audio1, audio2, samplerate1)
print(delay)