In [1]:
import librosa
import pyaudio
import queue
from typing import Optional
import numpy as np
import time
import matplotlib.pyplot as plt

# Default sample rate (Hz) for StreamProcessor and the recording cell.
SAMPLE_RATE = 22050
# Hop length (samples) passed to every librosa.feature.chroma_stft call in this notebook.
HOP_LENGTH = 256
# Number of input channels requested when the PyAudio stream is opened.
CHANNELS = 1
# FFT window size (samples) passed to every librosa.feature.chroma_stft call in this notebook.
N_FFT = 512

In [4]:
class StreamProcessor:
    """Capture microphone audio through PyAudio and turn it into chroma frames.

    PyAudio invokes ``_process_frame`` for every captured chunk. Each chunk is
    stored raw in ``buffer`` and its chroma representation is pushed onto
    ``chroma_buffer`` for a consumer to ``get()``.

    Args:
        sample_rate: Sample rate (Hz) requested from the input device and
            passed to librosa when computing chroma features.
        chunk_size: Number of frames per PyAudio buffer, i.e. per callback.
    """

    def __init__(self, sample_rate=SAMPLE_RATE, chunk_size=2048):
        self.chunk_size = chunk_size
        self.channels = CHANNELS
        self.sample_rate = sample_rate
        self.format = pyaudio.paFloat32
        self.audio_interface: Optional[pyaudio.PyAudio] = None
        self.audio_stream: Optional[pyaudio.Stream] = None
        self.buffer = queue.Queue()  # raw byte chunks exactly as delivered by PyAudio
        self.chroma_buffer = queue.Queue()  # one chroma matrix per captured chunk
        self.last_chunk = None  # trailing HOP_LENGTH samples of the previous chunk
        self.is_mic_open = False

    def _chroma(self, samples):
        """Return the chroma STFT of ``samples`` using this stream's sample rate."""
        # Pass sr explicitly: without it librosa falls back to its own default
        # rate, which is wrong whenever sample_rate differs from that default.
        return librosa.feature.chroma_stft(
            y=samples, sr=self.sample_rate, hop_length=HOP_LENGTH, n_fft=N_FFT
        )

    def _process_frame(self, data, frame_count, time_into, status_flag):
        """PyAudio stream callback: queue the raw chunk and its chroma frames.

        Args:
            data: Raw bytes of the captured chunk, interpreted as float32 samples.
            frame_count: Supplied by PyAudio; unused.
            time_into: Supplied by PyAudio; unused.
            status_flag: Supplied by PyAudio; unused.

        Returns:
            ``(data, pyaudio.paContinue)`` so that the stream keeps running.
        """
        self.buffer.put(data)

        query_audio = np.frombuffer(data, dtype=np.float32)
        query_chroma_stft = self._chroma(query_audio)

        if self.last_chunk is None:
            # First chunk: keep the leading frame, drop only the trailing frame
            # that was computed with zero padding.
            self.chroma_buffer.put(query_chroma_stft[:, :-1])
        else:
            # Recompute the frame that straddles the chunk boundary from real
            # samples (tail of the previous chunk + head of this one), dropping
            # the first and last frames that were computed with zero padding.
            boundary_audio = np.concatenate((self.last_chunk, query_audio[:HOP_LENGTH]))
            boundary_chroma = self._chroma(boundary_audio)[:, 1:-1]
            self.chroma_buffer.put(
                np.concatenate((boundary_chroma, query_chroma_stft[:, 1:-1]), axis=1)
            )

        # Remember the tail of this chunk for the next boundary frame.
        self.last_chunk = query_audio[query_audio.shape[0] - HOP_LENGTH:]

        return (data, pyaudio.paContinue)

    def run(self):
        """Open the microphone stream and start delivering chunks to the callback.

        Does nothing if the stream is already open. If the stream cannot be
        opened, the PyAudio instance is terminated before the error propagates.
        """
        if self.is_mic_open:
            # Opening a second stream would orphan (and leak) the first one.
            return

        # A new recording must not stitch its first chunk onto stale audio.
        self.last_chunk = None

        interface = pyaudio.PyAudio()
        try:
            stream = interface.open(
                format=self.format,
                channels=self.channels,
                rate=self.sample_rate,
                input=True,
                frames_per_buffer=self.chunk_size,
                stream_callback=self._process_frame,
            )
        except Exception:
            interface.terminate()
            raise

        self.audio_interface = interface
        self.audio_stream = stream
        self.is_mic_open = True
        self.audio_stream.start_stream()
        print("* Recording in progress....")

    def stop(self):
        """Stop and close the stream and terminate PyAudio; no-op if not open.

        Cleanup is exception-safe: the stream is closed and PyAudio terminated
        even if an earlier cleanup step raises.
        """
        if not self.is_mic_open:
            return
        try:
            try:
                self.audio_stream.stop_stream()
            finally:
                self.audio_stream.close()
        finally:
            self.is_mic_open = False
            self.audio_interface.terminate()
        print("Recording Stopped.")

In [18]:
from synctoolbox.dtw.mrmsdtw import sync_via_mrmsdtw
from synctoolbox.dtw.core import compute_warping_path

# Record RECORD_SECONDS of microphone audio chunk by chunk and, after every
# chunk, re-align everything heard so far against the reference recording.
RECORD_SECONDS = 15
CHUNK_SIZE = 2048

chunks = np.array([])
query_chroma_stft = np.array([])
query_chroma_chunks = []  # per-chunk chroma matrices, in arrival order

ref_audio, ref_sr = librosa.load("../resources/audio/target/Happy_Birthday_To_You_C_Major.wav")
ref_chroma_stft = librosa.feature.chroma_stft(y=ref_audio, sr=ref_sr, hop_length=HOP_LENGTH, n_fft=N_FFT)
# sync_via_mrmsdtw returns only the warping path, so the accumulated-cost and
# step-index matrices are not produced by this cell and stay None.
cost_matrix = None
step_index_matrix = None
warping_path = None

sp = StreamProcessor(SAMPLE_RATE, CHUNK_SIZE)
sp.run()

start = time.time()
try:
    for _ in range(int(SAMPLE_RATE / CHUNK_SIZE * RECORD_SECONDS)):
        chroma_stft = sp.chroma_buffer.get()  # (12, 8)

        # Always append. Testing the accumulated matrix with .any() would throw
        # away every frame received so far whenever the chroma is all zeros.
        query_chroma_chunks.append(chroma_stft)
        query_chroma_stft = np.concatenate(query_chroma_chunks, axis=1)
        query_pointer = query_chroma_stft.shape[1]

        # The returned alignment already is the warping path, shape (2, T).
        # It must not be fed to compute_warping_path, which expects a cost matrix.
        # NOTE(review): input_feature_rate is left at the library default although
        # the features here are produced at SAMPLE_RATE / HOP_LENGTH frames/s - confirm.
        warping_path = sync_via_mrmsdtw(
            f_chroma1=ref_chroma_stft[:, :query_pointer],
            f_chroma2=query_chroma_stft,
        )
finally:
    # Release the microphone even when the alignment raises.
    end = time.time()
    sp.stop()

duration = end - start
print(f"duration: {duration}")

# Row 0 of the path indexes f_chroma1 (reference), row 1 indexes f_chroma2 (query).
fig, ax = plt.subplots()
ax.plot(warping_path[0], warping_path[1])
ax.set(xlabel="reference frame", ylabel="query frame", title="Warping path")
plt.show()

* Recording in progress....


  return f(*args, **kwargs)


AssertionError: Failed in nopython mode pipeline (step: nopython rewrites)
key already in dictionary: 'index.5'