# Record Audio

In [37]:
import pyaudio

CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
RECORD_SECONDS = 10

# audio_buffer=b''
audio_buffer=[]

p = pyaudio.PyAudio()
stream = p.open(format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                frames_per_buffer=CHUNK)

print("* recording")

for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
    data = stream.read(CHUNK)
    # audio_buffer += data
    audio_buffer.append(data)

print("* done recording")

stream.stop_stream()
stream.close()
p.terminate()

* recording
* done recording


In [38]:
audio_buffer_copy=audio_buffer.copy()

In [366]:
audio_buffer=audio_buffer_copy.copy()

In [23]:
# Join Entire data from audio_buffer
audio_data_bytes = b''.join(audio_buffer)

In [4]:
len(audio_data_bytes)

880640

# Testing VADIterator

In [55]:
from silero_vad import SileroVAD, VADIterator

# Initialize SileroVAD
onnx_model = './silero_vad.onnx'
vad = SileroVAD(onnx_model=onnx_model)

In [56]:
vad_iterator = VADIterator()

In [57]:
import numpy as np

audio_data = np.frombuffer(audio_data_bytes, dtype=np.int16)

for timestamp, frame in vad_iterator(audio_data, use_energy=False, return_seconds=False):
    if timestamp:
        print(timestamp, frame)

{'start': 20416} []
{'end': 24640} []
{'start': 29632} []
{'end': 48704} []
{'start': 65472} []
{'end': 85568} []
{'start': 140224} []
{'end': 158272} []
{'start': 408000} []
{'end': 424512} [-286. -328. -318. -265. -195. -139. -118. -135. -187. -251. -300. -314.
 -292. -254. -223. -219. -233. -251. -254. -235. -201. -165. -135. -108.
  -88.  -77.  -83. -107. -139. -163. -160. -132.  -92.  -65.  -62.  -78.
  -96.  -97.  -80.  -52.  -27.   -7.   -1.   -8.  -26.  -57.  -93. -118.
 -123. -110.  -86.  -64.  -57.  -63.  -79.  -91.  -88.  -60.  -11.   51.
  104.  131.  123.   92.   62.   60.   97.  160.  220.  250.  242.  214.
  199.  213.  255.  300.  326.  319.  290.  254.  227.  211.  201.  188.
  162.  124.   77.   34.    3.   -7.   -1.   16.   27.   22.    4.  -10.
   -3.   32.   79.  117.  133.  137.  156.  207.  286.  369.  429.  454.
  462.  475.  505.  551.  598.  626.  621.  581.  522.  459.  410.  376.
  350.  317.  267.  206.  145.  100.   76.   67.   62.   53.   40.   38.
   52.

# Combined Message Chunks

In [369]:
from silero_vad import SileroVAD

combined_message_chunks=[]

# Initialize SileroVAD
onnx_model = './silero_vad.onnx'
vad = SileroVAD(onnx_model=onnx_model)

In [370]:
combined_message_chunks

[]

# Main Logic using SileroVAD

In [371]:
import numpy as np
import soundfile as sf
import numpy as np
import tempfile
import os

# buffer to store combined 20 audio-chunks as one chunk
buffer=[]

# buffer index
i=0

# dictionary to store if buffer index has speech
hasSpeech={}

while len(audio_buffer)!=0:
    if len(combined_message_chunks) < 20:
        combined_message_chunks.append(audio_buffer.pop(0))
        
    if len(combined_message_chunks) ==20:
        
        combined_message = b''.join(combined_message_chunks)
        buffer.append(combined_message)

        combined_message_chunks.clear()
        audio_array=np.frombuffer(combined_message, dtype=np.int16)

        # Create tmppath as vad expects wav path 
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmpfile:
            sf.write(tmpfile.name,audio_array, RATE)
            tmpfile_path = tmpfile.name

        speech_timestamps = vad.get_speech_timestamps(tmpfile_path)
        hasSpeech[i] = any(speech_timestamps)
        # for speech in speech_timestamps:
        #     if speech:
        #         print(i,'contains speech', speech)

        # increate buffer index
        i=i+1

# \x02\x00

In [372]:
hasSpeech

{0: False,
 1: True,
 2: True,
 3: True,
 4: True,
 5: False,
 6: False,
 7: True,
 8: True,
 9: False,
 10: True,
 11: False,
 12: False,
 13: False,
 14: False,
 15: False,
 16: True,
 17: True,
 18: False,
 19: False,
 20: False}

# Reconstruct Audio

In [389]:
true_indexes = [k for k, v in hasSpeech.items() if v]
speech_segments=[]

for i in true_indexes:
    audio_array = np.frombuffer(buffer[i], dtype=np.int16)
    speech_segments.append(audio_array)

sf.write('fullaudio.wav',np.concatenate(speech_segments),44100)

In [319]:
len(buffer)

21

----------------------------------------------

In [118]:
import numpy as np

# Assuming your byte string is stored in audio_buffer
np.frombuffer(audio_data_bytes, dtype=np.int16)

array([ 0,  0, -1, ...,  4,  3,  3], dtype=int16)

In [119]:
len(np.frombuffer(audio_data_bytes, dtype=np.int16))

212992

# Check if speech is there in each chunk

In [51]:
from silero_vad import SileroVAD
import soundfile as sf
import numpy as np
import tempfile
import os

# Hardcode RATE
RATE=44100

# Initialize SileroVAD
onnx_model = './silero_vad.onnx'
vad = SileroVAD(onnx_model=onnx_model)

speech_segments = []

# Total number of samples processed so far
total_samples = 0

for i in range(len(audio_buffer)):

    # audio_array = np.frombuffer(audio_buffer[i], dtype=np.int16)
    audio_array = np.frombuffer(audio_data_bytes, dtype=np.int16)

    # Create a temporary WAV file to store the audio data
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmpfile:
        sf.write(tmpfile.name,audio_array, RATE)
        tmpfile_path = tmpfile.name

    # Get speech timestamps for the temporary WAV file
    speech_timestamps = vad.get_speech_timestamps(tmpfile_path)

    for speech in speech_timestamps:
        # Adjust the timestamps relative to the entire audio stream
        # start = speech['start'] 
        # end = speech['end'] 
        # print('chunk', i, 'contains speech', {'start': start, 'end': end})
        # speech_segments.append(audio_array[speech['start']:speech['end']])
        print(speech)
    total_samples += len(audio_array)

    # Clean up the temporary file
    os.unlink(tmpfile_path)

{'start': 29632} []
{'end': 48704} []
{'start': 65472} []

{'start': 25489, 'end': 73294}
{'start': 94638, 'end': 119863}
{'start': 25489, 'end': 73294}
{'start': 94638, 'end': 119863}
{'start': 25489, 'end': 73294}
{'start': 94638, 'end': 119863}
{'start': 25489, 'end': 73294}
{'start': 94638, 'end': 119863}
{'start': 25489, 'end': 73294}
{'start': 94638, 'end': 119863}
{'start': 25489, 'end': 73294}
{'start': 94638, 'end': 119863}
{'start': 25489, 'end': 73294}
{'start': 94638, 'end': 119863}
{'start': 25489, 'end': 73294}
{'start': 94638, 'end': 119863}
{'start': 25489, 'end': 73294}
{'start': 94638, 'end': 119863}
{'start': 25489, 'end': 73294}
{'start': 94638, 'end': 119863}
{'start': 25489, 'end': 73294}
{'start': 94638, 'end': 119863}
{'start': 25489, 'end': 73294}
{'start': 94638, 'end': 119863}
{'start': 25489, 'end': 73294}
{'start': 94638, 'end': 119863}
{'start': 25489, 'end': 73294}
{'start': 94638, 'end': 119863}
{'start': 25489, 'end': 73294}
{'start': 94638, 'end': 119863}
{'start': 25489, 'end': 73294}
{'start': 94638, 'end': 

KeyboardInterrupt: 

In [201]:
speech_segments

[]

In [199]:
sf.write('test.wav',np.concatenate(speech_segments),44100)

ValueError: need at least one array to concatenate

# Apply VAD to entire audio array

In [197]:
# Initialize SileroVAD
onnx_model = '../using-fastapi/silero-vad-basics/silero_vad.onnx'
vad = SileroVAD(onnx_model=onnx_model)

speech_segments = []

with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmpfile:
    audio_array = np.frombuffer(audio_data_bytes, dtype=np.int16)
    sf.write(tmpfile.name,audio_array, RATE)
    tmpfile_path = tmpfile.name

# Get speech timestamps for the temporary WAV file
speech_timestamps = vad.get_speech_timestamps(tmpfile_path)
for speech in speech_timestamps:
    print('contains speech', speech)
    speech_segments.append(audio_array[speech['start']:speech['end']])

sf.write('test.wav',np.concatenate(speech_segments),44100)

contains speech {'start': 55125, 'end': 74705}
contains speech {'start': 91816, 'end': 115630}
contains speech {'start': 138385, 'end': 159377}
contains speech {'start': 183544, 'end': 205947}
contains speech {'start': 232936, 'end': 255339}
contains speech {'start': 276683, 'end': 301908}
contains speech {'start': 330309, 'end': 355534}
contains speech {'start': 417803, 'end': 438272}
