In [11]:
import pyaudio

p = pyaudio.PyAudio()
info = p.get_host_api_info_by_index(0)
numdevices = info.get('deviceCount')

for i in range(0, numdevices):
    if (p.get_device_info_by_host_api_device_index(0, i).get('maxInputChannels')) > 0:
        print("Input Device id ", i, " - ", p.get_device_info_by_host_api_device_index(0, i).get('name'))

Input Device id  2  -  USB AUDIO  CODEC
Input Device id  3  -  MacBook Air Microphone
Input Device id  5  -  krisp microphone


In [32]:
import librosa
import numpy as np

# Initialize audio input
in_stream = pyaudio.PyAudio().open(
    format=pyaudio.paFloat32,
    channels=1,
    rate=44100,
    input=True,
    input_device_index=2,
    frames_per_buffer=1024,
)

# Initialize audio output
out_stream = pyaudio.PyAudio().open(
    format=pyaudio.paFloat32,
    channels=1,
    rate=44100,
    output=True,
    frames_per_buffer=1024,
)

# buffer = np.array([])
# while True:
#     # Read audio data from the microphone
#     audio_data = np.frombuffer(in_stream.read(1024, exception_on_overflow=False), dtype=np.float32)
#     buffer = np.concatenate((buffer, audio_data))

#     # Detect onsets in the audio data
#     onsets = librosa.onset.onset_detect(y=buffer, sr=44100, units='samples')

#     if len(onsets) > 1:
#         # Extract the segment with relevant sound
#         start, end = onsets[:2]
#         sound_segment = buffer[start:end]

#         # Play the sound segment
#         out_stream.write(sound_segment.tobytes())

#         # Remove the processed segment from the buffer
#         buffer = buffer[end:]

In [200]:
# grab a couple seconds of audio
audio_data = in_stream.read(44100 * 6, exception_on_overflow=False)

In [201]:
# play the audio data
out_stream.write(audio_data)

In [237]:
# detect onsets in the audio data
audio_np = np.frombuffer(audio_data, dtype=np.float32)
audio_np = audio_np / np.max(np.abs(audio_np)) # normalize the audio data

onset_env = librosa.onset.onset_strength(
    y=audio_np,
    sr=44100,
    hop_length=512,
)

onsets = librosa.onset.onset_detect(
    y=audio_np,
    sr=44100,
    onset_envelope=onset_env,
    units='samples',
)

onsets

array([ 24576,  38912,  54272,  68096,  69632,  83456,  91136,  96768,
       111104, 124416, 125952, 140800, 155648, 170496, 179712, 181248,
       183808, 185344, 200704, 215552, 229376, 243200, 244736, 260096])

In [274]:
import time

chunks = []
last_onset = 0

for onset in onsets:
    # if this onset is too close (6000 samples) to the last one, skip it)
    if onset < last_onset + 6000:
        continue
    last_onset = onset
    start, end = onset, onset + 6000

    print(start, end, end - start)

    out_stream.write(audio_np[start:end].tobytes())

    chunks.append(audio_np[start:end])

    time.sleep(0.5)

24576 30576 6000
38912 44912 6000
54272 60272 6000
68096 74096 6000
83456 89456 6000
91136 97136 6000
111104 117104 6000
124416 130416 6000
140800 146800 6000
155648 161648 6000
170496 176496 6000
179712 185712 6000
200704 206704 6000
215552 221552 6000
229376 235376 6000
243200 249200 6000
260096 266096 6000


In [275]:
# create dataframe of a 6000 sample window around each onset
import pandas as pd

# Create an empty DataFrame with columns numbered from 0 to 5999
column_names = [str(i) for i in range(6000)]
df = pd.DataFrame(columns=column_names)

# Append each audio sample as a row in the DataFrame
for audio_sample in chunks:
    # pad sample with zeros if it's shorter than 6000 samples
    if len(audio_sample) < 6000:
        audio_sample = np.pad(audio_sample, (0, 6000 - len(audio_sample)), 'constant')

    df = pd.concat([df, pd.DataFrame(audio_sample.reshape(1, -1), columns=column_names)], ignore_index=True)

df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5990,5991,5992,5993,5994,5995,5996,5997,5998,5999
0,-0.014132,-0.01664,-0.035371,-0.04181,-0.028681,-0.009365,-0.01664,-0.020069,-0.02241,-0.011874,...,-0.00577,-0.006021,-0.006439,-0.006355,-0.006271,-0.005853,-0.005602,-0.005853,-0.006104,-0.006104
1,0.004265,0.010369,0.000502,-0.00485,-0.009783,-0.002509,0.011539,0.005519,0.001756,0.010954,...,0.005268,0.005435,0.005435,0.005268,0.005184,0.005101,0.005017,0.005017,0.005017,0.004515
2,0.018982,0.00577,0.022912,0.015971,0.024835,0.044903,0.025587,0.007108,0.002676,0.015051,...,-0.009951,-0.009616,-0.009031,-0.008613,-0.008613,-0.008947,-0.009198,-0.009282,-0.009198,-0.009282
3,0.005602,-0.005686,0.008278,0.023581,-0.014717,-0.008195,0.012459,0.01455,-0.010369,-0.001422,...,0.014132,0.013881,0.013964,0.014299,0.013546,0.013212,0.014132,0.01455,0.013964,0.013463
4,-0.014968,-0.002759,-0.016473,-0.043147,-0.052847,-0.030939,-0.048917,-0.081361,-0.076846,-0.018982,...,0.007024,0.006857,0.00694,0.006857,0.006773,0.006606,0.006522,0.006439,0.006355,0.006271
5,-0.007693,-0.007777,-0.007777,-0.007777,-0.007442,-0.007442,-0.007358,-0.006857,-0.006606,-0.006606,...,0.002007,-0.006606,-0.018898,0.020319,0.029434,0.012543,-0.013463,-0.033113,-0.047997,0.01062
6,0.311815,0.475458,0.608245,0.54829,0.431307,0.380467,0.399448,0.443432,0.450707,0.399448,...,0.012292,0.011958,0.011874,0.011623,0.011539,0.011372,0.011121,0.010452,0.010118,0.0097
7,0.004934,-0.013212,-0.022494,-0.015219,-0.012877,0.021072,0.000669,-0.03211,-0.006439,0.006188,...,-0.005519,-0.004432,-0.003178,-0.002425,-0.002007,-0.001672,-0.001087,-0.001254,-0.001756,-0.002174
8,-0.037879,-0.054436,-0.061627,-0.048583,-0.071662,-0.066477,-0.077431,-0.107367,-0.093152,-0.07693,...,0.003428,0.003345,0.003512,0.003345,0.003679,0.003763,0.003846,0.003763,0.003763,0.003679
9,0.010787,0.008864,0.043984,0.005519,-0.011121,0.009533,0.010369,-0.011958,-0.023748,-0.008864,...,0.004683,0.004348,0.004348,0.004348,0.004515,0.004766,0.004766,0.004934,0.004934,0.004934


In [276]:
# loop through each row, play it, and wait for the user to enter a label and save into y
y = []
for i, row in df.iterrows():
    out_stream.write(row.to_numpy().tobytes())
    label = input('Enter label: ')
    y.append(label)

In [283]:
y

['clap',
 'clap',
 'kick',
 'hat',
 'clap',
 'clap',
 'kick',
 'hat',
 'clap',
 'clap',
 'kick',
 'hat',
 'clap',
 'clap',
 'kick',
 'hat',
 'clap']

In [359]:
# grab a new audio sample
test_audio = in_stream.read(44100, exception_on_overflow=False)

In [360]:
out_stream.write(test_audio)

In [365]:
test_np = np.frombuffer(test_audio, dtype=np.float32)
test_np = test_np / np.max(np.abs(test_np)) # normalize the audio data
test_onset_env = librosa.onset.onset_strength(
    y=test_np,
    sr=44100,
    hop_length=512,
)
test_onsets = librosa.onset.onset_detect(
    y=test_np,
    sr=44100,
    onset_envelope=test_onset_env,
    units='samples',
)

last_onset = 0
test_chunk = np.array(np.zeros(6000))

for onset in test_onsets:
    if onset < last_onset + 6000:
        continue
    last_onset = onset
    start, end = onset, onset + 6000
    print(start, end, end - start)
    test_chunk = test_np[start:end]
    out_stream.write(test_np[start:end].tobytes())
    break

test_df = pd.DataFrame(test_chunk.reshape(1, -1), columns=column_names)
test_df

31744 37744 6000


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5990,5991,5992,5993,5994,5995,5996,5997,5998,5999
0,-0.503356,-0.237481,0.083118,0.266908,0.451471,0.6381,0.663139,0.575116,0.513681,0.427207,...,-0.003872,-0.005679,-0.006711,-0.007486,-0.008518,-0.009551,-0.010325,-0.010067,-0.010325,-0.009293


In [366]:
# lets try a clustering algorithm
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=3, random_state=0).fit(df)



In [375]:
# play one of the samples from each cluster
for cluster in range(3):
    cluster_df = df[kmeans.labels_ == cluster]
    cluster_sample = cluster_df.sample(1)
    out_stream.write(cluster_sample.to_numpy().tobytes())
    time.sleep(0.5)

In [391]:
librosa.feature.mfcc(y=df.iloc[0].to_numpy(), sr=44100, n_mfcc=13)

array([[-2.0890125e+02, -1.8886427e+02, -2.1768362e+02, -2.8549487e+02,
        -3.6066760e+02, -4.2946552e+02, -4.5941989e+02, -4.8026559e+02,
        -5.0236172e+02, -5.2347742e+02, -5.3506226e+02, -5.2888873e+02],
       [ 6.9580757e+01,  6.0988480e+01,  5.0151752e+01,  4.1276825e+01,
         5.9350555e+01,  8.5005829e+01,  9.5390892e+01,  9.7882904e+01,
         1.0368300e+02,  1.0479058e+02,  1.1122952e+02,  1.1373392e+02],
       [-1.4707133e+01, -1.1972187e+01, -9.8092022e+00, -1.0133264e+01,
        -1.9207752e+01, -1.3033042e+01, -1.3580223e+01, -2.1235422e+01,
        -2.0093237e+01, -1.6270098e+01, -1.0193535e+01, -3.1935809e+00],
       [ 1.0354469e+01,  1.7755112e+01,  2.3915045e+01,  3.3184635e+01,
         3.6182087e+01,  3.4423630e+01,  3.4101234e+01,  3.6074306e+01,
         3.6728203e+01,  3.2951958e+01,  3.1922642e+01,  2.5566099e+01],
       [ 4.7250252e+00,  8.0277901e+00,  1.1186962e+01,  1.1712753e+01,
         1.4068144e+01,  1.0074052e+01,  8.0269117e+00,  1.2

In [389]:
# that didn't work, lets try to get the mfccs of each sample and use those to cluster
df_mfccs = pd.DataFrame(columns=[str(i) for i in range(13)])
for i, row in df.iterrows():
    mfccs = librosa.feature.mfcc(y=row.to_numpy(), sr=44100, n_mfcc=13)
    mfccs_df
    df_mfccs = pd.concat([df_mfccs, pd.DataFrame(mfccs.T)], ignore_index=True)

df_mfccs

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3.1,4.1,5.1,6.1,7.1,8.1,9.1,10,11,12
0,,,,,,,,,,,...,10.354469,4.725025,12.854246,-12.373757,-13.363389,9.854835,-3.442081,-14.214624,-11.098188,-13.376860
1,,,,,,,,,,,...,17.755112,8.027790,18.986023,-6.721361,-8.503116,9.855391,-2.134540,-16.018963,-10.807072,-17.301935
2,,,,,,,,,,,...,23.915045,11.186962,24.590078,-4.044400,-5.281628,6.509623,3.056053,-15.283569,-10.337276,-23.587456
3,,,,,,,,,,,...,33.184635,11.712753,19.882694,-10.285540,-11.183927,0.715391,8.121153,-8.682934,-7.072352,-19.959036
4,,,,,,,,,,,...,36.182087,14.068144,24.853069,-6.975927,-5.892183,0.880104,9.771854,-1.832492,-4.466112,-19.178562
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,,,,,,,,,,,...,11.428398,20.846687,33.852783,3.932690,10.350405,13.050755,10.240345,1.975070,-0.415487,-20.393448
200,,,,,,,,,,,...,16.511400,16.022650,18.945278,2.765263,7.811350,12.114024,3.530072,7.210748,-2.022112,-0.059123
201,,,,,,,,,,,...,18.425020,9.443244,10.463894,4.989586,10.017050,11.343133,5.163934,7.058941,1.100207,3.528537
202,,,,,,,,,,,...,17.343090,5.573296,8.483372,6.623830,10.542232,7.837983,4.334030,7.123930,1.244357,1.827660
