# REAL TIME AUDIO STREAM PROCESSING

# Feature Extraction

In [2]:
from transformers import AutoFeatureExtractor

In [None]:
# Instantiate the feature extractor from the "facebook/wav2vec2-base"
# checkpoint identifier. preprocess_function() below reads this module-level
# name, both to call it and to look up its sampling_rate attribute.
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")

In [None]:
import sounddevice as sd
import numpy as np

# Capture settings: a two-second mono clip at 16 kHz.
duration = 2  # seconds
sampling_rate = 16000

# Start the capture, then block on sd.wait() until it has finished.
print("Recording...")
audio_data = sd.rec(
    int(duration * sampling_rate),
    samplerate=sampling_rate,
    channels=1,
    dtype="float32",
)
sd.wait()
print("Recording finished.")

# Drop the length-1 channel axis so the clip becomes a flat vector of samples.
audio_data = audio_data.squeeze()
print(f"Recorded audio shape: {audio_data.shape}")


In [9]:
def preprocess_function(examples):
    """Run the module-level ``feature_extractor`` over a batch of examples.

    Each element of ``examples`` is indexed with ``"array"`` to obtain its raw
    waveform. All waveforms are handed to the extractor in one call, using the
    extractor's own ``sampling_rate`` and asking it to truncate with
    ``max_length=16000``.

    Returns:
        Whatever the feature extractor returns for the batch.
    """
    waveforms = [example["array"] for example in examples]
    return feature_extractor(
        waveforms,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=16000,
        truncation=True,
    )

In [None]:
# Smoke-test the preprocessing on the recorded clip, wrapped in the
# list-of-dicts shape that preprocess_function() indexes with "array".
preprocess_function([{"array": audio_data}])

In [None]:
# Bare expression: as the last statement of a notebook cell this echoes the
# recorded sample array so it can be inspected.
audio_data

# Pretrained Models

In [None]:
from transformers import Wav2Vec2ForSequenceClassification

# Load the pretrained model
# The checkpoint identifier is kept in a variable so the log line below
# reports exactly what was passed to from_pretrained(). Later cells read the
# resulting `model` global by name.
model_name_or_path = "7wolf/wav2vec2-base-gender-classification"
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name_or_path)

print(f"Loaded model: {model_name_or_path}")


In [None]:
import inspect

# Check the model's expected input format.
# inspect.signature() reports only the declared parameters of forward() and
# looks through functools.wraps-style decorators. The previous
# __code__.co_varnames lookup also listed every local variable of the function
# body (or of a decorator's wrapper), which made the "input names" misleading.
forward_parameters = inspect.signature(model.forward).parameters
print(f"Model expected input names: {tuple(forward_parameters)}")
print(f"Model expected input types: {model.forward.__annotations__}")


In [None]:
import torch

# audio_data is the 1-D float32 NumPy array produced by the recording cell,
# but the model's forward pass needs a batched tensor. Wrap the clip as a
# (1, num_samples) tensor before calling the model.
# NOTE(review): this feeds the raw samples and skips whatever preprocessing
# the feature extractor / processor applies, so treat it as a shape and dtype
# smoke test; the processor-based cell that follows is the one to use for
# real predictions.
input_values = torch.as_tensor(audio_data, dtype=torch.float32).unsqueeze(0)
with torch.no_grad():
    raw_outputs = model(input_values)
raw_outputs

In [None]:
# torch is used below (no_grad / argmax); import it here so this cell does not
# depend on a later cell having been executed first.
import torch
from transformers import Wav2Vec2Processor

# Load the processor for the specific model
# NOTE(review): the later cells load this same checkpoint through
# AutoFeatureExtractor. Wav2Vec2Processor presumably also expects tokenizer
# files in the checkpoint — confirm they exist, otherwise switch to the
# feature extractor here too.
processor = Wav2Vec2Processor.from_pretrained("7wolf/wav2vec2-base-gender-classification")

# Preprocess the audio data into PyTorch tensors
inputs = processor(audio_data, sampling_rate=sampling_rate, return_tensors="pt", padding=True)

# Move inputs to the same device as the model
inputs = {key: value.to(model.device) for key, value in inputs.items()}

# Run inference without tracking gradients
with torch.no_grad():
    logits = model(**inputs).logits

# Map each arg-max class index to its label through the model config.
# The loop variable is named class_id so it does not shadow the builtin `id`.
predicted_ids = torch.argmax(logits, dim=-1)
predicted_label = [model.config.id2label[class_id.item()] for class_id in predicted_ids]

print(f"Predicted label: {predicted_label}")


In [1]:
import torch
import numpy as np
import sounddevice as sd
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

def record_audio(duration: float = 2.0, sampling_rate: int = 16000) -> np.ndarray:
    """Capture a single-channel float32 clip through sounddevice.

    Args:
        duration: Length of the recording in seconds.
        sampling_rate: Capture rate in Hz.

    Returns:
        The recorded samples with all length-1 axes squeezed away.
    """
    print("Recording audio...")
    frame_count = int(duration * sampling_rate)
    recording = sd.rec(
        frame_count,
        samplerate=sampling_rate,
        channels=1,
        dtype="float32",
    )
    sd.wait()  # block until the capture has finished
    print("Recording complete.")
    return np.squeeze(recording)

def play_audio(audio: np.ndarray, sampling_rate: int = 16000) -> None:
    """
    Play audio using the sounddevice library.

    Args:
        audio: Samples to play back.
        sampling_rate: Playback rate in Hz, forwarded to ``sd.play``.

    Progress messages are printed before playback starts and after
    ``sd.wait()`` returns.
    """
    print("Playing back audio...")
    sd.play(audio, samplerate=sampling_rate)
    # Wait here so "Playback complete." is only printed once sd.wait() returns.
    sd.wait()
    print("Playback complete.")

def run_inference(model, feature_extractor, audio: np.ndarray, sampling_rate: int = 16000):
    """
    Preprocess the audio, run the model inference, and return the model's logits.

    Args:
        model: Callable module whose output exposes a ``logits`` attribute.
        feature_extractor: Callable that turns ``audio`` into a mapping of
            keyword arguments for ``model`` (called with
            ``return_tensors="pt"`` and ``padding="longest"``).
        audio: Waveform samples to classify.
        sampling_rate: Sampling rate of ``audio`` in Hz.

    Returns:
        ``outputs.logits`` as produced by the model, computed under
        ``torch.no_grad()``.
    """
    inputs = feature_extractor(audio, sampling_rate=sampling_rate, return_tensors="pt", padding="longest")

    # Put the inputs on the device that holds the model weights. Without this,
    # a model that has been moved off the CPU fails with a device mismatch.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        inputs = {name: value.to(first_param.device) for name, value in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        # Kept from the original: the full output object is echoed on every call.
        print(outputs)
    return outputs.logits

# End-to-end demo: load the checkpoint, record a short clip, play it back and
# classify it. The names bound here (model, feature_extractor, audio_data,
# logits, predicted_class) are module-level, so later cells can read them.
if __name__ == "__main__":
    # Alternative checkpoint, left disabled:
    #model_name_or_path = "alefiury/wav2vec2-large-xlsr-53-gender-recognition-librispeech"
    model_name_or_path = "7wolf/wav2vec2-base-gender-classification"

    # Load the feature extractor and model from the same checkpoint identifier
    feature_extractor = AutoFeatureExtractor.from_pretrained(model_name_or_path)
    model = AutoModelForAudioClassification.from_pretrained(model_name_or_path)

    # Print the audio input device name
    device_info = sd.query_devices(None, "input")
    print("Using audio input device:", device_info["name"])

    # Record audio, play it back, and run inference
    # Only 0.1 s is recorded here, at the same 16 kHz rate passed to every step.
    audio_data = record_audio(duration=0.1, sampling_rate=16000)
    play_audio(audio_data, sampling_rate=16000)
    logits = run_inference(model, feature_extractor, audio_data, sampling_rate=16000)

    # Display the result
    # The printed value is the arg-max class index, not a label name.
    predicted_class = torch.argmax(logits, dim=-1).item()
    print("Predicted class:", predicted_class)


Using audio input device: MacBook Pro Microphone
Recording audio...
Recording complete.
Playing back audio...
Playback complete.
SequenceClassifierOutput(loss=None, logits=tensor([[ 2.2427, -2.6925]]), hidden_states=None, attentions=None)
Predicted class: 0


In [None]:
import time
import numpy as np

# Generate a random audio sample of 100 ms (0.1 seconds) duration
duration_benchmark = 0.1  # seconds
sampling_rate = 16000
num_samples_benchmark = int(duration_benchmark * sampling_rate)
audio_benchmark = np.random.randn(num_samples_benchmark).astype("float32")

# Warm up the model (a single inference to avoid one-time overheads)
_ = run_inference(model, feature_extractor, audio_benchmark, sampling_rate=sampling_rate)

# Benchmark the inference time over multiple runs.
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring short intervals; time.time() follows the wall clock and can jump.
# NOTE: run_inference() also prints the model output on every call, so that
# printing cost is part of each measured interval.
num_runs = 100
total_time = 0.0
for _ in range(num_runs):
    start_time = time.perf_counter()
    _ = run_inference(model, feature_extractor, audio_benchmark, sampling_rate=sampling_rate)
    total_time += time.perf_counter() - start_time

avg_time_ms = (total_time / num_runs) * 1000
print(f"[Benchmark] Average inference time over {num_runs} runs: {avg_time_ms:.2f} ms")


In [None]:
# Report which devices sounddevice resolves for playback and for capture.
# Passing None as the device leaves the choice to sounddevice; the "output" /
# "input" kind selects which of the two is queried.
playback_info = sd.query_devices(None, "output")
print("Using audio playback device:", playback_info["name"])
input_info = sd.query_devices(None, "input")
print("Using audio input device:", input_info["name"])


In [None]:
import contextlib

import numpy as np

# Patch numpy to avoid the "_no_nep50_warning" attribute error.
# NumPy's own helper of this name is a context manager, entered as
# `with np._no_nep50_warning():`. The stand-in must therefore return a context
# manager when called: contextlib.nullcontext does, whereas a function
# returning None would fail as soon as it is used in a `with` statement.
if not hasattr(np, "_no_nep50_warning"):
    np._no_nep50_warning = contextlib.nullcontext

import sounddevice as sd
import librosa
import queue
import threading
import time  # Needed for our sleep loop
import torch  # Required for inference

# === CONFIGURATION ===
SAMPLE_RATE = 16000  # 16kHz sample rate (common for speech processing)
BUFFER_DURATION = 2.0  # Circular buffer stores 2 seconds of audio
CHUNK_DURATION = 0.5  # Process 500ms chunks
OVERLAP = 0.5  # 50% overlap
FRAME_SIZE = int(SAMPLE_RATE * CHUNK_DURATION)  # CHUNK_DURATION worth of samples (8000 with the values above)
HOP_SIZE = int(FRAME_SIZE * (1 - OVERLAP))  # Step size based on overlap (4000); also used as the stream blocksize
CONFIDENCE_THRESHOLD = 0.8  # Only display logits if the prediction confidence meets this threshold
AMPLITUDE_THRESHOLD = 0.01  # Minimum RMS amplitude required to run inference

# === CIRCULAR BUFFER ===
# Shared state written by audio_callback(); write_index is the position where
# the next incoming sample will be stored, and lock guards both.
BUFFER_SIZE = int(SAMPLE_RATE * BUFFER_DURATION)  # Total samples in buffer (32000)
circular_buffer = np.zeros(BUFFER_SIZE, dtype=np.float32)
write_index = 0
lock = threading.Lock()

# === QUEUE FOR PROCESSING (store both a buffer snapshot and its write index) ===
# Created without a maxsize, so it is unbounded: entries accumulate if the
# consumer falls behind the callback.
audio_queue = queue.Queue()

# === STOP EVENT FOR CLEAN SHUTDOWN ===
# process_audio() polls this event and leaves its loop once it is set.
stop_event = threading.Event()

def audio_callback(indata, frames, time_info, status):
    """Store one incoming block in the circular buffer and queue a snapshot.

    Column 0 of ``indata`` is written at ``write_index``, wrapping to the start
    of the buffer when the block runs past the end. A copy of the whole buffer
    and the updated write index are then put on ``audio_queue`` as a tuple.
    ``frames`` and ``time_info`` are not used; a truthy ``status`` is printed.
    """
    global circular_buffer, write_index

    if status:
        print("Error:", status)

    mono = indata[:, 0]
    with lock:
        block_len = indata.shape[0]
        next_index = (write_index + block_len) % BUFFER_SIZE
        if next_index <= write_index:
            # The block crosses the end of the buffer: fill the tail first,
            # then continue from position 0 with the remainder.
            tail_len = BUFFER_SIZE - write_index
            circular_buffer[write_index:] = mono[:tail_len]
            circular_buffer[:next_index] = mono[tail_len:]
        else:
            circular_buffer[write_index:next_index] = mono
        write_index = next_index
        # Copy while still holding the lock so the snapshot and index agree.
        queued_item = (circular_buffer.copy(), next_index)

    audio_queue.put(queued_item)

def process_audio():
    """Worker loop: classify the newest FRAME_SIZE samples of each snapshot.

    Runs until ``stop_event`` is set. Each queue entry is a
    ``(buffer snapshot, write index)`` pair; the frame ending at the write
    index is extracted, skipped when its RMS amplitude is below
    ``AMPLITUDE_THRESHOLD``, and otherwise passed to ``run_inference``. The
    predicted class and its softmax confidence are printed, and the logits too
    when the confidence reaches ``CONFIDENCE_THRESHOLD``.
    """
    while not stop_event.is_set():
        try:
            # The timeout lets the loop re-check stop_event regularly.
            snapshot, head = audio_queue.get(timeout=0.5)
        except queue.Empty:
            continue

        # The frame ends at `head`. A negative start means it begins near the
        # end of the buffer and wraps around to the front.
        frame_start = head - FRAME_SIZE
        if frame_start < 0:
            chunk = np.concatenate((
                snapshot[BUFFER_SIZE + frame_start:],
                snapshot[:head],
            ))
        else:
            chunk = snapshot[frame_start:head]

        # Skip frames whose RMS amplitude is too low to be worth classifying.
        rms_amplitude = np.sqrt(np.mean(chunk * chunk))
        if rms_amplitude < AMPLITUDE_THRESHOLD:
            print(f"Detected noise (RMS amplitude: {rms_amplitude:.4f}), skipping inference.")
            continue

        logits = run_inference(model, feature_extractor, chunk, sampling_rate=SAMPLE_RATE)
        predicted_class = torch.argmax(logits, dim=-1).item()
        probs = torch.softmax(logits, dim=-1)
        if probs.ndim > 1:
            confidence = probs[0, predicted_class].item()
        else:
            confidence = probs[predicted_class].item()

        if confidence >= CONFIDENCE_THRESHOLD:
            print("Logits:", logits)
        print("Predicted class:", predicted_class, "with confidence:", confidence)

# === START AUDIO STREAM AND PROCESSING THREAD ===
# The stream delivers HOP_SIZE-sample blocks to audio_callback; the worker
# thread consumes the queued snapshots in process_audio.
stream = sd.InputStream(
    samplerate=SAMPLE_RATE,
    channels=1,
    callback=audio_callback,
    blocksize=HOP_SIZE
)
processing_thread = threading.Thread(target=process_audio)
processing_thread.start()

print("Starting real-time audio inference... (Press Ctrl+C to stop)")
try:
    with stream:
        # Keep the stream open and let the callback/processing thread run continuously.
        while True:
            time.sleep(0.1)
except KeyboardInterrupt:
    print("Stopping real-time audio inference...")
finally:
    # Signal the processing thread to stop and wait for it to finish.
    # Doing this in `finally` also covers failures other than Ctrl+C (for
    # example the stream failing to open). Otherwise the non-daemon worker
    # would keep polling forever and stop the interpreter from exiting.
    stop_event.set()
    processing_thread.join()
