In [3]:
import numpy as np
import wave

In [4]:
def get_channel_activity(filename, frame_duration_ms=500, noise_threshold=5000, volume_gain=0.1):
    """
    Label each fixed-length window of a stereo WAV file with its louder channel.

    The file is read window by window. For every window the summed absolute
    amplitude of each channel is computed after scaling the samples by
    ``volume_gain``. Windows in which both channels stay below
    ``noise_threshold`` are dropped; every other window is labelled with the
    channel whose sum is larger.

    Because the measure is a sum (not a mean), it grows with the window
    length, so ``noise_threshold`` has to be chosen together with
    ``frame_duration_ms``. The last window of a file may be shorter than the
    others and therefore produces a proportionally smaller sum.

    Args:
        filename (str): Path to a stereo .wav file with 16-bit samples.
        frame_duration_ms (int): Duration of each analysis window in milliseconds.
        noise_threshold (int): A window is skipped when the summed absolute
            amplitude (after gain) of both channels is below this value.
        volume_gain (float): Factor applied to every sample before the sums
            are computed. Integer gains are handled in floating point, so
            they cannot overflow the 16-bit sample range.

    Returns:
        list: ``[timestamp_seconds, label]`` pairs in file order, one per kept
        window. ``label`` is ``"spk0"`` when the left channel is louder,
        ``"spk1"`` when the right channel is louder and ``None`` when both
        sums are equal.

    Raises:
        ValueError: If the file is not stereo, if its samples are not 16-bit,
            or if ``frame_duration_ms`` yields a window of less than one sample.
    """
    with wave.open(filename, 'rb') as wf:
        if wf.getnchannels() != 2:
            raise ValueError("The function only supports stereo audio files.")

        # The buffer is decoded as int16 below; any other sample width would be
        # silently misinterpreted, so reject it up front.
        sample_width = wf.getsampwidth()
        if sample_width != 2:
            raise ValueError(
                f"The function only supports 16-bit audio files "
                f"(got {8 * sample_width}-bit samples)."
            )

        frame_rate = wf.getframerate()
        n_frames = wf.getnframes()

        frames_per_window = int(frame_rate * frame_duration_ms / 1000)
        if frames_per_window <= 0:
            raise ValueError(
                "frame_duration_ms is too short: each window must contain at least one sample."
            )

        activity_timestamps = []

        # readframes() continues where the previous call stopped, so the file
        # is walked window by window without explicit seeking.
        for window_start in range(0, n_frames, frames_per_window):
            raw = wf.readframes(frames_per_window)

            # Cast to float64 *before* scaling: with an integer gain the product
            # would otherwise stay int16 and wrap around on loud samples.
            samples = np.frombuffer(raw, dtype=np.int16).astype(np.float64) * volume_gain

            # Stereo samples are interleaved: left, right, left, right, ...
            left_channel = samples[0::2]
            right_channel = samples[1::2]

            # Summed absolute amplitude of each channel over this window
            left_energy = np.sum(np.abs(left_channel))
            right_energy = np.sum(np.abs(right_channel))

            # Both channels quiet: treat the window as background noise
            if left_energy < noise_threshold and right_energy < noise_threshold:
                continue

            # Window start time in seconds
            time_stamp = window_start / frame_rate
            if left_energy > right_energy:
                activity_timestamps.append([time_stamp, "spk0"])
            elif right_energy > left_energy:
                activity_timestamps.append([time_stamp, "spk1"])
            else:
                activity_timestamps.append([time_stamp, None])

        return activity_timestamps

In [5]:
# Recording to analyse. "./audio_files/conversation1.wav" is an alternative
# sample; assign it to `filename` instead to analyse that file.
filename = "./audio_files/donald_trump.wav"

# Label every 500 ms window of the recording with its louder channel.
result = get_channel_activity(
    filename,
    frame_duration_ms=500,
    noise_threshold=5000,
    volume_gain=0.1,
)

In [6]:
# Show the number of labelled windows followed by the complete list of
# [timestamp, label] pairs (a shallow copy of `result`).
len(result), list(result)

(94,
 [[0.0, 'spk0'],
  [0.5, 'spk1'],
  [1.0, 'spk0'],
  [1.5, 'spk0'],
  [2.0, 'spk1'],
  [2.5, 'spk0'],
  [3.0, 'spk0'],
  [3.5, 'spk0'],
  [4.0, 'spk1'],
  [4.5, 'spk0'],
  [5.0, 'spk1'],
  [5.5, 'spk1'],
  [6.0, 'spk0'],
  [6.5, 'spk0'],
  [7.0, 'spk0'],
  [7.5, 'spk0'],
  [8.0, 'spk0'],
  [8.5, 'spk1'],
  [9.0, 'spk0'],
  [9.5, 'spk1'],
  [10.0, 'spk0'],
  [10.5, 'spk1'],
  [11.0, 'spk1'],
  [11.5, 'spk1'],
  [12.0, 'spk1'],
  [12.5, 'spk0'],
  [13.0, 'spk0'],
  [13.5, 'spk0'],
  [14.0, 'spk0'],
  [14.5, 'spk0'],
  [15.0, 'spk1'],
  [15.5, 'spk1'],
  [16.0, 'spk1'],
  [16.5, 'spk0'],
  [17.0, 'spk0'],
  [17.5, 'spk1'],
  [18.0, 'spk0'],
  [18.5, 'spk0'],
  [19.0, 'spk1'],
  [19.5, 'spk0'],
  [20.0, 'spk0'],
  [20.5, 'spk0'],
  [21.0, 'spk0'],
  [21.5, 'spk0'],
  [22.0, 'spk1'],
  [22.5, 'spk0'],
  [23.0, 'spk1'],
  [23.5, 'spk0'],
  [24.0, 'spk0'],
  [24.5, 'spk0'],
  [25.0, 'spk0'],
  [25.5, 'spk0'],
  [26.0, 'spk0'],
  [26.5, 'spk1'],
  [27.0, 'spk1'],
  [27.5, 'spk1'],
  [28.0