In [None]:
import numpy as np
import scipy.signal as signal
import torchaudio
from IPython.display import Audio
import torch
from matplotlib import pyplot as plt
import soundfile as sf
import time



In [None]:
# THIS NOTEBOOK: 

# Sometimes we need to crop the signal making sure that there is no silence in the cropped audio.
# To do that we can compute signal energy and decide based on that. 
# One way of computing signal energy is by computing convolution of a squared signal with a flat kernel. 
# It is equivalent to a moving average filter, where the convolution kernel size defines the length of the
# averaging window. However, for long signals convolution is quite expensive, and for cropping audio we 
# can use a less precise approach. So below, I am testing how to use strided convolution in pytorch to 
# pick a window with more or less the highest energy. This is equivalent to computing moving average with 
# a certain step-size. 

In [None]:
def convolve_torchaudio(sig, ir):
    """Convolve ``sig`` with ``ir`` using ``torchaudio.functional.convolve``.

    Both inputs are converted to float32 tensors before the call.
    ``torch.as_tensor`` is used instead of ``torch.tensor`` so that inputs which
    already are tensors are not copy-constructed (``torch.tensor(tensor)`` makes
    PyTorch emit a UserWarning); arrays and lists are converted as before.

    Args:
        sig: signal to be convolved (array-like or tensor).
        ir: kernel / impulse response to convolve the signal with
            (array-like or tensor).

    Returns:
        The float32 tensor returned by ``torchaudio.functional.convolve``
        called with its default settings.
    """
    sig_tensor = torch.as_tensor(sig, dtype=torch.float32)
    ir_tensor = torch.as_tensor(ir, dtype=torch.float32)
    return torchaudio.functional.convolve(sig_tensor, ir_tensor)

def convolve_torch2(sig, ir,device="cpu",stride=1000):
    """Strided "full" 1-D convolution of ``sig`` with ``ir``.

    With ``stride=1`` the result has ``N + M - 1`` samples (``N = len(sig)``,
    ``M = len(ir)``), i.e. a "full" convolution. With a larger stride only every
    ``stride``-th sample of that full result is computed.

    Args:
        sig: 1-D audio signal to be convolved (array-like or tensor);
            in conv1d terms the input.
        ir: 1-D impulse response to convolve the signal with (array-like or
            tensor); in conv1d terms the kernel.
        device: device on which the convolution is computed.
        stride: hop, in samples, between consecutive output samples.

    Returns:
        float32 tensor of shape ``(1, 1, L_out)`` located on ``device``.
    """
    # Make array inputs contiguous first: tensors cannot be created from numpy
    # arrays with negative strides (e.g. an array that was reversed by slicing).
    if not torch.is_tensor(sig):
        sig = np.ascontiguousarray(sig)
    if not torch.is_tensor(ir):
        ir = np.ascontiguousarray(ir)
    sig_tensor = torch.as_tensor(sig, dtype=torch.float32).to(device)
    ir_tensor = torch.as_tensor(ir, dtype=torch.float32).to(device)

    # conv1d computes a cross-correlation. To obtain an actual convolution the
    # kernel has to be flipped. torch.flip returns a copy, so it works for both
    # array and tensor kernels and never produces negative strides.
    ir_flipped = torch.flip(ir_tensor, dims=[0])

    # prepare the shapes so that they match what conv1d expects:
    sig_tensor = sig_tensor.unsqueeze(0).unsqueeze(0)  # Shape: (batch_size, in_channels, input_length)
    weight = ir_flipped.unsqueeze(0).unsqueeze(0)  # Shape: (out_channels, in_channels, kernel_size)

    # padding that allows for obtaining a "full" convolution, where the size of
    # the convolved output signal is N+M-1
    full_padding = weight.shape[2] - 1

    # The functional form is used because the kernel is fixed: there is no need
    # to allocate and randomly initialise a Conv1d layer just to overwrite its weights.
    return torch.nn.functional.conv1d(sig_tensor, weight, stride=stride, padding=full_padding)


In [None]:
# load audio (only the first 5 seconds are used):
audio, fs = sf.read('audios/speech_VCTK_4_sentences.wav')
audio=audio[0:5*fs]
# (D,) -> (1xD)
audio=np.reshape(audio, (1, -1))

# create a convolutional kernel consisting of equal weights:
# convolution of the squared signal with a flat, normalised kernel corresponds
# to a moving average, i.e. an energy envelope
window_lengths=[0.2, 0.5,1,3]  # averaging window lengths in seconds
for L in window_lengths:
    n_taps=round(L*fs)
    # plain arrays are passed in; convolve_torchaudio converts them to tensors
    # itself, so wrapping them in tensors here would only duplicate that step
    kernel=  np.ones((1,n_taps))/n_taps
    # convolve (only the convolution call is timed)
    start=time.time()
    envelope = convolve_torchaudio(audio**2,kernel)
    end=time.time()
    print("for window length "+str(L)+ " the convolution time is: " +str(end-start))

    plt.figure(figsize=(5,1))
    plt.subplot(1,2,1);plt.plot(audio.T);plt.title("audio signal")
    plt.subplot(1,2,2);plt.plot(envelope.numpy().T);plt.title("envelope")
    plt.tight_layout()
    plt.show()


In [None]:
# Actually, for envelope extraction we don't need to know the moving average for every audio sample, 
# we can check the segments energy with a higher step size (stride). To do that we can use the 
# pytorch 1d convolution. From previous analysis I know that the fastest pytorch convolution, which 
# allows to specify the stride is the nn.Conv1d function, so I can use it here: 

# load audio (only the first 5 seconds are used):
audio, fs = sf.read('audios/speech_VCTK_4_sentences.wav')
audio=audio[0:5*fs]

# create convolutional kernel consisting of equal weights
# (convolution with a flat kernel corresponds to a moving average)

step_len=1 # averaging step-size in seconds
win_len= 2 # averaging window in seconds
stride=round(step_len*fs)
# The kernel length is computed first and used for the normalisation, so that
# the kernel sums to 1 and this cell does not depend on any variable left over
# from another cell.
kernel_size=round(win_len*fs)
kernel= np.ones((kernel_size,))/kernel_size

# convolve with a specific stride:
start=time.time()
sparse_envelope = convolve_torch2(audio**2,kernel,stride=stride).detach().numpy().squeeze(0).squeeze(0)
audio_padded=np.concatenate((np.zeros(kernel_size-1),audio, np.zeros(kernel_size-1)))
# number of positions at which a full window fits into the padded signal
n_starts=audio_padded.shape[0]-kernel_size+1
envelope_padded=np.zeros(audio_padded.shape)
# place each strided output sample at the start index of its window
envelope_padded[0:n_starts:stride]=sparse_envelope
stop=time.time()
print("Execution time for convolution:"+ str(stop-start))

# compute moving average over exactly the same window positions:
start=time.time()
envelope_padded_check=np.zeros(audio_padded.shape)
for i in range(0,n_starts,stride):
    envelope_padded_check[i]=np.mean(audio_padded[i:i+kernel_size]**2)
stop=time.time()
print("Execution time for moving average:"+ str(stop-start))

# check if the result of the convolution is equivalent to the moving average
# (small differences are expected: the convolution runs in float32)
print("Max abs difference between both estimates:"+ str(np.max(np.abs(envelope_padded-envelope_padded_check))))

plt.figure(figsize=(10,5))
plt.subplot(3,1,1);plt.plot(audio_padded);plt.title("audio signal")
plt.subplot(3,1,2);plt.plot(envelope_padded,color="red");plt.title("energy computed using Conv1d with stride")
plt.subplot(3,1,3);plt.plot(envelope_padded_check,color="red");plt.title("energy computed using moving average window")
plt.tight_layout()
plt.show()


In [None]:
# ------- AUDIO ENERGY-BASED CROPPING FUNCTION USING MOVING AVERAGE -------
# Below a function that computes moving average of a tensor in a for loop, 
# with a given window length and step size, and later picks the window with
# the highest energy

# read the recording and keep its first 3 seconds:
audio, fs = sf.read('audios/speech_VCTK_4_sentences.wav')
audio = audio[:3*fs]

# length of the cropped audio we want to obtain, in seconds:
L_win_sec = 2
# hop of the moving average, in seconds (one sixth of the window):
stride_s = L_win_sec/6

# numpy array -> torch tensor with two leading singleton dimensions
audio_torch = torch.tensor(audio)[None, None]

def ma_based_crop_torch(audio,fs,L_win_sec,stride_s):
    """Crop the highest-energy window of ``audio`` using a looped moving average.

    The mean of the squared samples is evaluated for windows of
    ``int(L_win_sec*fs)`` samples whose start indices are ``int(stride_s*fs)``
    samples apart. Each mean is taken over all elements of the slice, i.e. over
    the first two dimensions as well.

    Args:
        audio: tensor indexed as (batch_size, in_channels, input_length).
        fs: sampling rate used to convert seconds to samples.
        L_win_sec: window (crop) length in seconds.
        stride_s: step between window starts in seconds.

    Returns:
        Tuple ``(audio_crop, energy, idx_start, idx_end)``. ``energy`` has one
        entry per input sample and is zero everywhere except at the evaluated
        window starts; ``audio_crop`` is ``audio[:, :, idx_start:idx_end]``.
    """
    n_samples = audio.size(2)
    win = int(L_win_sec * fs)
    hop = int(stride_s * fs)

    # energy estimate, stored at the start index of every evaluated window
    energy = torch.zeros(n_samples)
    for first in range(0, n_samples - win + 1, hop):
        segment = audio[:, :, first:first + win]
        energy[first] = segment.pow(2).mean()

    # the crop starts where the energy estimate is largest
    idx_start = int(energy.argmax())
    idx_end = idx_start + win
    return audio[:, :, idx_start:idx_end], energy, idx_start, idx_end

# run the moving-average based cropping and measure how long it takes
start = time.time()
audio_crop_torch, energy, idx_start, idx_end = ma_based_crop_torch(audio_torch, fs, L_win_sec, stride_s)
stop = time.time()
print("Execution time for pytorch moving average audio cropping function:"+ str(stop-start))

# put the crop back at its original position inside an all-zero signal,
# so that it can be plotted aligned with the full recording
audio_crop = audio_crop_torch[0, 0].numpy()
audio_crop_padded = np.zeros(audio.shape)
audio_crop_padded[idx_start:idx_end] = audio_crop

panels = [
    (audio, "audio"),
    (energy, "moving average without padding - energy estimate"),
    (audio_crop_padded, "audio cropped based on energy"),
]
plt.figure(figsize=(10,3))
for row, (trace, title) in enumerate(panels, start=1):
    plt.subplot(3, 1, row)
    plt.plot(trace)
    plt.title(title)
plt.tight_layout()



In [None]:
# ------- AUDIO ENERGY-BASED CROPPING FUNCTION USING CONVOLUTION -------
# Below a function that computes moving average of a tensor using strided convolution, 
# with a given window length (kernel size) and step size (stride), and later picks the
# window with the highest energy

# read the recording and keep its first 3 seconds:
audio, fs = sf.read('audios/speech_VCTK_4_sentences.wav')
audio = audio[:3*fs]

# length of the cropped audio we want to obtain, in seconds:
L_win_sec = 2
# hop of the moving average, in seconds (one sixth of the window):
stride_s = L_win_sec/6

# numpy array -> float32 torch tensor with two leading singleton dimensions
audio_torch = torch.tensor(audio, dtype=torch.float32)[None, None]

def conv_based_crop_torch(audio,fs,L_win_sec,stride_s):
    """Crop the highest-energy window of ``audio`` using a strided convolution.

    The squared signal is convolved (without padding) with a flat kernel of
    ``int(L_win_sec*fs)`` samples that sums to 1, using a stride of
    ``int(stride_s*fs)`` samples. This yields the mean energy of every
    evaluated window in a single call.

    Args:
        audio: floating point tensor of shape (batch_size, 1, input_length).
            Only the first batch item is used for the energy estimate.
        fs: sampling rate used to convert seconds to samples.
        L_win_sec: window (crop) length in seconds.
        stride_s: step between window starts in seconds.

    Returns:
        Tuple ``(audio_crop, energy, idx_start, idx_end)``. ``energy`` has one
        entry per input sample and is zero everywhere except at the evaluated
        window starts; ``audio_crop`` is ``audio[:, :, idx_start:idx_end]``.
    """
    L_audio_smpl=audio.shape[2] # Shape: (batch_size, in_channels, input_length)
    L_win_smpl=int(L_win_sec*fs)
    # a step shorter than one sample would give an invalid stride of 0
    stride_smpl=max(1,int(stride_s*fs))
    # flat averaging kernel, created with the dtype and on the device of the
    # input so that the convolution also works for float64 or non-CPU audio
    kernel=torch.ones((1,1,L_win_smpl),dtype=audio.dtype,device=audio.device)/L_win_smpl
    # the kernel is fixed, so the functional form is used instead of building
    # a Conv1d layer and overwriting its weights
    out_layer=torch.nn.functional.conv1d(audio**2,kernel,stride=stride_smpl)
    energy=torch.zeros(L_audio_smpl)
    # number of positions at which a full window fits into the signal; using an
    # explicit end index (not a negative one) also works for a 1-sample window
    n_starts=L_audio_smpl-L_win_smpl+1
    energy[0:n_starts:stride_smpl]=out_layer[0,0,:].detach().to(energy)
    # find the window with the highest energy
    idx_start=int(torch.argmax(energy))
    idx_end=idx_start+L_win_smpl
    audio_crop=audio[:,:,idx_start:idx_end]
    return audio_crop, energy, idx_start, idx_end

# run the convolution based cropping and measure how long it takes
start=time.time()
audio_crop_torch, energy, idx_start, idx_end=conv_based_crop_torch(audio_torch,fs,L_win_sec,stride_s)
stop=time.time()
print("Execution time for pytorch convolution-based audio cropping function:"+ str(stop-start))

# put the crop back at its original position inside an all-zero signal,
# so that it can be plotted aligned with the full recording
audio_crop=audio_crop_torch[0,0,:].numpy()
audio_crop_padded=np.zeros(audio.shape)
audio_crop_padded[idx_start:idx_end]=audio_crop

plt.figure(figsize=(10,3))
plt.subplot(3,1,1);plt.plot(audio);plt.title("audio")
plt.subplot(3,1,2);plt.plot(energy);plt.title("convolution without padding - energy estimate")
plt.subplot(3,1,3);plt.plot(audio_crop_padded);plt.title("audio cropped based on energy")
plt.tight_layout()