Skip to content
Browse files
add complex support to convolve_filter
  • Loading branch information
slarew committed Feb 20, 2020
1 parent 8a969d0 commit f3bc9bb02773324731c59ea811b8ca4c616b0212
Showing 6 changed files with 356 additions and 58 deletions.
@@ -68,3 +68,48 @@ reverb_RR.apply(tmp4, audio[1]);
audio[0] = tmp1 + tmp2;
audio[1] = tmp3 + tmp4;

# Implementation Details

The convolution filter efficiently computes the convolution of two signals.
The efficiency is achieved by employing the FFT and the circular convolution
theorem. The algorithm is a variant of the [overlap-add
method](https://en.wikipedia.org/wiki/Overlap%E2%80%93add_method). It works on
a fixed block size \(B\) for arbitrarily long input signals. Thus, the
convolution of a streaming input signal with a long FIR filter \(h[n]\) (where
the length of \(h[n]\) may exceed the block size \(B\)) is computed with a
fixed complexity \(O(B \log B)\).

More formally, the convolution filter computes \(y[n] = (x * h)[n]\) by
partitioning the input \(x\) and filter \(h\) into blocks and applying the
overlap-add method. Let \(x[n]\) be an input signal of arbitrary length. Often,
\(x[n]\) is a streaming input with unknown length. Let \(h[n]\) be an FIR
filter with \(M\) taps. The convolution filter works on a fixed block size \(B\).

First, the input and filter are windowed and shifted to the origin to give the
\(k\)-th block input \(x_k[n] = x[n + kB] , n=\{0,1,\ldots,B-1\},\forall
k\in\mathbb{Z}\) and \(j\)-th block filter \(h_j[n] = h[n + jB] ,
n=\{0,1,\ldots,B-1\},j=\{0,1,\ldots,\lfloor M/B \rfloor\}\). The convolution
\(y_{k,j}[n] = (x_k * h_j)[n]\) is efficiently computed with length \(2B\) FFTs
(each length-\(B\) block is zero-padded to length \(2B\)):
\[ y_{k,j}[n] = \mathrm{IFFT}(\mathrm{FFT}(x_k[n])\cdot\mathrm{FFT}(h_j[n])) . \]

The overlap-add method sums the "overlap" from the previous block with the current block.
To complete the \(k\)-th block, the contribution of all blocks of the filter
are summed together to give
\[ y_{k}[n] = \sum_j y_{k-j,j}[n] . \]
The final convolution is then the sum of the shifted blocks
\[ y[n] = \sum_k y_{k}[n - kB] . \]
Note that \(y_k[n]\) is of length \(2B\) so its second half overlaps and adds
into the first half of the \(y_{k+1}[n]\) block.

## Maximum efficiency criterion

To avoid excess computation or maximize throughput, the convolution filter
should be given input samples in multiples of the block size \(B\). Otherwise,
the FFT of a block is computed twice as many times as would be necessary and
hence throughput is reduced.
@@ -50,4 +50,6 @@ if (ENABLE_DFT)
target_link_libraries(dft kfr_multidft)
target_compile_definitions(dft PRIVATE -DKFR_DFT_MULTI=1)
endif ()
add_executable(ccv ccv.cpp)
target_link_libraries(ccv kfr kfr_dft use_arch)
endif ()
@@ -0,0 +1,71 @@
/**
 * ccv, part of KFR
 * Copyright (C) 2019 D Levin
 * See LICENSE.txt for details
 */

// Complex convolution filter examples

#define CMT_BASETYPE_F32

#include <chrono>
#include <kfr/base.hpp>
#include <kfr/dft.hpp>
#include <kfr/dsp.hpp>

using namespace kfr;

int main()

// low-pass filter
univector<fbase, 1023> taps127;
expression_pointer<fbase> kaiser = to_pointer(window_kaiser(taps127.size(), 3.0));
fir_lowpass(taps127, 0.2, kaiser, true);

// Create filters.
size_t const block_size = 256;
convolve_filter<complex<fbase>> conv_filter_complex(univector<complex<fbase>>(make_complex(taps127, zeros())),
convolve_filter<fbase> conv_filter_real(taps127, block_size);

// Create noise to filter.
auto const size = 1024 * 100 + 33; // not a multiple of block_size
univector<complex<fbase>> cnoise =
make_complex(truncate(gen_random_range(random_bit_generator{ 1, 2, 3, 4 }, -1.f, +1.f), size),
truncate(gen_random_range(random_bit_generator{ 3, 4, 9, 8 }, -1.f, +1.f), size));
univector<fbase> noise =
truncate(gen_random_range(random_bit_generator{ 3, 4, 9, 8 }, -1.f, +1.f), size);

// Filter results.
univector<complex<fbase>> filtered_cnoise_ccv(size), filtered_cnoise_fir(size);
univector<fbase> filtered_noise_ccv(size), filtered_noise_fir(size);

// Complex filtering (time and compare).
auto tic = std::chrono::high_resolution_clock::now();
conv_filter_complex.apply(filtered_cnoise_ccv, cnoise);
auto toc = std::chrono::high_resolution_clock::now();
auto const ccv_time_complex = std::chrono::duration_cast<std::chrono::duration<float>>(toc - tic);
tic = toc;
filtered_cnoise_fir = kfr::fir(cnoise, taps127);
toc = std::chrono::high_resolution_clock::now();
auto const fir_time_complex = std::chrono::duration_cast<std::chrono::duration<float>>(toc - tic);
auto const cdiff = rms(cabs(filtered_cnoise_fir - filtered_cnoise_ccv));

// Real filtering (time and compare).
tic = std::chrono::high_resolution_clock::now();
conv_filter_real.apply(filtered_noise_ccv, noise);
toc = std::chrono::high_resolution_clock::now();
auto const ccv_time_real = std::chrono::duration_cast<std::chrono::duration<float>>(toc - tic);
tic = toc;
filtered_noise_fir = kfr::fir(noise, taps127);
toc = std::chrono::high_resolution_clock::now();
auto const fir_time_real = std::chrono::duration_cast<std::chrono::duration<float>>(toc - tic);
auto const diff = rms(filtered_noise_fir - filtered_noise_ccv);

println("complex: convolution_filter ", ccv_time_complex.count(), " fir ", fir_time_complex.count(), " diff=", cdiff);
println("real: convolution_filter ", ccv_time_real.count(), " fir ", fir_time_real.count(), " diff=", diff);

return 0;
@@ -84,6 +84,9 @@ class convolve_filter : public filter<T>
explicit convolve_filter(size_t size, size_t block_size = 1024);
explicit convolve_filter(const univector_ref<const T>& data, size_t block_size = 1024);
void set_data(const univector_ref<const T>& data);
void reset() final;
/// Returns the block size used by this filter. For optimal processing
/// efficiency, apply the filter to inputs whose length is a multiple of the
/// returned value.
size_t input_block_size() const
{
    return block_size;
}

void process_expression(T* dest, const expression_pointer<T>& src, size_t size) final
@@ -93,19 +96,36 @@ class convolve_filter : public filter<T>
void process_buffer(T* output, const T* input, size_t size) final;

const size_t size;
using ST = subtype<T>;
static constexpr auto real_fft = !std::is_same<T, complex<ST>>::value;
using plan_t = std::conditional_t<real_fft, dft_plan_real<T>, dft_plan<ST>>;

// Length of filter data.
size_t data_size;
// Size of block to process.
const size_t block_size;
const dft_plan_real<T> fft;
// FFT plan for circular convolution.
const plan_t fft;
// Temp storage for FFT.
univector<u8> temp;
std::vector<univector<complex<T>>> segments;
std::vector<univector<complex<T>>> ir_segments;
size_t input_position;
// History of input segments after fwd DFT. History is circular relative to position below.
std::vector<univector<complex<ST>>> segments;
// Index into segments of current block.
size_t position;
// Blocks of filter/data after fwd DFT.
std::vector<univector<complex<ST>>> ir_segments;
// Saved input for current block.
univector<T> saved_input;
univector<complex<T>> premul;
univector<complex<T>> cscratch;
univector<T> scratch;
// Index into saved_input for next input to begin.
size_t input_position;
// Pre-multiplied products of input history and delayed filter blocks.
univector<complex<ST>> premul;
// Scratch buffer for product of filter and input for processing by reverse DFT.
univector<complex<ST>> cscratch;
// Scratch buffers for input and output of fwd and rev DFTs.
univector<T> scratch1, scratch2;
// Overlap saved from previous block to add into current block.
univector<T> overlap;
size_t position;
} // namespace CMT_ARCH_NAME

0 comments on commit f3bc9bb

Please sign in to comment.