In [1]:
import numpy as np
import numpy.linalg as la
import scipy.fftpack as scifft

from math import ceil,log,cos,sin,pi
  
np.set_printoptions(linewidth=100)

### DCT-I Relation to Circular Convolution

From the paper, let $a,b$ be the input with size $n$. Create a transformation $x$ where $x_0 = a_0$ and

$$x_j = x_{2n-j} \textrm{ for } j = 1, \dots, n$$

Define $y_j$ similarly. Furthermore, define $z$ where

$$z = x \ast y = 2c$$

where $\ast$ indicates circulation convolution. We define $c = a \ast^I b$. We recall that

$$C_N^I(a \ast^I b) = (C_N^Ia) \odot (C_N^Ib)$$

$$a \ast^I b = \frac{1}{n} C_N^I \Big( (C_N^Ia) \odot (C_N^Ib) \Big) $$

In [2]:
def transformVec(v):
    # returns 2n-2 size input
    return np.append(v,v[1:-1][::-1])

def DCT_mat(n):
    # computes DCT(n)
    M = np.zeros((n,n))
    for u in range(n):
        for k in range(n):
            e = (.5 if k==0 or k==n-1 else 1)
            M[u,k] = ( e * cos((u*k*pi)/(n-1)) )
    return M

def DCTI_2n(a,b):
    # DCT-(N+1)
    n = len(a)
    assert(n == len(b))

    DC = DCT_mat(2*n+1)

    az = np.append(a,np.zeros(n+1))
    bz = np.append(b,np.zeros(n+1))

    c = np.dot(DC,az) * np.dot(DC,bz)
    return (2/n) * np.dot(DC, c)

In [3]:
n = 150
a = np.random.random(n)
b = np.random.random(n)
c = DCTI_2n(a,b)

# circular convolution
az = np.append(a,np.zeros(n+1))
bz = np.append(b,np.zeros(n+1))
x = transformVec(az)
y = transformVec(bz)
z = np.real(np.fft.ifft( np.fft.fft(x) * np.fft.fft(y) ))
z = z[:2*n+1]

print("Error:",la.norm(c-z)/la.norm(z))

Error: 8.610315698356249e-16


### Circular to Linear Convolution

[Proof](#) needed still.

We know we can convert circular convolution to linear convolution by padding the result with $n/2$ 
$0$s.

In [7]:
n = 3
a = np.random.random(n)
b = np.random.random(n)

# works for padding by n/2 0s
pad = (n+1)//2
az = np.append(np.zeros(pad), a)
bz = np.append(np.zeros(pad), b)

c = DCTI_2n(az,bz)
c = c[2*pad : 2*(n+pad)-1]

convLib = np.convolve(a,b)

print("Error:",la.norm(c - convLib)/la.norm(convLib))

Error: 1.8329231818637207e-16


If we only want a reduced convolution, we don't need to pad at all then. Instead, we extract the select $n$ terms.

In [8]:
n = 150
a = np.random.random(n)
b = np.random.random(n)

# works for padding by n/2 0s
pad = 0
c = DCTI_2n(a,b)
c = c[n-1:2*n-1]

convLib = np.convolve(a,b)
convLib = convLib[-n:]
print("Error:",la.norm(c - convLib)/la.norm(convLib))

Error: 2.0441780014244004e-15


### Fast Algorithms

In [9]:
# https://www.nayuki.io/page/fast-discrete-cosine-transform-algorithms
def fastDCT_III(vector, root=True, scaled=True):
    if root:
        vector = vector.copy() * (2 if scaled else 1)
        vector[0] /= 2.0
    n = vector.size
    if n == 1: return vector
    else:
        half = n // 2
        alpha = vector[0 : : 2].copy()
        beta  = vector[1 : : 2].copy()
        beta[1 : ] += vector[1 : n - 1 : 2]
        fastDCT_III(alpha, False)
        fastDCT_III(beta , False)
        beta /= np.cos(np.arange(0.5, half + 0.5) * (np.pi / n)) * 2.0
        vector[ : half] = alpha + beta
        vector[n - 1 : half - 1 : -1] = alpha - beta
        return vector

In [10]:
p = 10
n = 2**p
v = np.random.random(n)

dct3Lib = scifft.dct(v,type=3)
dct3_1lvl = fastDCT_III(v)

print("Error:",la.norm(dct3Lib - dct3_1lvl)/la.norm(dct3Lib))

Error: 3.589120707394252e-14


### Other Fast Algorithms

In [11]:
def fastDCT_Ib(v):
    # assume 2^p+1 array passed in
    N = len(v)-1
    if N <= 1:
        DC = DCT_mat(N+1)
        return np.dot(DC, v)*2
        
    N2 = N//2
    f = v[:N2+1] + v[-N2-1:][::-1]
    g = v[:N2] - v[-N2:][::-1]
    
    dct1 = fastDCT_Ib(f)
    dct3 = fastDCT_III(g)

    # intertwines the two arrays as even/odd indices
    return np.reshape(
                    np.vstack(( dct1, np.append(dct3, np.zeros(1))) ), 
            newshape=(N+2,), order='F')[:-1]

def DCT_conv(f,g):
    # low-rank convolution
    n = len(f)
    assert(n == 2**(int(log(n,2))))
    fz = np.append(f,np.zeros(n+1))
    gz = np.append(g,np.zeros(n+1))
    
    c = fastDCT_Ib(fz) * fastDCT_Ib(gz)
    return (1/(4*n)) * fastDCT_Ib(c)[n-1:2*n-1]

In [12]:
p = 2
n = 2**p
f = np.random.random(n)
g = np.random.random(n)

c = DCT_conv(f,g)
convLib = np.convolve(f,g)
convLib = convLib[-n:]

print("Error:",la.norm(c - convLib)/la.norm(convLib))

Error: 2.0262979624825808e-16


### Flops for DCT-based Convolution

We first evalutate the cost of **DCT-III(N)**.

The closed form will be 

$$\Theta(nlog(n))$$

At one level, we have

1. $\frac{n}{2}$ adds
1. $2$ calls to DCT($N/2$)
1. $\frac{n}{2}$ cosine evals, multiplies, and divides
1. $2$ $\frac{n}{2}$ adds and subtracts

The base case is nothing, so it is negligable. So at each level, we have a total of $(\frac{3}{2}n, n)$ real flops where $(\cdot, \cdot)$ represents the adds and multiplies respectively. Since there are $log(n)$ levels, we expect the total work for DCT-III($N$) to be 

$$\bigg(\frac{3}{2}nlog(n) \ , \ nlog(n) \bigg)$$

Now we evaluate the cost of **DCT-I(N)**. In each level, we have

1. $\frac{n}{2}$ adds and subtracts
1. One call to DCT-I($n/2$)
1. One call to DCT-III($n/2$), denote this as $T_3(n)$ for size $n$
1. $n$ reordering

So in each level, we have a flop count of:

$$\bigg( \frac{n}{2}, 0, n \bigg) + \bigg(\frac{3}{2}nlog(n) \ , \ nlog(n), 0 \bigg)$$

At the child level, we expect $n$ children. Each one requires $2$ adds $2^2$  multiplies. However, because of the linearaity of the recursive calls, we expect this cost to be negigible since there would only be one leaf child.

To get the total flop cost, we see this is just a linear tree-like structure since we only have one recursive call to DCT-I. Thus we can bound the total flops as

$$
\Rightarrow n \cdot \Bigg( \sum\limits_{i=0}^{log(n)-1} \frac{1}{2^{i}} \bigg[ \Big(\frac{1}{2}, 0, 1 \Big)
+ \Big( \frac{3}{2}(log(n)-i) \ , \ log(n)-i \ , \ 0 \Big) \bigg] \Bigg)
$$

$$
= n \cdot \Bigg( \sum\limits_{i=0}^{log(n)-1} \frac{1}{2^{i}} \bigg[ \Big(\frac{1}{2} + \frac{3}{2}log(n), log(n), 1 \Big)
- \Big( \frac{3}{2}i \ , \ i \ , \ 0 \Big) \bigg] \Bigg)
$$

$$
= \Bigg[ n \Big(\frac{1}{2} + \frac{3}{2}log(n), log(n), 1 \Big) \cdot \sum\limits_{i=0}^{log(n)-1} \frac{1}{2^{i}} \Bigg]
- \Bigg[ n \cdot \Bigg( \sum\limits_{i=0}^{log(n)-1} \frac{1}{2^{i}} \Big( \frac{3}{2}i \ , \ i \ , \ 0 \Big) \Bigg) \Bigg]
$$

$$
\le \Bigg[ n \Big(\frac{1}{2} + \frac{3}{2}log(n), log(n), 1 \Big) \cdot \sum\limits_{i=0}^{log(n)-1} \frac{1}{2^{i}} \Bigg]
- \Bigg[ n \cdot \Bigg( \sum\limits_{i=0}^{log(n)-1} \frac{1}{2^{i}} \Big( \frac{3}{2} \ , \ 1 \ , \ 0 \Big) \Bigg) \Bigg]
$$

$$
= n \Big(-1 + \frac{3}{2}log(n), -1 + log(n), 1 \Big) \cdot \sum\limits_{i=0}^{log(n)-1} \frac{1}{2^{i}}
= \Big(-n + \frac{3}{2}nlog(n), -n + nlog(n), n \Big) \cdot \Big(2 - \frac{2}{n}\Big)
$$

$$
\le \Big(-2n +3nlog(n), -2n + 2nlog(n), 2n \Big)
$$

[Partial Sum](https://www.math.utah.edu/~carlson/teaching/calculus/series.html)

So in total, because we double the size of the input, our reduced fast-DCT-I requires

$$
\le \Big(2n +6nlog(n), 4nlog(n), 4n \Big)
$$

real flops. Since we call it $3$ times, we expect our flops to be bounded by

$$
\le \Big(6n +18nlog(n), 12nlog(n), 12n \Big)
$$

In [13]:
def fastDCT_III_flops(vector, root=True, scaled=True):
    flops = np.zeros(2)
    if root:
        vector = vector.copy() * (2 if scaled else 1)
        vector[0] /= 2.0
    n = vector.size
    if n == 1:
        return flops
    else:
        half = n // 2
        alpha = vector[0 : : 2].copy()
        beta  = vector[1 : : 2].copy()
        flops[0] *= half # beta[1 : ] += vector[1 : n - 1 : 2]
        flops += fastDCT_III_flops(alpha, False)
        flops += fastDCT_III_flops(beta , False)
        flops[1] += half # beta /= np.cos(np.arange(0.5, half + 0.5) * (np.pi / n)) * 2.0
        flops[0] += half # vector[ : half] = alpha + beta
        flops[0] += half # vector[n - 1 : half - 1 : -1] = alpha - beta
        return flops

def fastDCT_Ib_flops(v):
    # assume 2^p+1 array passed in
    N = len(v)-1
    if N <= 1:
        return np.asarray([2,4])
        # DC = DCT_mat(N+1); return np.dot(DC, v)*2
        
    flops = np.zeros(2)
    N2 = N//2
    f = v[:N2+1]
    flops[0] += N2
    g = v[:N2]
    flops[0] += N2
    
    flops += fastDCT_Ib_flops(f) # dct1 = fastDCT_Ib(f)
    flops += fastDCT_III_flops(g , False) # dct3 = fastDCT_III(g)

    return flops

def DCT_conv_flops(f,g):
    # low-rank convolution
    n = len(f)
    assert(n == 2**(int(log(n,2))))
    fz = np.append(f,np.zeros(n+1))
    
    return 3 * fastDCT_Ib_flops(fz)

In [14]:
p = 14
n = 2**p
f = np.random.random(n)
g = np.random.random(n)

flops = DCT_conv_flops(f,g)
print("Actual flops:",flops)

predicted = np.asarray([int(6*n +18*n*log(n)), int(12*n*log(n))])
print("Predicted flops:",predicted)

Actual flops: [1474566.  638991.]
Predicted flops: [2960147 1907895]
