# Hardware FM Demodulation on Pynq-Z2

In this notebook, we will deploy hardware demodulation on Pynq-Z2.

## 1. Software Implementation Time

First, let's see how long it takes for the software to demodulate one second of samples (there are 2.4M samples since the sample rate is 2.4MSPS). The software functions are already developed in the first notebook.

In [1]:
import time
import numpy as np
import scipy.signal as signal

# Capture settings of the pre-recorded IQ data used throughout the notebook.
sample_rate = 2.4e6  # samples per second (2.4 MSPS)
center_freq = 94.1e6  # presumably the tuned station frequency in Hz -- confirm

# Pre-recorded samples, loaded from the current working directory.
samples = np.load('samples_prerecorded.npy')

def downsample(x, M, p=0):
    """Keep one sample out of every ``M``, starting at phase offset ``p``.

    Trailing samples that do not fill a complete group of ``M`` are dropped
    before decimation.

    Args:
        x: 1-D NumPy array of samples (it is sliced and reshaped to
            ``(len(x) // M, M)``).
        M: Decimation factor; must be a positive ``int``.
        p: Index within each group of ``M`` samples to keep (default 0).

    Returns:
        Array of ``len(x) // M`` samples: element ``p`` of each consecutive
        group of ``M`` input samples.

    Raises:
        TypeError: If ``M`` is not an ``int``.
        ValueError: If ``M`` is not positive.
    """
    if not isinstance(M, int):
        raise TypeError("M must be an int")
    # Reject M <= 0 up front: M == 0 would otherwise surface as a bare
    # ZeroDivisionError and a negative M as an unrelated reshape error.
    if M <= 0:
        raise ValueError("M must be a positive int")
    n_groups = len(x) // M
    # One row per group of M samples; column p is the decimated signal.
    return x[:n_groups * M].reshape((n_groups, M))[:, p]

def fm_discrim(x):
    """Discriminate complex baseband samples into a per-sample phase slope.

    With I = real(x), Q = imag(x) and d(.) the first difference
    (``y[n] = v[n] - v[n-1]``, taking ``v[-1] = 0``), the result is
    ``(I * dQ - Q * dI) / (I**2 + Q**2 + 1e-10)``, i.e. the discrete form of
    the time derivative of ``atan2(Q, I)``.

    Args:
        x: Array of complex samples.

    Returns:
        Real-valued array with the same length as ``x``.
    """
    in_phase = np.real(x)
    quadrature = np.imag(x)

    # Two-tap FIR [1, -1] implements the first difference.
    first_diff = np.array([1, -1])
    d_in_phase = signal.lfilter(first_diff, 1, in_phase)
    d_quadrature = signal.lfilter(first_diff, 1, quadrature)

    numerator = in_phase * d_quadrature - quadrature * d_in_phase
    # The small constant keeps the division finite for zero-magnitude samples.
    power = in_phase**2 + quadrature**2 + 1e-10
    return numerator / power

def fm_audio(samples, fs=2.4e6, fc=94.1e6, fc1=200e3, fc2=100e3, d1=10, d2=5):
    """Demodulate FM IQ samples to audio and print the time taken per stage.

    Pipeline: low-pass filter -> decimate by ``d1`` -> FM discriminator ->
    low-pass filter -> decimate by ``d2``.

    Args:
        samples: Complex baseband samples at rate ``fs``.
        fs: Input sample rate in Hz.
        fc: Center frequency in Hz. Accepted for interface compatibility;
            it is not used by the computation.
        fc1: Cutoff of the first low-pass filter in Hz.
        fc2: Cutoff parameter of the second low-pass filter in Hz (see the
            review note in the body about its normalization).
        d1: First decimation factor (int).
        d2: Second decimation factor (int).

    Returns:
        Demodulated audio at a rate of ``fs / (d1 * d2)``.
    """
    nyquist = float(fs) / 2
    lpf_b1 = signal.firwin(64, fc1 / nyquist)
    # NOTE(review): this cutoff is normalized by the *input* Nyquist rate even
    # though the filter runs after decimation by d1, so its effective cutoff
    # at that stage is fc2 / d1. Confirm this is intended.
    lpf_b2 = signal.firwin(64, fc2 / nyquist)

    def _timed(label, stage, *args):
        # Run one pipeline stage and report its wall-clock duration.
        start = time.perf_counter()
        result = stage(*args)
        elapsed = time.perf_counter() - start
        print("%s: %f seconds" % (label, elapsed))
        return result

    samples_filtered_1 = _timed(
        "1st filtering", signal.lfilter, lpf_b1, 1, samples)
    samples_decimated_1 = _timed(
        "1st decimation", downsample, samples_filtered_1, d1)
    samples_discriminated = _timed(
        "phase discrimination", fm_discrim, samples_decimated_1)
    samples_filtered_2 = _timed(
        "2nd filtering", signal.lfilter, lpf_b2, 1, samples_discriminated)
    # This stage was previously reported under the label "2nd filtering".
    audio = _timed("2nd decimation", downsample, samples_filtered_2, d2)

    return audio

# Run the timed software pipeline on the pre-recorded samples.
audio_sw = fm_audio(samples, fs=sample_rate, fc=center_freq)

1st filtering: 4.139437 seconds
1st decimation: 0.000192 seconds
phase discrimiation: 0.287228 seconds
2nd filtering: 0.137355 seconds
2nd filtering: 0.000147 seconds


In [2]:
from IPython.display import Audio
# 48000 Hz = 2.4 MSPS / (10 * 5), the rate left after both decimation stages.
# Kept as the last expression of the cell so the notebook renders the player.
Audio(audio_sw, rate=48000)

## 2. Hardware Implementation Time

Next, let's see how long it takes for the hardware to demodulate one second of samples (2.4M samples since the sample rate is 2.4MSPS).

In [9]:
from pynq import Overlay, allocate

# Load the overlay bitstream; the path is relative to the notebook directory.
ol = Overlay("../overlay/system-128.bit")

In [10]:
# List the names of the IP cores the overlay exposes, one per line.
for ip in ol.ip_dict:
    print(ip)

fir_complex_0
fir_real_0
fm_discrim_0
axi_dma_0
processing_system7_0


In [11]:
# Short handles for the overlay's IP cores, using the names listed above.
dma = ol.axi_dma_0
hw_fir_1 = ol.fir_complex_0  # FIR on the complex input (first filter stage)
hw_fir_2 = ol.fir_real_0  # FIR on the real discriminator output (second stage)
hw_discrim = ol.fm_discrim_0

In [12]:
# 1) allocate

# 1.1) params

# Stream lengths for one second of data: 2.4M input samples, /10 after the
# first decimation, /5 after the second.
len_in = 2400000
len_mid = 240000  # intermediate length; not referenced below
len_out = 48000

filter_factor = 1

# 64-tap low-pass filters: 200 kHz cutoff at 2.4 MSPS, then 12 kHz cutoff at
# the decimated 240 kSPS rate.
lpf_b1 = signal.firwin(64, 200e3/(float(2.4e6)/2))
lpf_b2 = signal.firwin(64, 12e3/(float(2.4e6)/10/2))
c1 = np.array(lpf_b1 * filter_factor, dtype=np.float32)
c2 = np.array(lpf_b2 * filter_factor, dtype=np.float32)

# 1.2) allocate buffer

coef_buffer_1 = allocate(shape=(64,), dtype=np.float32)
coef_buffer_2 = allocate(shape=(64,), dtype=np.float32)

input_buffer = allocate(shape=(len_in,), dtype=np.complex64)
output_buffer = allocate(shape=(len_out,), dtype=np.float32)

# 1.3) init coef buffer
# (input_buffer is filled once, in step 2.1; an earlier copy of the raw
# samples here was overwritten before use and has been removed.)

np.copyto(coef_buffer_1, c1)
np.copyto(coef_buffer_2, c2)

# 1.4) config phys addr
# Hand each FIR core the physical address of its coefficient buffer.
# NOTE(review): load_coef = 1 presumably tells the core to fetch the
# coefficients on its next run -- confirm against the IP source.

hw_fir_1.register_map.coef_1 = coef_buffer_1.physical_address
hw_fir_2.register_map.coef_1 = coef_buffer_2.physical_address
hw_fir_1.register_map.load_coef = 1
hw_fir_2.register_map.load_coef = 1

# 2) filtering and demodulation

# 2.1) copy input to buffer
# NOTE(review): the (128 + 128j) offset presumably restores the unsigned
# 8-bit I/Q range the hardware expects -- confirm against the IP source.

np.copyto(input_buffer, samples + (128 + 128j))

# 2.2) start ip

st = time.perf_counter()

# Write 1 to register offset 0x00 of each core to start it.
hw_fir_1.write(0x00, 0x01)
hw_fir_2.write(0x00, 0x01)
hw_discrim.write(0x00, 0x01)

# Stream the input through the pipeline and wait for both DMA channels.
dma.sendchannel.transfer(input_buffer)
dma.recvchannel.transfer(output_buffer)
dma.sendchannel.wait()
dma.recvchannel.wait()

et = time.perf_counter()

# 2.3) report elapsed time (the audio is left in output_buffer)

print("AXI Stream Implementation: %f ms" % ((et - st) * 1000))

AXI Stream Implementation: 315.752029 ms


In [13]:
from IPython.display import Audio
# Play the hardware result; output_buffer holds 48000 samples for one second.
Audio(output_buffer, rate=48000)

As can be seen, the software implementation needs ~4.6 seconds to demodulate one second of samples (2.4M samples), while the hardware implementation only needs ~320ms. This means it is possible to achieve real-time demodulation using the hardware implementation.

# Conclusion

In this notebook, we have deployed hardware demodulation and demonstrated that it is much faster than the software implementation, which shows the great potential of FPGAs for real-time signal processing.

In fact, if it were not for the resource limitations of the xc7z020 chip (the one on the Pynq-Z2), an even faster implementation that consumes more area could be deployed. The current IP uses only pipelining and has not explored more parallel execution, which would consume more resources.