In [1]:
#Load necessary libraries
import numpy as np
import matplotlib.pyplot as plt
from scipy.io.wavfile import read, write
from numpy.fft import fft, ifft
from IPython.display import Audio
from scipy import signal
from ipykernel import kernelapp as app

In [2]:
# Number of samples per analysis frame; every frame is sliced to this length
# and the trained filter holds one gain per sample of the frame's FFT.
FRAME_SIZE = 1024

In [3]:
# Fixed per-file frame budget, plus zero-initialised storage: the filter gains
# (one per FFT bin) and one row per stored frame for the power spectra of the
# clean signals (ss) and of the noise (nn)
numFrames = 234
crowd_filter = np.zeros(FRAME_SIZE)
ss, nn = (np.zeros((570, FRAME_SIZE)) for _ in range(2))

In [4]:
# An algorithm to detect whether a frame is voiced or not
def voice_decider(frame, threshold=70000000):
    """Decide whether a frame is voiced, based on its total energy.

    Parameters
    ----------
    frame : array_like
        Samples of one frame. Integer PCM data is accepted directly: the
        samples are converted to float64 before squaring, so the squares
        cannot wrap around in a narrow integer type such as int16.
    threshold : float, optional
        Energy (sum of squared samples) that must be exceeded for the frame
        to count as voiced. Defaults to the value this notebook was tuned
        with.

    Returns
    -------
    int
        1 if the frame energy is strictly greater than ``threshold``,
        otherwise 0. An empty frame has zero energy and is unvoiced.
    """
    samples = np.asarray(frame, dtype=np.float64)
    energy = float(np.sum(samples * samples))

    return 1 if energy > threshold else 0

In [5]:
# Use 5 pairs of clean signals ('Ky-N.wav') and noise ('Heli-N.wav') to train
# the filter for helicopter noise: for every voiced speech frame, store the
# power spectrum of the normalized speech frame in ss and the power spectrum
# of the matching normalized noise frame in nn.

total = 0  # Counter for voiced frames stored so far

# Start from empty accumulators so that re-running this cell cannot leave rows
# from an earlier run mixed into the column means used for the filter.
ss[:] = 0
nn[:] = 0

for i in range(5):
    Fs, speech = read('Ky-'+str(i+1)+'.wav')
    Fs1, noise = read('Heli-'+str(i+1)+'.wav')

    # Only visit frames that are complete in both recordings; a short final
    # slice would produce an FFT of the wrong length for a row of ss/nn.
    usable_frames = min(numFrames,
                        len(speech) // FRAME_SIZE,
                        len(noise) // FRAME_SIZE)

    for j in range(usable_frames):
        start, stop = j * FRAME_SIZE, (j + 1) * FRAME_SIZE

        # Get data from clean signals
        speech_data = speech[start:stop].astype(float)

        # We don't involve unvoiced frames in training
        if voice_decider(speech_data) == 0:
            continue

        # Get data from noise
        noise_data = noise[start:stop].astype(float)

        # A constant frame has zero standard deviation; normalizing it would
        # divide by zero and put NaN into every column mean, so skip it.
        speech_std = np.std(speech_data)
        noise_std = np.std(noise_data)
        if speech_std == 0 or noise_std == 0:
            continue

        if total >= ss.shape[0]:
            raise IndexError('more voiced training frames than the %d rows '
                             'allocated in ss/nn' % ss.shape[0])

        # Normalize both frames to zero mean and unit variance
        speech_data = (speech_data - np.mean(speech_data)) / speech_std
        noise_data = (noise_data - np.mean(noise_data)) / noise_std

        # Calculate and record the power spectra of clean signal and noise
        speech_spectrum = np.abs(np.fft.fft(speech_data)) ** 2
        noise_spectrum = np.abs(np.fft.fft(noise_data)) ** 2
        ss[total, :] = speech_spectrum
        nn[total, :] = noise_spectrum

        total = total + 1
            

In [6]:
# Calculate the trained filter: for every frequency bin, the squared ratio of
# the mean speech power to the mean speech-plus-noise power. The means run
# over all rows of ss/nn; unused (all-zero) rows scale numerator and
# denominator equally, so they do not change the ratio.
# The result is stored in crowd_filter, the array the filtering cell reads.
mean_speech_power = ss.mean(axis=0)
mean_noise_power = nn.mean(axis=0)
total_power = mean_speech_power + mean_noise_power

# A bin with no recorded power at all would give 0/0 = NaN, and a NaN gain
# turns the whole filtered output into NaN; give such bins zero gain instead.
crowd_filter[:] = 0.0
np.divide(mean_speech_power, total_power, out=crowd_filter,
          where=total_power > 0)
np.square(crowd_filter, out=crowd_filter)

In [7]:
# Load the recording contaminated by helicopter noise and prepare the
# buffers for testing the trained helicopter filter on it
Fs, data = read('Helicopter-ky1.wav')
numFrames = len(data) // FRAME_SIZE        # whole frames in the recording
output_crowd = np.zeros(data.shape[0])     # filtered signal, same length
window_num = 0
noise_spectrum = np.zeros(FRAME_SIZE)

In [8]:
# Apply the trained filter frame by frame: transform each frame, scale every
# frequency bin by its gain, and transform back into the output buffer.
# Samples after the last whole frame are left at zero.
for i in range(numFrames):
    start, stop = i * FRAME_SIZE, (i + 1) * FRAME_SIZE

    filtered_spectrum = np.fft.fft(data[start:stop]) * crowd_filter

    # ifft returns a complex array. Assigning it straight into the float
    # output buffer drops the imaginary part and emits a ComplexWarning, so
    # take the real part explicitly (the stored values are the same).
    output_crowd[start:stop] = np.fft.ifft(filtered_spectrum).real

  


In [9]:
# Audio player for the unprocessed recording
Audio(data=data, rate=Fs)

In [10]:
# Audio player for the filtered recording
Audio(data=output_crowd, rate=Fs)

In [11]:
# Use 5 pairs of clean signals ('Ky-N.wav') and noise ('Crowd-N.wav') to train
# the filter for crowd noise: for every voiced speech frame, store the power
# spectrum of the normalized speech frame in ss and the power spectrum of the
# matching normalized noise frame in nn.

total = 0  # Counter for voiced frames stored so far

# ss and nn still hold the rows written by the previous training run. Clear
# them so that leftover rows of another noise type cannot leak into the
# column means used for this filter.
ss[:] = 0
nn[:] = 0

# Per-file frame budget (the value numFrames is first initialised with).
# The global numFrames is not used here: the cell that loads a test recording
# reassigns it to that recording's frame count before this cell runs.
TRAIN_FRAMES_PER_FILE = 234

for i in range(5):
    Fs, speech = read('Ky-'+str(i+1)+'.wav')
    Fs1, noise = read('Crowd-'+str(i+1)+'.wav')

    # Only visit frames that are complete in both recordings; a short final
    # slice would produce an FFT of the wrong length for a row of ss/nn.
    usable_frames = min(TRAIN_FRAMES_PER_FILE,
                        len(speech) // FRAME_SIZE,
                        len(noise) // FRAME_SIZE)

    for j in range(usable_frames):
        start, stop = j * FRAME_SIZE, (j + 1) * FRAME_SIZE

        # Get data from clean signals
        speech_data = speech[start:stop].astype(float)

        # We don't involve unvoiced frames in training
        if voice_decider(speech_data) == 0:
            continue

        # Get data from noise
        noise_data = noise[start:stop].astype(float)

        # A constant frame has zero standard deviation; normalizing it would
        # divide by zero and put NaN into every column mean, so skip it.
        speech_std = np.std(speech_data)
        noise_std = np.std(noise_data)
        if speech_std == 0 or noise_std == 0:
            continue

        if total >= ss.shape[0]:
            raise IndexError('more voiced training frames than the %d rows '
                             'allocated in ss/nn' % ss.shape[0])

        # Normalize both frames to zero mean and unit variance
        speech_data = (speech_data - np.mean(speech_data)) / speech_std
        noise_data = (noise_data - np.mean(noise_data)) / noise_std

        # Calculate and record the power spectra of clean signal and noise
        speech_spectrum = np.abs(np.fft.fft(speech_data)) ** 2
        noise_spectrum = np.abs(np.fft.fft(noise_data)) ** 2
        ss[total, :] = speech_spectrum
        nn[total, :] = noise_spectrum

        total = total + 1

In [12]:
# Calculate the trained filter: for every frequency bin, the squared ratio of
# the mean speech power to the mean speech-plus-noise power. The means run
# over all rows of ss/nn; unused (all-zero) rows scale numerator and
# denominator equally, so they do not change the ratio.
mean_speech_power = ss.mean(axis=0)
mean_noise_power = nn.mean(axis=0)
total_power = mean_speech_power + mean_noise_power

# A bin with no recorded power at all would give 0/0 = NaN, and a NaN gain
# turns the whole filtered output into NaN; give such bins zero gain instead.
crowd_filter[:] = 0.0
np.divide(mean_speech_power, total_power, out=crowd_filter,
          where=total_power > 0)
np.square(crowd_filter, out=crowd_filter)

In [13]:
# Load the recording contaminated by crowd noise and prepare the buffers
# for testing the trained crowd filter on it
Fs, data = read('Crowd-ky1.wav')
numFrames = len(data) // FRAME_SIZE        # whole frames in the recording
output_crowd = np.zeros(data.shape[0])     # filtered signal, same length
window_num = 0
noise_spectrum = np.zeros(FRAME_SIZE)

In [14]:
# Apply the trained filter frame by frame: transform each frame, scale every
# frequency bin by its gain, and transform back into the output buffer.
# Samples after the last whole frame are left at zero.
for i in range(numFrames):
    start, stop = i * FRAME_SIZE, (i + 1) * FRAME_SIZE

    filtered_spectrum = np.fft.fft(data[start:stop]) * crowd_filter

    # ifft returns a complex array. Assigning it straight into the float
    # output buffer drops the imaginary part and emits a ComplexWarning, so
    # take the real part explicitly (the stored values are the same).
    output_crowd[start:stop] = np.fft.ifft(filtered_spectrum).real


In [15]:
# Audio player for the unprocessed recording
Audio(data=data, rate=Fs)

In [17]:
# Audio player for the filtered recording
Audio(data=output_crowd, rate=Fs)

In [18]:
# Use 5 pairs of clean signals ('Ky-N.wav') and noise ('Engine-N.wav') to
# train the filter for engine noise: for every voiced speech frame, store the
# power spectrum of the normalized speech frame in ss and the power spectrum
# of the matching normalized noise frame in nn.

total = 0  # Counter for voiced frames stored so far

# ss and nn still hold the rows written by the previous training run. Clear
# them so that leftover rows of another noise type cannot leak into the
# column means used for this filter.
ss[:] = 0
nn[:] = 0

# Per-file frame budget (the value numFrames is first initialised with).
# The global numFrames is not used here: the cell that loads a test recording
# reassigns it to that recording's frame count before this cell runs.
TRAIN_FRAMES_PER_FILE = 234

for i in range(5):
    Fs, speech = read('Ky-'+str(i+1)+'.wav')
    Fs1, noise = read('Engine-'+str(i+1)+'.wav')

    # Only visit frames that are complete in both recordings; a short final
    # slice would produce an FFT of the wrong length for a row of ss/nn.
    usable_frames = min(TRAIN_FRAMES_PER_FILE,
                        len(speech) // FRAME_SIZE,
                        len(noise) // FRAME_SIZE)

    for j in range(usable_frames):
        start, stop = j * FRAME_SIZE, (j + 1) * FRAME_SIZE

        # Get data from clean signals
        speech_data = speech[start:stop].astype(float)

        # We don't involve unvoiced frames in training
        if voice_decider(speech_data) == 0:
            continue

        # Get data from noise
        noise_data = noise[start:stop].astype(float)

        # A constant frame has zero standard deviation; normalizing it would
        # divide by zero and put NaN into every column mean, so skip it.
        speech_std = np.std(speech_data)
        noise_std = np.std(noise_data)
        if speech_std == 0 or noise_std == 0:
            continue

        if total >= ss.shape[0]:
            raise IndexError('more voiced training frames than the %d rows '
                             'allocated in ss/nn' % ss.shape[0])

        # Normalize both frames to zero mean and unit variance
        speech_data = (speech_data - np.mean(speech_data)) / speech_std
        noise_data = (noise_data - np.mean(noise_data)) / noise_std

        # Calculate and record the power spectra of clean signal and noise
        speech_spectrum = np.abs(np.fft.fft(speech_data)) ** 2
        noise_spectrum = np.abs(np.fft.fft(noise_data)) ** 2
        ss[total, :] = speech_spectrum
        nn[total, :] = noise_spectrum

        total = total + 1

In [19]:
# Calculate the trained filter: for every frequency bin, the squared ratio of
# the mean speech power to the mean speech-plus-noise power. The means run
# over all rows of ss/nn; unused (all-zero) rows scale numerator and
# denominator equally, so they do not change the ratio.
# The result is stored in crowd_filter, the array the filtering cell reads.
mean_speech_power = ss.mean(axis=0)
mean_noise_power = nn.mean(axis=0)
total_power = mean_speech_power + mean_noise_power

# A bin with no recorded power at all would give 0/0 = NaN, and a NaN gain
# turns the whole filtered output into NaN; give such bins zero gain instead.
crowd_filter[:] = 0.0
np.divide(mean_speech_power, total_power, out=crowd_filter,
          where=total_power > 0)
np.square(crowd_filter, out=crowd_filter)

In [20]:
# Load the recording contaminated by engine noise and prepare the buffers
# for testing the trained engine filter on it
Fs, data = read('Engine-ky1.wav')
numFrames = len(data) // FRAME_SIZE        # whole frames in the recording
output_crowd = np.zeros(data.shape[0])     # filtered signal, same length
window_num = 0
noise_spectrum = np.zeros(FRAME_SIZE)

In [21]:
# Apply the trained filter frame by frame: transform each frame, scale every
# frequency bin by its gain, and transform back into the output buffer.
# Samples after the last whole frame are left at zero.
for i in range(numFrames):
    start, stop = i * FRAME_SIZE, (i + 1) * FRAME_SIZE

    filtered_spectrum = np.fft.fft(data[start:stop]) * crowd_filter

    # ifft returns a complex array. Assigning it straight into the float
    # output buffer drops the imaginary part and emits a ComplexWarning, so
    # take the real part explicitly (the stored values are the same).
    output_crowd[start:stop] = np.fft.ifft(filtered_spectrum).real


In [22]:
# Audio player for the unprocessed recording
Audio(data=data, rate=Fs)

In [23]:
# Audio player for the filtered recording
Audio(data=output_crowd, rate=Fs)

When we listen to the three outputs, we can hear that the sound quality of the crowd filter's output is quite good. In comparison, the output quality of the helicopter filter and the engine filter is only moderate. We think there are two main reasons for this. First, the mathematical model of the Wiener filter assumes that noise samples are uncorrelated with signal samples. This is reasonable for crowd noise, which is very close to white noise. However, helicopter noise and engine noise have their own rhythms and are therefore not fully uncorrelated with speech signals. Second, we only have a limited amount of data for training. Therefore, the trained Wiener filters are not very robust to other types of speech signals.

To address this issue, we propose another scheme, which is implemented in the "New Scheme" file. In that scheme, we collect statistics about the noise spectrum during unvoiced frames. Then, during voiced frames, we first use the previously collected statistics to carry out spectral subtraction. We then use the resulting signal as an estimate of the clean signal and calculate its power spectrum. With these quantities computed, we are able to calculate the Wiener filter and apply it to the frame. The mathematical model still assumes that signal and noise are uncorrelated. Therefore, it works well for white noise, such as crowd noise, rain noise and fan noise.