In [1]:
import math
import numpy as np
import scipy.signal
import matplotlib.pyplot as plt
import wavio
import IPython
from scipy.special import expn
from collections import namedtuple

In [2]:
def logMMSE(xn, sr, frameSize):
    """
    Suppress stationary noise in a mono signal with the log-MMSE
    (log-spectral amplitude) estimator.

    The signal is cut into Hann-windowed frames of ``frameSize`` samples with
    50 % overlap.  Each frame is transformed with a zero-padded FFT of
    ``4 * frameSize`` points, multiplied by the log-MMSE gain and resynthesised
    by overlap-add.  The noise power spectrum is initialised from the first
    six non-overlapping frames (assumed to be noise / silence) and is updated
    afterwards whenever the built-in voice-activity test classifies a frame as
    noise only.

    Parameters
    ----------
    xn : array_like, 1-D
        Input samples.  They are divided by 32768 before processing and the
        result is returned on that reduced scale.
    sr : int
        Sampling rate.  Kept for interface compatibility; it is not used by
        the computation.
    frameSize : int
        Frame length in samples.  Must be even (the overlap-add needs two
        equal halves) and at least 4 (shorter Hann windows sum to zero).

    Returns
    -------
    numpy.ndarray
        Column vector of shape ``(Nframes * frameSize // 2, 1)`` holding the
        enhanced signal.

    Raises
    ------
    ValueError
        If ``xn`` is not one-dimensional, if ``frameSize`` is odd or smaller
        than 4, or if ``xn`` is shorter than the six frames needed for the
        initial noise estimate.

    References
    ----------
    [1] Ephraim, Y. and Malah, D. (1985). Speech enhancement using a minimum
        mean-square error log-spectral amplitude estimator. IEEE Trans.
        Acoust., Speech, Signal Process., ASSP-33(2), 443-445.
    """
    noise_frames = 6        # leading frames assumed to contain noise only
    scale = 2 << 14         # 32768, divisor applied to every input sample
    # Lower bound for noise power and for the exponential-integral argument.
    # It is far below any meaningful power value, yet large enough that
    # dividing by it cannot overflow; it keeps digitally silent frames from
    # producing 0/0 or 0*inf and poisoning the recursion with NaN.
    power_floor = 1e-100

    xn = np.asarray(xn)
    if xn.ndim != 1:
        raise ValueError("xn must be a one-dimensional array of samples")
    if frameSize < 4 or frameSize % 2:
        raise ValueError("frameSize must be an even integer >= 4")
    if xn.shape[0] < noise_frames * frameSize:
        raise ValueError(
            "xn must contain at least %d samples (%d frames) for the initial "
            "noise estimate" % (noise_frames * frameSize, noise_frames)
        )

    # 50 % overlap: every frame advances by half a frame
    stepSize = frameSize // 2
    len2 = frameSize - stepSize

    # Hann window normalised so that its samples sum to the hop size
    win = np.hanning(frameSize)
    win = win * len2 / np.sum(win)

    # Noise magnitude calculations - assuming that the first 6 frames is noise / silence
    nFFT = frameSize << 2
    noise_mean = np.zeros([nFFT, 1])
    for m in range(noise_frames):
        start = m * frameSize
        frame = win * (xn[start: start + frameSize] / scale)
        noise_mean += np.abs(np.fft.fft(frame, nFFT)).reshape(nFFT, 1)

    # Mean noise magnitude -> noise power, floored so it can be divided by
    noise_mu = np.maximum(np.square(noise_mean / noise_frames), power_floor)

    # Allocate memory and initialize various variables
    x_old = np.zeros([stepSize, 1])
    Nframes = int(np.floor(xn.shape[0] / len2) - np.floor(frameSize / len2))
    xfinal = np.zeros([int(Nframes * len2), 1])

    aa = 0.98       # smoothing factor of the decision-directed a priori SNR
    mu = 0.98       # smoothing factor of the noise power update
    eta = 0.15      # VAD threshold below which a frame counts as noise only
    ksi_min = 10 ** (-25 * 0.1)     # a priori SNR floor (-25 dB)

    Xk_prev = None  # enhanced power spectrum of the previous frame

    # Start Processing
    for n in range(Nframes):
        k = n * len2

        insign = win * (xn[k: k + frameSize] / scale)
        spec = np.fft.fft(insign, nFFT).reshape(nFFT, 1)

        # Compute the magnitude
        sig = np.abs(spec)
        sig2 = sig ** 2

        # Limit post SNR to avoid overflows
        gammak = np.minimum(sig2 / noise_mu, 40)
        snr_excess = np.maximum(gammak - 1, 0)

        if n == 0:
            ksi = aa + (1 - aa) * snr_excess
        else:
            # a priori SNR, limited to -25 dB
            ksi = aa * Xk_prev / noise_mu + (1 - aa) * snr_excess
            ksi = np.maximum(ksi, ksi_min)

        log_sigma_k = gammak * ksi / (1 + ksi) - np.log(1 + ksi)
        vad_decision = np.sum(log_sigma_k) / frameSize

        # noise only frame found
        if vad_decision < eta:
            noise_mu = np.maximum(mu * noise_mu + (1 - mu) * sig2, power_floor)

        # Log - MMSE estimator
        A = ksi / (1 + ksi)
        # expn(1, 0) is +inf; flooring vk keeps the gain finite for empty bins
        vk = np.maximum(A * gammak, power_floor)
        ei_vk = 0.5 * expn(1, vk)
        hw = A * np.exp(ei_vk)

        Xk_prev = (sig * hw) ** 2

        xi_w = np.real(np.fft.ifft(hw * spec, nFFT, 0))

        # Overlap-add: first half completes the previous frame's tail,
        # second half is kept for the next frame
        xfinal[k: k + len2] = x_old + xi_w[:stepSize]
        x_old = xi_w[stepSize: frameSize]

    return xfinal

In [3]:
# Load the noisy recording; only its first channel is processed.
wav = wavio.read("automotive_noise.wav")
sr = wav.rate
left_channel = wav.data[:, 0]
left_channel = left_channel.astype(np.float32, order='C') / 32767.0

# Denoise and store the result as a 3-byte-per-sample WAV file.
output = logMMSE(left_channel, sr, 256)
wavio.write("result1.wav", output, sr, sampwidth=3)

# Audio players: enhanced result first, then the original for comparison.
for _clip_path in ("result1.wav", "automotive_noise.wav"):
    IPython.display.display(IPython.display.Audio(_clip_path, rate=sr))

In [4]:
# Load the noisy recording; only its first channel is processed.
wav = wavio.read("keyboard_noise.wav")
sr = wav.rate
left_channel = wav.data[:, 0]
left_channel = left_channel.astype(np.float32, order='C') / 32767.0

# Denoise and store the result as a 3-byte-per-sample WAV file.
output = logMMSE(left_channel, sr, 256)
wavio.write("result2.wav", output, sr, sampwidth=3)

# Audio players: enhanced result first, then the original for comparison.
for _clip_path in ("result2.wav", "keyboard_noise.wav"):
    IPython.display.display(IPython.display.Audio(_clip_path, rate=sr))