In [1]:
import librosa
import numpy as np
import soundfile as sf
from tqdm import tqdm
def WienerFilter(filename, sr=16000, n_fft=512, hop_rate=0.5, vad_db=5, gamma=1.0, G=.6):
    x, _ = librosa.load(filename, sr=sr)

    hop = int(hop_rate*n_fft)         # hop size in samples

    X = librosa.stft(x, n_fft=n_fft, hop_length=hop)
    # setting default parameters
    vad_db = 5       # VAD vad_dbhold in dB SNRseg
    gamma = 1.0     # exp(gamma)
    G = .6 #smoothing factor

    noise_mean = np.zeros((n_fft//2+1))
    for k in range(0, 5):
        noise_mean = noise_mean + abs(X[:, k])

    # noise estimate from first 5 frames
    noise_mu = noise_mean / 5

    # initialize various variables
    img = 1j
    X_out = np.zeros(X.shape, dtype=complex)

    # main processing loop
    for n in tqdm(range(0, X.shape[1])):
        # extract a frame
        signal_spec = X[:, n]
        # compute the magnitude
        signal_magnitude = abs(signal_spec)
        # save the noisy phase information
        theta = np.angle(signal_spec)
        #  compute segmental SNR for VAD
        SNRseg = 10 * np.log10(np.linalg.norm(signal_magnitude, 2) ** 2 / np.linalg.norm(noise_mu, 2) ** 2)

        # perform the spectral subtraction
        clean_signal_magnitude = signal_magnitude ** gamma - noise_mu ** gamma

        # halfwave rectification (zero out negative values)
        clean_signal_magnitude = np.maximum(clean_signal_magnitude, 0)

        # compute a Priori SNR (used)
        SNRpri = 10 * np.log10(np.linalg.norm(clean_signal_magnitude, 2) ** 2 / np.linalg.norm(noise_mu, 2) ** 2)

        # parameter band dependent oversubtraction factor
        mu_max = 20
        mu_to_plus, mu_to_min = 1, mu_max
        mu_slope = ((mu_to_min - mu_to_plus) * mu_max) / 25
        mu_0 = mu_to_plus + 20*mu_slope
        def get_alpha(SNR):
            if SNR >= 20:
                 return mu_to_plus
            elif -5.0 <= SNR <= 20.0:
                return mu_0 - SNR*mu_slope
            else: return mu_to_min
        alpha = get_alpha(SNRpri) 

        # 2 gain function G
        # This is essentially the inverse Wiener Filter
        G_i = clean_signal_magnitude ** 2 / (clean_signal_magnitude ** 2 + alpha * noise_mu ** 2)
        
        wf_speech = G_i * signal_magnitude

        # --- implement a simple VAD detector --- #
        if SNRseg < vad_db:  # Update noise spectrum
            noise_temp = G * noise_mu ** gamma + (1 - G) * signal_magnitude ** gamma  # noise power spectrum smoothing
            noise_mu = noise_temp ** (1 / gamma)  # New noise amplitude spectrum
            clean_signal_magnitude = .2*signal_magnitude  # suppress the signal    
        # add phase    
        phased_clean_signal = (wf_speech ** (1 / gamma)) * np.exp(img * theta)       
        # store the output
        X_out[:, n] = phased_clean_signal
        signal = librosa.istft(X_out, hop_length=hop, n_fft=n_fft)
        outfile = filename.split('.')[0] + '_denoised.wav'
        sf.write(outfile, signal, sr)
        # return list(signal)
WienerFilter('test.wav') 

100%|██████████| 128/128 [00:01<00:00, 93.70it/s] 


In [24]:
import librosa
import numpy as np
from tensorflow._api.v2.test import is_built_with_cuda, gpu_device_name
from tensorflow.python.keras import models
import soundfile as sf
from tqdm import tqdm
import os; os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'

is_cuda = is_built_with_cuda()
is_gpu = gpu_device_name()
model_path = os.path.join(os.getcwd(),'saved','models', 'fcnn_AN.model')
error = None

if not os.path.exists(model_path):
    print("Error. Model or Weights not found. Please download the model and weights from the repository and place them in the saved/models folder. Alternatively, you can run the train the model on your own data.")
    exit()
model = models.load_model(model_path)
print("Model Available: " + ('\u274C', '\u2705')[int(model is not None)])
print("Cuda Available: " + ('\u274C', '\u2705')[int(is_cuda)])
print("GPU Available: " + ('\u274C', '\u2705')[int(is_cuda)])

def DeepDenoise(filename, sr=16000, segment_time=0):
    # Segment time is used to split the audio into segments of X seconds for inference
    # SR = 16000 is what i used for training as specified in the paper
    # So, I'm assuming that's what is appropriate for the inference as well
    x, _ = librosa.load(filename, sr=sr)
    # print(x.shape)
    
    # Implement split into segments, its okay if the last segment is less than 1.5 seconds.
    if segment_time > 0:
        max_samples = int(sr*segment_time)
        x_out = []
        for i in (range(0, len(x), max_samples)):
            x_i = x[i:i+max_samples]
            # print (x_i.shape)
            x_i = np.expand_dims(np.expand_dims(x_i, axis=0), -1)
            x_i_out = model.predict(x_i)
            # print (x_i_out.shape)
            x_out.append(x_i_out)
        x_out = np.concatenate(x_out, axis=1).squeeze()
    else:
        x_out = model.predict(np.expand_dims(np.expand_dims(x, axis=0), -1)).squeeze()
    # print(x_out.shape)
    outfile = filename.split('.')[0] + '_ml_denoised.wav'
    sf.write(outfile, x_out, sr)

DeepDenoise('input.wav')

Model Available: ✅
Cuda Available: ✅
GPU Available: ✅
