# Exercise 6: Speech Enhancement

In [1]:
from typing import Tuple

import librosa
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import get_window, lfilter
from scipy.linalg import solve_toeplitz
from IPython.display import Audio
import math


%matplotlib inline
plt.rcParams["figure.figsize"] = (11, 4)

In [2]:
# "Import" functions from previous exercises

import requests

gist = requests.get(
    "https://gist.githubusercontent.com/iibrahimli/3c50f73020c78aeee1de68ae5a0ba5e7/raw/import_funcs.py"
).text
print("Downloaded import_funcs.py")
exec(gist)

import_function_from_ipynb(
    "exercise1.ipynb",
    [
        "plot_signal",
        "my_windowing",
        "acf",
        "estimate_fundamental_frequency",
    ],
)
import_function_from_ipynb(
    "exercise2.ipynb",
    [
        "compute_stft",
    ]
)
import_function_from_ipynb(
    "exercise3.ipynb",
    [
        "estimate_filter",
        "plot_dft",
    ]
)

Downloaded import_funcs.py
Found exercise1.ipynb: C:\Users\Natia_Mestvirishvili\Desktop\UHH\SSP\ssp_sose2023\Exercise1\exercise1.ipynb
Executed function plot_signal
Executed function my_windowing
Executed function acf
Executed function estimate_fundamental_frequency
Successfully imported functions: ['plot_signal', 'my_windowing', 'acf', 'estimate_fundamental_frequency']
Found exercise2.ipynb: C:\Users\Natia_Mestvirishvili\Desktop\UHH\SSP\ssp_sose2023\Exercise2\exercise2.ipynb
Executed function compute_stft
Successfully imported functions: ['compute_stft']
Found exercise3.ipynb: C:\Users\Natia_Mestvirishvili\Desktop\UHH\SSP\ssp_sose2023\Exercise3\exercise3.ipynb
Executed function estimate_filter
Executed function plot_dft
Successfully imported functions: ['estimate_filter', 'plot_dft']


## 1 Noise Power Estimation

In [3]:
x_white, fs = librosa.core.load("AudioFiles/SpeechWhite.wav", sr=None)
print(f"Samples: {len(x_white)}, sampling frequency: {fs} Hz")

Samples: 69466, sampling frequency: 16000 Hz


In [4]:
x_babble, fs = librosa.core.load("AudioFiles/SpeechBabble.wav", sr=None)
print(f"Samples: {len(x_babble)}, sampling frequency: {fs} Hz")

Samples: 69466, sampling frequency: 16000 Hz


In [5]:
# Compute STFT with 
frame_length = 32
frame_shift = 16
sqrt_hann = np.sqrt(get_window("hann", frame_length * fs // 1000))
m_stft, v_freq, frame_centers = compute_stft(x_white, fs, frame_length, frame_shift, sqrt_hann)

Output matches np.fft.rfft: True


In [6]:
# Compute periodogram of speech signal
periodogram = np.square(m_stft) # idk?

# Initialize empty matrices
periodogram_noise_sq = np.zeros((m_stft.shape[0], m_stft.shape[1]))
m_spp =  np.zeros((m_stft.shape[0], m_stft.shape[1]))
m_noise_psd = np.zeros((m_stft.shape[0], m_stft.shape[1]))
Q = np.zeros((m_stft.shape[0], m_stft.shape[1]))

In [7]:
theta = 15
init_sigma_hat = 0.99
init_Q = 0.01

In [8]:
for l in range(0, periodogram.shape[1]):
    for k in range(0, periodogram.shape[0]):
        Y_k_l_squared = math.pow(periodogram[k][l], 2)
        if (l==0):
            sigma_hat_sq = init_sigma_hat
        else:
            sigma_hat_sq = m_noise_psd[k][l-1]
        
        p =  math.pow((math.exp(-1 * Y_k_l_squared * theta / (sigma_hat_sq * (1 + theta))) * (1 + theta) + 1), -1)
        if (l==0):
            Q_k_l = init_Q
        else:
            Q_k_l = 0.9 * Q[k][l-1] + 0.1 * p
        
        if (Q_k_l > 0.99):
            p = min(0.99, p)
        m_spp[k][l] = p
        
        periodogram_noise_sq[k][l] = p * sigma_hat_sq + (1 - p) * abs(Y_k_l_squared)
        m_noise_psd[k][l] = 0.8 * sigma_hat_sq + 0.2 * periodogram_noise_sq[k][l]
        

  Y_k_l_squared = math.pow(periodogram[k][l], 2)


In [9]:
m_spp

array([[0.06003583, 0.10429186, 0.08574982, ..., 1.        , 0.05919796,
        1.        ],
       [0.06062116, 0.05882356, 0.0600817 , ..., 1.        , 0.14277548,
        0.06814633],
       [0.64638109, 0.06887647, 0.06045407, ..., 0.99848228, 0.53760215,
        1.        ],
       ...,
       [0.08824805, 0.05986768, 0.05882354, ..., 0.13605715, 1.        ,
        0.99979477],
       [0.12747706, 0.06103337, 0.06273416, ..., 1.        , 0.28738989,
        0.05885794],
       [0.16195742, 0.05883571, 0.0600112 , ..., 1.        , 1.        ,
        0.93044395]])

In [10]:
# TODO Plot speech presence probabilit
# TODO Plot estimated noise PSD 

## 2 Priori SNR estimation and Wiener filtering

In [11]:
m_enhanced_stft = np.zeros((m_stft.shape[0], m_stft.shape[1]))
init_S = 0 # TODO IDK how to init this
priori_estimates = np.zeros((m_stft.shape[0], m_stft.shape[1]))

alpha = 0.5
G_min = 0

for l in range(0, priori_estimates.shape[1]):
    for k in range(0, priori_estimates.shape[0]):
        if (l==0):
            S = init_S
        else:
            S = math.pow(abs(m_enhanced_stft[k][l-1]), 2)
        
        priori = alpha * S / m_noise_psd[k][l-1]
        priori += (1-alpha)*max((math.pow(periodogram[k][l],2)/m_noise_psd[k][l] - 1),0)
        
        G = max(priori/(1+priori), G_min)
        m_enhanced_stft[k][l] = G * periodogram[k][l]
        

  priori += (1-alpha)*max((math.pow(periodogram[k][l],2)/m_noise_psd[k][l] - 1),0) # max among what? weird
  m_enhanced_stft[k][l] = G * periodogram[k][l]
