<a href="https://colab.research.google.com/github/k2-fsa/colab/blob/master/fbank_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 介绍
本notebook 演示如何计算语音识别中的 fbank 特征。


# 生成一段随机的数据

- 假设采样率为 16000
- 生成 1 秒的随机数


In [None]:
import numpy as np
# Seed NumPy's global random generator so every run yields the same waveform.
np.random.seed(202250927)

sample_rate = 16000  # samples per second
num_seconds = 1  # duration of the synthetic signal, in seconds

# Draw uniformly distributed noise from [-1, 1) to stand in for real audio.
num_samples = sample_rate * num_seconds
samples = np.random.uniform(-1, 1, (num_samples,))
print(samples.shape)

(16000,)


# 计算前两帧特征

为了方便起见，我们只取前 3 帧的采样点，并且只对其中前 2 帧计算特征。

In [None]:
frame_length_ms = 25  # analysis window duration, in milliseconds
frame_shift_ms = 10  # hop between consecutive frames, in milliseconds

# Convert both durations from milliseconds to a number of samples.
frame_length = int(frame_length_ms * sample_rate / 1000)
frame_shift = int(frame_shift_ms * sample_rate / 1000)

# Frame i covers samples [i * frame_shift, i * frame_shift + frame_length).
frame_samples_0, frame_samples_1, frame_samples_2 = (
    samples[start:start + frame_length]
    for start in (0, frame_shift, 2 * frame_shift)
)

print(frame_samples_0.shape, frame_samples_1.shape, frame_samples_2.shape)

(400,) (400,) (400,)


## 去掉直流偏移

In [None]:
def remove_dc_offset(samples):
    """Shift a frame so that its mean becomes zero.

    Args:
      samples: NumPy array of waveform samples (one frame in this notebook).

    Returns:
      ``samples`` minus its arithmetic mean as computed by ``np.mean``.
    """
    dc_offset = np.mean(samples)
    centered = samples - dc_offset
    return centered

## 预加重

In [None]:
def preemphasize(samples, coeff=0.97):
    """Apply a first-order pre-emphasis filter to one frame.

    Computes ``y[n] = x[n] - coeff * x[n - 1]`` for ``n >= 1``. The first
    sample has no predecessor, so it is filtered against itself:
    ``y[0] = x[0] - coeff * x[0]``.

    Args:
      samples: 1-D sequence of samples. Integer (or boolean) input is promoted
        to float64 first so that the fractional part of the result is not
        truncated when it is stored.
      coeff: Pre-emphasis coefficient.

    Returns:
      A new array with the same length as ``samples``; the input is left
      untouched. An empty input yields an empty array.
    """
    samples = np.asarray(samples)
    if not np.issubdtype(samples.dtype, np.inexact):
        # np.empty_like would otherwise allocate an integer buffer and the
        # filtered values would be silently truncated on assignment.
        samples = samples.astype(np.float64)

    ans = np.empty_like(samples)
    if samples.shape[0] == 0:
        # Nothing to filter; indexing samples[0] below would raise IndexError.
        return ans

    ans[0] = samples[0] - coeff * samples[0]
    ans[1:] = samples[1:] - coeff * samples[:-1]

    return ans


## 加窗

In [None]:
def get_hann_window(n: int):
    """Return a periodic Hann window with ``n`` points.

    Follows the default (``periodic=True``) behaviour of ``torch.hann_window``,
    see https://docs.pytorch.org/docs/stable/generated/torch.hann_window.html:

        w[k] = 0.5 * (1 - cos(2 * pi * k / n)),  k = 0, ..., n - 1

    The symmetric variant divides by ``n - 1`` instead of ``n``.

    NOTE(review): kaldi-native-fbank's ``"hann"`` window type is expected to
    use this same periodic convention -- confirm against the library source.

    Args:
      n: Number of points in the window.

    Returns:
      A 1-D float array of length ``n``.
    """
    if n == 1:
        # torch.hann_window(1) returns a single 1; mirror that special case.
        return np.ones(1)

    k = np.arange(n)
    return 0.5 * (1 - np.cos(2 * np.pi * k / n))

def apply_window(samples, window):
    """Taper a frame by multiplying it element-wise with a window.

    Args:
      samples: NumPy array holding the samples of one frame.
      window: Window coefficients, broadcast-compatible with ``samples``.

    Returns:
      The element-wise product ``samples * window``.
    """
    windowed = samples * window
    return windowed

## 计算 FFT

In [None]:
def compute_fft(samples, nfft=512):
    """Compute the one-sided FFT of a real-valued frame.

    Args:
      samples: Real-valued samples of one frame.
      nfft: FFT size handed to ``np.fft.rfft``, which zero-pads or crops the
        input to this many points.

    Returns:
      The complex spectrum returned by ``np.fft.rfft``; it has
      ``nfft // 2 + 1`` bins when ``nfft`` is even.
    """
    spectrum = np.fft.rfft(samples, n=nfft)
    return spectrum

## 计算 功率谱

In [None]:
def compute_power_spectrum(fft_bins):
    """Turn complex FFT bins into a power spectrum.

    Args:
      fft_bins: Complex FFT coefficients.

    Returns:
      The squared magnitude of every bin.
    """
    magnitude = np.abs(fft_bins)
    return np.square(magnitude)

## 乘以 Mel filter bank 矩阵

In [None]:
%%shell

# Install kaldi-native-fbank, which later cells import as `knf`.
pip install kaldi-native-fbank

Collecting kaldi-native-fbank
  Downloading kaldi_native_fbank-1.22.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.3 kB)
Downloading kaldi_native_fbank-1.22.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (322 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.0/322.0 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: kaldi-native-fbank
Successfully installed kaldi-native-fbank-1.22.2




In [None]:
import kaldi_native_fbank as knf

def get_mel_filter_bank_matrix(num_bins=23):
    """Build a mel filter bank matrix with kaldi-native-fbank.

    Every option other than the number of bins keeps the library default of
    ``knf.MelBanksOptions`` / ``knf.FrameExtractionOptions``.

    Args:
      num_bins: Number of mel bins requested from the library. Defaults to
        23, the value used throughout this notebook.

    Returns:
      The matrix reported by ``knf.MelBanks.get_matrix()``. With the default
      arguments its shape is ``(23, 257)``.
    """
    mel_opts = knf.MelBanksOptions()
    mel_opts.num_bins = num_bins

    frame_opts = knf.FrameExtractionOptions()
    mel_bank = knf.MelBanks(opts=mel_opts, frame_opts=frame_opts)
    return mel_bank.get_matrix()


matrix = get_mel_filter_bank_matrix()
print(matrix.shape)

(23, 257)


In [None]:
def compute_fbank(samples):
    """Compute log mel filter bank (fbank) features for a single frame.

    The frame goes through, in order: DC offset removal, pre-emphasis, Hann
    windowing, FFT, power spectrum, projection onto the mel filter bank and
    finally a natural logarithm.

    Args:
      samples: 1-D NumPy array with the samples of one frame.

    Returns:
      1-D array with one log energy per mel bin.
    """
    samples = remove_dc_offset(samples)
    samples = preemphasize(samples)

    window = get_hann_window(samples.shape[0])
    samples = apply_window(samples, window)

    fft_bins = compute_fft(samples)
    power_spec = compute_power_spectrum(fft_bins)

    # Each row of the matrix holds the weights of one mel filter, so the
    # matrix-vector product yields one energy per mel bin.
    mel_matrix = get_mel_filter_bank_matrix()
    mel_energies = np.matmul(mel_matrix, power_spec)

    # Floor the energies before taking the log: this avoids np.log(0) and
    # also clamps tiny positive values that would give huge negative logs.
    floor = np.finfo(float).eps
    return np.log(np.maximum(mel_energies, floor))

In [None]:
# Run the hand-written pipeline on the first two frames.
feature_frame_0, feature_frame_1 = (
    compute_fbank(frame) for frame in (frame_samples_0, frame_samples_1)
)
print(feature_frame_0.shape, feature_frame_1.shape, sep="\n")

(23,)
(23,)


In [None]:
# Compute the same features with kaldi-native-fbank to serve as a reference.
opts = knf.FbankOptions()
opts.frame_opts.window_type = "hann"
# Turn dithering off explicitly so the reference is deterministic and can be
# compared against the hand-written pipeline, which applies no dither.
# NOTE(review): the library default for dither may be non-zero -- confirm.
opts.frame_opts.dither = 0
opts.mel_opts.num_bins = 23

extractor = knf.OnlineFbank(opts)
extractor.accept_waveform(sample_rate, samples.tolist())
extractor.input_finished()

# Expected number of frames: (16000 - 400) // 160 + 1 = 98
print("num_frames_ready", extractor.num_frames_ready)

f0 = extractor.get_frame(0)
f1 = extractor.get_frame(1)

num_frames_ready 98


In [None]:
# Largest absolute difference between our features and the reference ones.
for ours, reference in ((feature_frame_0, f0), (feature_frame_1, f1)):
    abs_diff = np.abs(np.array(ours) - np.array(reference))
    print(abs_diff.max())

0.01599148409710116
0.00430692115023712
