# Project 1 Coding Experiments

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.fftpack import dct, idct
import scipy.signal
import scipy.io.wavfile as wav
import os

## TODO 2-6: `framing`

Frame length and frame shift, in samples (25 ms and 10 ms at a 16 kHz sample rate).

In [2]:
# Convert the 25 ms frame length and the 10 ms frame shift into sample counts
# at a 16 kHz sample rate: samples = rate * milliseconds / 1000.
frame_len, frame_shift = (int(round(16000 * ms / 1000.0)) for ms in (25, 10))
print("frame_len: ", frame_len, type(frame_len))
print("frame_shift: ", frame_shift, type(frame_shift))

frame_len:  400 <class 'int'>
frame_shift:  160 <class 'int'>


Zero-padding the end of a signal with `np.pad`

In [3]:
# Toy input: three random samples that will be followed by five zeros.
pad_len = 5
signal = np.random.randn(3)

# pad_width=(0, pad_len): nothing is added in front, pad_len values are appended.
# mode="constant" with constant_values=0 fills the new positions with zeros.
pad_signal = np.pad(signal, pad_width=(0, pad_len), mode="constant", constant_values=0)
print(pad_signal)

[ 0.67549024 -1.11976324  0.38705708  0.          0.          0.
  0.          0.        ]


`frames = pad_signal[indices]`

我要一个和 `indices` 同形状的数组；其中每个位置 `(i, j)` 的值等于 `pad_signal[ indices[i, j] ]`。

In [4]:
# Toy "padded signal": six consecutive integers, so every value is easy to trace.
pad_signal = np.arange(10, 16)
# Index table with one row per frame (frame length 3), built as
# frame start offsets (column) + in-frame offsets (row):
#   frame 0 -> [0, 1, 2]   i.e. pad_signal[0:3]
#   frame 1 -> [2, 3, 4]   i.e. pad_signal[2:5]
#   frame 2 -> [3, 4, 5]   i.e. pad_signal[3:6]
indices = np.array([0, 2, 3])[:, None] + np.arange(3)[None, :]
# Indexing with an integer array returns an array shaped like `indices`,
# where element (i, j) is pad_signal[indices[i, j]].
frames = pad_signal[indices]
print(frames)

[[10 11 12]
 [12 13 14]
 [13 14 15]]


当你用一个整型数组去索引另一个数组时，NumPy 不再把它当成“切片范围”，而是把它当成一张“取样坐标表”，按坐标表把元素逐个取出来，并按索引数组的形状重组。本质上等价于：

```python
frames = np.zeros((num_frames, frame_len), dtype=pad_signal.dtype)
for i in range(num_frames):
    for j in range(frame_len):
        frames[i, j] = pad_signal[indices[i, j]]
```

### Toy Model

In [5]:
# Framing parameters for the toy example: sample rate in Hz, durations in ms.
sample_rate, frame_length_ms, frame_shift_ms = 16000, 25, 10

# Three random samples stand in for a real waveform.
signal = np.random.randn(3)
print("original signal: ", signal)

# Work in float32 from here on. copy=False only skips the copy when the array
# is already float32. With integer input (e.g. PCM samples) the float cast
# would also keep later arithmetic from overflowing an integer dtype.
signal = signal.astype("float32", copy=False)
print("signal for input: ", signal)

original signal:  [ 2.0933675  -0.27234337 -1.44468404]
signal for input:  [ 2.0933676  -0.27234337 -1.444684  ]


In [6]:
# TODO 2: Calculate frame length and frame shift in samples
print("- TODO 2 Inputs -")
print("sample_rate: ", sample_rate)
print("frame_length_ms: ", frame_length_ms)
print("frame_shift_ms: ", frame_shift_ms)
# samples = sample_rate * duration_ms / 1000, rounded to an integer and then
# clamped so that neither the frame length nor the shift drops below 1 sample.
frame_len, frame_shift = (
    max(1, int(round(sample_rate * duration_ms / 1000.0)))
    for duration_ms in (frame_length_ms, frame_shift_ms)
)
print("- TODO 2 Outputs -")
print("frame_len: ", frame_len)
print("frame_shift: ", frame_shift)

- TODO 2 Inputs -
sample_rate:  16000
frame_length_ms:  25
frame_shift_ms:  10
- TODO 2 Outputs -
frame_len:  400
frame_shift:  160


In [7]:
# TODO 3: Calculate total number of frames
print("- TODO 3 Inputs -")
print("signal: ", signal)
print("frame_len: ", frame_len)
print("frame_shift: ", frame_shift)
sig_len = len(signal)
# There is always one frame. Every (possibly partial) frame_shift step needed to
# cover the samples beyond the first frame adds one more. max(0, ...) handles a
# signal that is no longer than a single frame, where the ceil term is <= 0.
num_frames = 1 + max(0, int(np.ceil((sig_len - frame_len) / frame_shift)))
print("- TODO 3 Outputs -")
print("num_frames: ", num_frames)

- TODO 3 Inputs -
signal:  [ 2.0933676  -0.27234337 -1.444684  ]
frame_len:  400
frame_shift:  160
- TODO 3 Outputs -
num_frames:  1


In [8]:
# TODO 4: Pad signal to fit integer number of frames
print("- TODO 4 Inputs -")
print("signal: ", signal)
print("sig_len: ", sig_len)
print("num_frames: ", num_frames)
print("frame_len: ", frame_len)
print("frame_shift: ", frame_shift)
# The last frame starts at (num_frames - 1) * frame_shift and spans frame_len
# samples, so this is the length the padded signal must have.
target_len = frame_len + frame_shift * (num_frames - 1)
pad_len = target_len - sig_len
# Pad only at the end: nothing before the signal, pad_len zeros after it
# ("constant" mode fills with the constant 0). When no padding is needed the
# original array is reused as-is.
pad_signal = np.pad(signal, (0, pad_len), mode="constant") if pad_len > 0 else signal
print("- TODO 4 Outputs -")
print("pad_signal: ", pad_signal)
print("pad_len: ", pad_len)
print("target_len: ", target_len)
print("signal length after padding: ", len(pad_signal))

- TODO 4 Inputs -
signal:  [ 2.0933676  -0.27234337 -1.444684  ]
sig_len:  3
num_frames:  1
frame_len:  400
frame_shift:  160
- TODO 4 Outputs -
pad_signal:  [ 2.0933676  -0.27234337 -1.444684    0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          

In [9]:
# TODO 5: Create frame indices matrix
print("- TODO 5 Inputs -")
print("num_frames: ", num_frames)
print("frame_len: ", frame_len)
print("frame_shift: ", frame_shift)
# Row i holds the sample positions of frame i, i.e. the half-open range
# [i * frame_shift, i * frame_shift + frame_len). It is built by broadcasting a
# column of frame start offsets against a row of in-frame offsets.
# With this table a single NumPy indexing operation (pad_signal[indices]) pulls
# out every frame at once, so later windowing / FFT steps can run on the batch.
indices = (np.arange(num_frames, dtype=int) * frame_shift)[:, None] + np.arange(frame_len, dtype=int)[None, :]
print("- TODO 5 Outputs -")
print("indices: ", indices)
print("indices shape: ", indices.shape)

- TODO 5 Inputs -
num_frames:  1
frame_len:  400
frame_shift:  160
- TODO 5 Outputs -
indices:  [[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
   18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
   36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
   54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
   72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
   90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
  108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
  126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
  144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
  162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
  180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
  198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
  216 217 218 21

In [10]:
# TODO 6: Extract frames using indices
# Gather all frames in one call. For the 1-D padded signal this gives the same
# result as pad_signal[indices]: a matrix shaped like `indices`, i.e.
# (num_frames, frame_len), with one frame per row. The later steps (windowing,
# power spectrum, Mel filter bank) all work row by row, so the matrix layout is
# the most convenient one.
frames = np.take(pad_signal, indices)
print("- TODO 6 Outputs -")
print("frames: ", frames)
print("frames shape: ", frames.shape)

- TODO 6 Outputs -
frames:  [[ 2.0933676  -0.27234337 -1.444684    0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.       

## TODO 7: `windowing`

In [None]:
# TODO 7: Create Hamming window and apply to each frame (refers to PPT slide 32 to write Hamming window)
M = frames.shape[1]  # number of samples per frame (one frame per row)
n = np.arange(M)
print("M: ", M)
# Symmetric Hamming window: w[n] = 0.54 - 0.46 * cos(2*pi*n / (M - 1)), n = 0..M-1.
# A frame length of 1 is allowed upstream (frame_len is clamped to >= 1); in that
# case M - 1 == 0 and the formula would evaluate 0/0 -> NaN. Use an all-ones
# window there instead, which is the same convention np.hamming follows.
if M > 1:
    hamming_window = 0.54 - 0.46 * np.cos(2.0 * np.pi * n / (M - 1))
else:
    hamming_window = np.ones(M)
# hamming_window[None, :] has shape (1, M), so the same window is broadcast
# over every row (frame) of `frames`.
windowed = frames * hamming_window[None, :]
print("hamming_window: ", hamming_window[:5], hamming_window[-5:])
print("hamming_window shape: ", hamming_window.shape)
print("windowed: ", windowed)
print("windowed shape: ", windowed.shape)

M:  400
hamming_window:  [0.08       0.08005703 0.08022812 0.08051322 0.08091226] [0.08091226 0.08051322 0.08022812 0.08005703 0.08      ]
hamming_window shape:  (400,)
windowed:  [[ 0.16746941 -0.021803   -0.11590429  0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0

What if we use M instead of M-1?

In [12]:
# Experiment for TODO 7: the same Hamming formula, but with denominator M
# instead of M - 1, i.e. w[n] = 0.54 - 0.46 * cos(2*pi*n / M) for n = 0..M-1.
# With this denominator the last sample no longer mirrors the first one
# (compare the printed head and tail of the window).
# NOTE: this cell rebinds `hamming_window` and `windowed`, so any cell run after
# it sees the denominator-M variant rather than the M - 1 version.
M = np.shape(frames)[1]  # samples per frame
n = np.arange(M)
print("M: ", M)
hamming_window = 0.54 - 0.46 * np.cos(2.0 * np.pi * n / M)
# Add a leading axis so the window broadcasts across all frames (rows).
windowed = frames * hamming_window[np.newaxis, :]
print("hamming_window: ", hamming_window[:5], hamming_window[-5:])
print("hamming_window shape: ", hamming_window.shape)
print("windowed: ", windowed)
print("windowed shape: ", windowed.shape)

M:  400
hamming_window:  [0.08       0.08005675 0.08022698 0.08051066 0.0809077 ] [0.08141803 0.0809077  0.08051066 0.08022698 0.08005675]
hamming_window shape:  (400,)
windowed:  [[ 0.16746941 -0.02180292 -0.11590264  0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0