## 파일 불러오기

In [None]:
from scipy.io import wavfile
import numpy as np

# Directory holding the training clips and the single clip inspected below.
train_audio_path = 'data/train/audio/'
filename = 'yes/0a7c2a8d_nohash_0.wav'

# wavfile.read yields the sample rate first and the sample array second.
wav_contents = wavfile.read(train_audio_path + filename)
sample_rate, samples = wav_contents
print('sample rate : {}, samples.shape : {}'.format(sample_rate, samples.shape))

## 로그 스펙트로그램 변환

In [5]:
from scipy import signal

def log_specgram(audio, sample_rate, window_size=20, step_size=10, eps=1e-10):
    # nperseg: Length of each segment
    # noverlap: Number of points to overlap between segments
    nperseg = int(round(window_size * sample_rate / 1e3))
    noverlap = int(round(step_size * sample_rate / 1e3))
    freqs, times, spec = signal.spectrogram(audio, fs=sample_rate,
                                            window='hann', nperseg=nperseg,
                                            noverlap=noverlap, detrend=False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)

SyntaxError: invalid non-printable character U+200B (Temp/ipykernel_10588/1637968178.py, line 3)

## 앰플리튜드와 스펙트로그램 플롯 그리기

In [None]:
import matplotlib.pyplot as plt

# Log spectrogram of the loaded clip; rows are time frames, columns frequencies.
freqs, times, spectrogram = log_specgram(samples, sample_rate)

fig = plt.figure(figsize=(14, 8))

# Top panel: raw waveform against time in seconds.
ax1 = fig.add_subplot(211)
ax1.set_title('Raw wave of ' + filename)
ax1.set_ylabel('Amplitude')
# The clip lasts len(samples) / sample_rate seconds and needs exactly one
# x value per sample, so the axis also works for clips that are not 1 s long.
ax1.plot(np.linspace(0, len(samples) / sample_rate, len(samples)), samples)

# Bottom panel: spectrogram, transposed back so frequency runs along y.
ax2 = fig.add_subplot(212)
ax2.imshow(spectrogram.T, aspect='auto', origin='lower',
           extent=[times.min(), times.max(), freqs.min(), freqs.max()])
# Label every 16th frequency bin and time frame to keep the ticks readable.
ax2.set_yticks(freqs[::16])
ax2.set_xticks(times[::16])
ax2.set_title('Spectrogram of ' + filename)
ax2.set_ylabel('Freqs in Hz')
ax2.set_xlabel('Seconds')

## 정규분포로 노멀라이제이션

In [None]:


# Standardise each column of the spectrogram (statistics taken along axis 0)
# to zero mean and unit standard deviation.
mean = np.mean(spectrogram, axis=0)
std = np.std(spectrogram, axis=0)
# A column that never changes has zero deviation; dividing by it would give
# NaN/inf, so divide by 1 there and let the centred column stay at zero.
std[std == 0] = 1.0
spectrogram = (spectrogram - mean) / std
spectrogram.shape

## 사운드가 인식되지 않은 곳 확인

In [None]:
import IPython.display as ipd
import webrtcvad
# Voice activity detector. The constructor argument is the aggressiveness
# mode: webrtcvad accepts 0 to 3, and higher values are more aggressive
# about classifying frames as non-speech.
vad = webrtcvad.Vad(3)

class Frame:
    """A single slice of audio together with its position in the stream.

    Attributes are stored exactly as given:
        bytes: The audio data belonging to this frame.
        timestamp: Start of the frame.
        duration: Length of the frame.
    """

    def __init__(self, bytes, timestamp, duration):
        self.bytes, self.timestamp, self.duration = bytes, timestamp, duration

def frame_generator(frame_duration_ms, audio, sample_rate):
    """Split ``audio`` into consecutive, equally sized ``Frame`` objects.

    Args:
        frame_duration_ms: Requested frame duration in milliseconds.
        audio: Sliceable audio buffer supporting ``len()``.
        sample_rate: Sampling rate in Hz.

    Returns:
        List of ``Frame`` objects. Each frame covers
        ``int(sample_rate * frame_duration_ms / 1000 * 2)`` elements of
        ``audio`` (presumably sized as the byte count of 16-bit PCM --
        confirm against the caller). A frame is only emitted while it ends
        strictly before the end of ``audio``, so a final frame that would
        end exactly at the end is not produced.
    """
    frame_len = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
    frame_seconds = (float(frame_len) / sample_rate) / 2.0

    collected = []
    begin, stamp = 0, 0.0
    while True:
        finish = begin + frame_len
        if not finish < len(audio):
            break
        collected.append(Frame(audio[begin:finish], stamp, frame_seconds))
        # Accumulate rather than multiply so timestamps match frame by frame.
        stamp += frame_seconds
        begin = finish

    return collected

# webrtcvad only accepts frames of 10, 20, or 30 ms.
frame_duration_ms = 10 # ms
# Pass the variable defined just above; the name frame_duration does not exist.
frames = frame_generator(frame_duration_ms, samples, sample_rate)
# Print the index of every frame the detector does not classify as speech.
for i, frame in enumerate(frames):
    if not vad.is_speech(frame.bytes, sample_rate):
        print(i, end=' ')

## 자동으로 사운드 파일 내 Silence 를 자르기

In [None]:


def auto_vad(vad, samples, sample_rate, frame_duration_ms = 10):
    """Cut non-speech stretches out of a clip and return what is left.

    The clip is split with ``frame_generator`` and every frame is classified
    with ``vad.is_speech``. Whenever two consecutive non-speech frames are
    more than two frames apart, the samples between them are kept. The
    stretch after the last non-speech frame is handled the same way, so
    speech at the end of the clip (or a clip with no silence at all) is
    kept as well.

    Args:
        vad: Detector exposing ``is_speech(frame, sample_rate)``.
        samples: Array of audio samples (a NumPy array is expected).
        sample_rate: Sampling rate in Hz.
        frame_duration_ms: Frame duration handed to ``frame_generator``.

    Returns:
        NumPy array with the kept segments concatenated in order; an empty
        array when nothing is kept.
    """
    frames = frame_generator(frame_duration_ms, samples, sample_rate)
    n_frame = len(frames)
    if n_frame == 0:
        return np.asarray(samples[:0])

    # Every frame is a slice of this many elements of ``samples``, so frame
    # index * frame_len is the exact start of that frame. This holds for any
    # clip length, not only for clips lasting exactly one second.
    frame_len = len(frames[0].bytes)

    not_speech = [idx for idx, frame in enumerate(frames)
                  if not vad.is_speech(frame.bytes, sample_rate)]

    segments = []
    prior = 0
    # n_frame acts as a closing boundary so trailing speech is not lost.
    for i in not_speech + [n_frame]:
        if i - prior > 2:
            start = prior * frame_len
            # The tail shorter than one frame was never classified; attach it
            # to the final kept segment instead of silently dropping it.
            end = len(samples) if i == n_frame else i * frame_len
            print(start, end)
            segments.append(samples[start:end])
        prior = i

    if not segments:
        return np.asarray(samples[:0])
    # Join once at the end instead of re-copying on every np.append.
    return np.concatenate(segments)

In [None]:
# Strip the non-speech stretches from the loaded clip, then play the result.
cutted_samples = auto_vad(vad, samples, sample_rate, frame_duration_ms=10)
ipd.Audio(data=cutted_samples, rate=sample_rate)

## 데이터 증강

In [None]:
import numpy as np
import random
import itertools
import librosa
import IPython.display as ipd
import matplotlib.pyplot as plt

In [None]:
def plot_time_series(data):
    """Show ``data`` as a waveform over an x axis normalised to [0, 1]."""
    _, axes = plt.subplots(figsize=(10, 4))
    axes.set_title('Raw wave ')
    axes.set_ylabel('Amplitude')
    # One evenly spaced x position per element of data.
    x_positions = np.linspace(0, 1, len(data))
    axes.plot(x_positions, data)
    plt.show()

# Load the clip used by the augmentation demos, requesting a 22050 Hz rate.
loaded_clip = librosa.load('./input.wav', sr=22050)
data, sr = loaded_clip

print(data.shape)
plot_time_series(data)

#### White Noise

In [None]:
def adding_white_noise(data, sr=22050, noise_rate=0.005):
    """Add Gaussian white noise to a signal, plot it and save it as WAV.

    Args:
        data: Array of audio samples.
        sr: Sample rate passed to the WAV writer.
        noise_rate: Factor scaling the standard-normal noise.

    Returns:
        The noisy signal ``data + noise_rate * noise``.
    """
    # White noise is a common augmentation: one standard-normal draw per sample.
    wn = np.random.randn(len(data))
    data_wn = data + noise_rate*wn
    plot_time_series(data_wn)
    # Save and return the noisy signal, not the untouched input.
    # NOTE(review): librosa.output was removed in librosa 0.8 -- confirm the
    # installed version still provides write_wav.
    librosa.output.write_wav('./white_noise.wav', data_wn, sr=sr)
    print('White Noise 저장 성공')

    return data_wn

#### Shifting

In [None]:
def shifting_sound(data, sr=22050, roll_rate=0.1):
    """Circularly shift a signal, plot it and save it as WAV.

    Args:
        data: Array of audio samples.
        sr: Sample rate passed to the WAV writer.
        roll_rate: Fraction of the signal length to shift by.

    Returns:
        The shifted signal.
    """
    # np.roll wraps around: [1, 2, 3, 4] becomes [4, 1, 2, 3].
    data_roll = np.roll(data, int(len(data) * roll_rate))
    plot_time_series(data_roll)
    # Save and return the shifted signal, not the untouched input.
    # NOTE(review): librosa.output was removed in librosa 0.8 -- confirm the
    # installed version still provides write_wav.
    librosa.output.write_wav('./rolling_sound.wav', data_roll, sr=sr)
    print('rolling_sound 저장 성공')

    return data_roll

#### Stretching

In [None]:
def stretch_sound(data, sr=22050, rate=0.8):
    """Time-stretch a signal, plot it and save it as WAV.

    Args:
        data: Array of audio samples.
        sr: Sample rate passed to the WAV writer.
        rate: Stretch factor handed to ``librosa.effects.time_stretch``.

    Returns:
        The time-stretched signal.
    """
    # Stretching makes the clip sound like a tape that has been pulled out.
    # rate is passed by keyword because newer librosa releases reject it
    # as a positional argument.
    stretch_data = librosa.effects.time_stretch(data, rate=rate)
    plot_time_series(stretch_data)
    # NOTE(review): librosa.output was removed in librosa 0.8 -- confirm the
    # installed version still provides write_wav.
    librosa.output.write_wav('./stretch_data.wav', stretch_data, sr=sr)
    print('stretch_data 저장 성공')

    # Return the augmented signal rather than the untouched input.
    return stretch_data

#### Reverse

In [None]:
def reverse_sound(data, sr=22050):
    """Reverse a signal in time, plot it and save it as WAV.

    Args:
        data: Array of audio samples.
        sr: Sample rate passed to the WAV writer.

    Returns:
        A new array holding ``data`` in reverse order.
    """
    # Play the clip backwards. Slicing replaces the per-element Python loop;
    # copy() gives a fresh contiguous array like the original np.array did.
    data = np.asarray(data)[::-1].copy()
    plot_time_series(data)
    # NOTE(review): librosa.output was removed in librosa 0.8 -- confirm the
    # installed version still provides write_wav.
    librosa.output.write_wav('./reverse_data.wav', data, sr=sr)

    return data

#### Minus

In [None]:
def minus_sound(data, sr=22050):
    """Invert the sign of a signal, plot it and save it as WAV.

    Args:
        data: Array of audio samples.
        sr: Sample rate passed to the WAV writer.

    Returns:
        The sign-inverted signal ``-data``.
    """
    # Flipping the phase; the result sounds the same as the original clip.
    temp_numpy = (-1)*data
    plot_time_series(temp_numpy)
    # NOTE(review): librosa.output was removed in librosa 0.8 -- confirm the
    # installed version still provides write_wav.
    librosa.output.write_wav('./minus_data.wav', temp_numpy, sr=sr)

    # Return the augmented signal rather than the untouched input.
    return temp_numpy