In [18]:
import matplotlib.pyplot as plt
import IPython.display as ipd
import librosa
import librosa.display
import numpy as np

In [19]:
### Function that converts a .pcm file into a .wav file
import wave

# Raw PCM has no header, so channels, bit_depth and sampling_rate cannot be
# detected from the file; they must be known in advance and passed in.
def pcm2wav( pcm_file, wav_file, channels=1, bit_depth=16, sampling_rate=16000 ):
    """Copy the bytes of a raw PCM file into a WAV file with the given format.

    Args:
        pcm_file: Path of the raw PCM file; it is read in full as binary.
        wav_file: Path of the WAV file to write (opened in 'wb' mode).
        channels: Channel count recorded in the WAV header.
        bit_depth: Bits per sample; must be a multiple of 8 because the WAV
            header stores the sample width in whole bytes.
        sampling_rate: Frame rate recorded in the WAV header.

    Raises:
        ValueError: If bit_depth is not a multiple of 8.
    """
    # Check if the options are valid.
    if bit_depth % 8 != 0:
        raise ValueError("bit_depth "+str(bit_depth)+" must be a multiple of 8.")

    # Read the whole .pcm payload first; the input handle does not need to
    # stay open while the output is being written.
    with open( pcm_file, 'rb') as opened_pcm_file:
        pcm_data = opened_pcm_file.read()

    # Use the writer as a context manager so the output file is closed even
    # when one of the setters or writeframes() raises.
    with wave.open( wav_file, 'wb') as obj2write:
        obj2write.setnchannels( channels )
        obj2write.setsampwidth( bit_depth // 8 )
        obj2write.setframerate( sampling_rate )
        obj2write.writeframes( pcm_data )

In [49]:
def one_sec_cutter(path):
    """Split an audio file into one-second clips and compute MFCCs per clip.

    The file is loaded through librosa at 16 kHz and cut into consecutive,
    non-overlapping windows of one second each. Trailing samples that do not
    fill a whole second are dropped.

    Args:
        path: Path of an audio file that librosa.load can read.

    Returns:
        A tuple (raw_npy, mfcc_npy). raw_npy stacks the clips along axis 0,
        one row of 16000 samples per clip. mfcc_npy stacks the matching
        librosa.feature.mfcc outputs along axis 0. When the file is shorter
        than one second, empty float32 arrays of shape (0, 16000) and
        (0, 20, 32) are returned.
    """
    x, Fs = librosa.load(path, sr=16000)

    num_1sec = len(x) // Fs

    # Collect clips in lists and stack once at the end; concatenating onto a
    # growing array inside the loop re-copies all previous clips every pass.
    raw_clips = []
    mfcc_clips = []
    for j in range(num_1sec):
        cutter_1sec = x[Fs * j:Fs * (j + 1)]
        raw_clips.append(cutter_1sec)
        # Pass the real sampling rate: without sr, librosa.feature.mfcc
        # assumes 22050 Hz and builds its mel filterbank for the wrong rate.
        # NOTE(review): this changes the MFCC values compared with features
        # produced by the earlier version of this function.
        mfcc_clips.append(librosa.feature.mfcc(y=cutter_1sec, sr=Fs))

    if not raw_clips:
        # np.stack rejects an empty list, so keep the original empty shapes.
        return np.empty((0, 16000), np.float32), np.empty((0, 20, 32), np.float32)

    return np.stack(raw_clips, axis=0), np.stack(mfcc_clips, axis=0)

In [4]:
### Convert the newly received Korean speech recordings, files 1 through 1000, to .wav
for i in range(1, 1001):
    # Input and output files share the same zero-padded six-digit index.
    file_index = '{0:06d}'.format(i)
    pcm_path = './audio_data/KsponSpeech/KsponSpeech_0001/KsponSpeech_' + file_index + '.pcm'
    wav_path = './audio_data/Kspon_wav/KsponSpeech_wav_0001/Kspon_wav_' + file_index + '.wav'
    pcm2wav(pcm_path, wav_path, channels=1, bit_depth=16, sampling_rate=16000)

In [50]:
# Gather the per-file results in lists and join them once after the loop.
# Concatenating onto the growing arrays on every iteration re-copies all
# clips collected so far, which is quadratic in the total amount of data.
# The lists start with the empty float32 arrays so that the final shapes and
# dtypes are derived exactly as before.
kspon_wav_chunks = [np.empty((0, 16000), np.float32)]
kspon_mfcc_chunks = [np.empty((0, 20, 32), np.float32)]

for i in range(1, 1001):
    wav_path = './audio_data/Kspon_wav/KsponSpeech_wav_0001/Kspon_wav_' + '{0:06d}'.format(i) + '.wav'

    raw_npy, mfcc_npy = one_sec_cutter(wav_path)
    kspon_wav_chunks.append(raw_npy)
    kspon_mfcc_chunks.append(mfcc_npy)

kspon_wav_npy = np.concatenate(kspon_wav_chunks, axis=0)
kspon_mfcc_npy = np.concatenate(kspon_mfcc_chunks, axis=0)

In [51]:
# Sanity check: report the shape of each accumulated Kspon array.
for _label, _array in (
    ('shape of kspon_wav_npy:', kspon_wav_npy),
    ('shape of kspon_mfcc_npy:', kspon_mfcc_npy),
):
    print(_label, _array.shape)

shape of kspon_wav_npy: (4993, 16000)
shape of kspon_mfcc_npy: (4993, 20, 32)


In [69]:
# Write the Kspon clips and their MFCCs to disk as .npy files.
for _target, _array in (
    ('./audio_data/npy_data/Kspon_wav_npy.npy', kspon_wav_npy),
    ('./audio_data/npy_data/kspon_mfcc_npy.npy', kspon_mfcc_npy),
):
    np.save(_target, _array)

In [66]:
# Cut the window-noise recording into one-second clips with matching MFCCs.
noise_path = 'audio_data/noise_mp3/noise_window.mp3'
(noise_mp3_npy, noise_mfcc_npy) = one_sec_cutter(path=noise_path)

In [67]:
# Sanity check: report the shape of each noise array.
for _label, _array in (
    ('shape of noise_mp3_npy:', noise_mp3_npy),
    ('shape of noise_mfcc_npy:', noise_mfcc_npy),
):
    print(_label, _array.shape)

shape of noise_mp3_npy: (1201, 16000)
shape of noise_mfcc_npy: (1201, 20, 32)


In [70]:
# Write the noise clips and their MFCCs to disk as .npy files.
# This cell previously passed kspon_wav_npy / kspon_mfcc_npy here, which
# stored the speech arrays under the noise filenames.
np.save('./audio_data/npy_data/noise_window_mp3_npy.npy', noise_mp3_npy)
np.save('./audio_data/npy_data/noise_window_mfcc_npy.npy', noise_mfcc_npy)