# prj :  Spectrogram classification 모델 구현

- 2차원 데이터 받아 
- 기본버전과 skip connection 버전 모델 실습


## import library

In [1]:
import numpy as np
import os
import matplotlib.pyplot as plt

import IPython.display as ipd
import random

from sklearn.model_selection import train_test_split

import tensorflow as tf

from tensorflow.keras import layers

import librosa

## data load

In [2]:
data_path = os.getenv("HOME")+'/aiffel/AIFFEL_LSG/utill/speech_wav_8000.npz'
speech_data = np.load(data_path)

In [3]:
print("Wave data shape : ", speech_data["wav_vals"].shape)
print("Label data shape : ", speech_data["label_vals"].shape)
print("✅")

Wave data shape :  (50620, 8000)
Label data shape :  (50620, 1)
✅


In [31]:
# 데이터 선택
rand = random.randint(0, len(speech_data["wav_vals"]))
print("rand num : ", rand)

sr = 8000 # 1초동안 재생되는 샘플의 갯수
data = speech_data["wav_vals"][rand]
print("Wave data shape : ", data.shape)
print("label : ", speech_data["label_vals"][rand])

rand num :  25714
Wave data shape :  (8000,)
label :  ['left']


## 데이터 처리와 분류

###   2차원 Spectrogram 변형`

In [19]:
def wav2spec(wav, fft_size=258): # spectrogram shape을 맞추기위해서 size 변형
    D = np.abs(librosa.stft(wav, n_fft=fft_size))
    return D

In [21]:
spec = wav2spec(data)
print("Waveform shape : ",data.shape)
print("Spectrogram shape : ",spec.shape)

Waveform shape :  (8000,)
Spectrogram shape :  (130, 126)


In [36]:
speech_data["wav_vals"].shape

(50620, 8000)

In [28]:

for data in speech_data["wav_vals"]:
    data[1], data[2] = wav2spec(data[1])




In [30]:
type(spec_data)

tuple

### label data 처리

In [8]:
target_list = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']

label_value = target_list
label_value.append('unknown')
label_value.append('silence')

new_label_value = dict()
for i, l in enumerate(label_value):
    new_label_value[l] = i
label_value = new_label_value

In [9]:
temp = []

for v in speech_data["label_vals"]:
    temp.append(label_value[v[0]])
label_data = np.array(temp)


In [10]:
label_value

{'yes': 0,
 'no': 1,
 'up': 2,
 'down': 3,
 'left': 4,
 'right': 5,
 'on': 6,
 'off': 7,
 'stop': 8,
 'go': 9,
 'unknown': 10,
 'silence': 11}

### data set 분리

In [11]:
from sklearn.model_selection import train_test_split

sr = 8000
train_wav, test_wav, train_label, test_label = train_test_split(speech_data["wav_vals"], 
                                                                label_data, 
                                                                test_size=0.1,
                                                                shuffle=True)
print(train_wav)



[[ 1.0980472e-03  2.8733537e-04 -1.0553877e-03 ... -2.0539344e-03
  -1.0159509e-03 -6.5056859e-03]
 [ 4.0703264e-05  8.0593927e-05  1.5963767e-04 ... -2.2723171e-04
  -2.1236257e-04 -9.1046393e-05]
 [-6.5024084e-05 -9.9140612e-05 -5.0194823e-05 ... -1.7721154e-04
  -9.7956145e-05 -6.1955427e-05]
 ...
 [ 9.6120086e-05  1.3044180e-04  8.1272890e-05 ... -7.2657538e-04
  -3.7950958e-04 -1.4070658e-03]
 [ 2.0794477e-04  6.3185429e-04  5.6122569e-04 ...  8.2277751e-04
   5.7218206e-04  6.1433262e-04]
 [-1.0141632e-02 -2.6850286e-03 -1.6197972e-02 ...  1.8604577e-02
   1.8603582e-02  1.0448390e-02]]


In [12]:
train_wav = train_wav.reshape([-1, sr, 1]) # add channel for CNN
test_wav = test_wav.reshape([-1, sr, 1])
print("✅")

✅


In [19]:
print("train data : ", train_wav.shape)
print("train labels : ", train_label.shape)
print("test data : ", test_wav.shape)
print("test labels : ", test_label.shape)
print("✅")

train data :  (45558, 8000)
train labels :  (45558,)
test data :  (5062, 8000)
test labels :  (5062,)
✅


In [20]:
spec

array([[1.16182379e-02, 5.26165823e-03, 2.67291069e-03, ...,
        1.45228738e-02, 7.73314834e-02, 1.02990456e-01],
       [1.31122330e-02, 7.66591122e-03, 6.99189631e-03, ...,
        6.44308031e-02, 5.28991707e-02, 6.11193180e-02],
       [6.47677993e-03, 1.29101723e-02, 1.57861914e-02, ...,
        2.88257431e-02, 6.11017924e-03, 7.61824055e-03],
       ...,
       [5.14591229e-04, 3.21652158e-04, 8.42790687e-05, ...,
        1.31511217e-04, 4.84432036e-04, 1.00904074e-03],
       [3.37703910e-04, 1.65379184e-04, 5.02230323e-05, ...,
        1.67718190e-05, 4.97801579e-04, 9.31385206e-04],
       [2.41642047e-04, 1.13796814e-04, 5.46092042e-06, ...,
        5.32758804e-06, 4.79746843e-04, 9.48845642e-04]], dtype=float32)

## 학습 하이퍼 파리미터 설정

In [14]:
batch_size = 128
max_epochs = 10

# the save point
checkpoint_dir = os.getenv('HOME')+'/aiffel/AIFFEL_LSG/utill/speech_recognition/checkpoint/wav-spec'

checkpoint_dir

'/home/aiffel0042/aiffel/AIFFEL_LSG/utill/speech_recognition/checkpoint/wav-spec'

## dataset 구성

In [None]:
def one_hot_label(wav, label):
    label = tf.one_hot(label, depth=12)
    return wav, label
print("✅")

In [None]:
# for train
train_dataset = tf.data.Dataset.from_tensor_slices((train_wav, train_label))
train_dataset = train_dataset.map(one_hot_label)
train_dataset = train_dataset.repeat().batch(batch_size=batch_size)
print(train_dataset)

# for test
test_dataset = tf.data.Dataset.from_tensor_slices((test_wav, test_label))
test_dataset = test_dataset.map(one_hot_label)
test_dataset = test_dataset.batch(batch_size=batch_size)
print(test_dataset)
print("✅")

##  model 

- 2차원 Spectrogram 데이터의 시간축 방향으로 Conv1D layer를 적용, 혹은 Conv2D layer를 적용 가능
- batchnorm, dropout, dense layer 등을 이용
- 12개의 단어 class를 구분하는 loss를 사용하고 Adam optimizer를 사용
- 모델 가중치를 저장하는 checkpoint callback 함수 추가


### 일반 모델

In [16]:
from tensorflow.keras import layers

input_tensor = layers.Input(shape=(sr, 1)) ## TODO

x = layers.Conv1D(32, 9, padding='same', activation='relu')(input_tensor)
x = layers.Conv1D(32, 9, padding='same', activation='relu')(x)
x = layers.MaxPool1D()(x)

x = layers.Conv1D(64, 9, padding='same', activation='relu')(x)
x = layers.Conv1D(64, 9, padding='same', activation='relu')(x)
x = layers.MaxPool1D()(x)

x = layers.Conv1D(128, 9, padding='same', activation='relu')(x)
x = layers.Conv1D(128, 9, padding='same', activation='relu')(x)
x = layers.Conv1D(128, 9, padding='same', activation='relu')(x)
x = layers.MaxPool1D()(x)

x = layers.Conv1D(256, 9, padding='same', activation='relu')(x)
x = layers.Conv1D(256, 9, padding='same', activation='relu')(x)
x = layers.Conv1D(256, 9, padding='same', activation='relu')(x)
x = layers.MaxPool1D()(x)
x = layers.Dropout(0.3)(x)

x = layers.Flatten()(x)
x = layers.Dense(256)(x)
x = layers.BatchNormalization()(x)
x = layers.Activation('relu')(x)

output_tensor = layers.Dense(12)(x)

model_wav = tf.keras.Model(input_tensor, output_tensor)

model_wav.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 8000, 1)]         0         
_________________________________________________________________
conv1d (Conv1D)              (None, 8000, 32)          320       
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 8000, 32)          9248      
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 4000, 32)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 4000, 64)          18496     
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 4000, 64)          36928     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 2000, 64)         

In [None]:
##### loss #######
optimizer=tf.keras.optimizers.Adam(1e-4)
model_wav.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
             optimizer=optimizer,
             metrics=['accuracy'])
print("✅")

In [None]:
## train

# check point 저장
cp_callback = tf.keras.callbacks.ModelCheckpoint(checkpoint_dir,
                                                 save_weights_only=True,
                                                 monitor='val_loss',
                                                 mode='auto',
                                                 save_best_only=True,


#30분 내외 소요
history_wav = model_wav.fit(train_dataset, epochs=max_epochs,
                    steps_per_epoch=len(train_wav) // batch_size,
                    validation_data=test_dataset,
                    validation_steps=len(test_wav) // batch_size,
                    callbacks=[cp_callback]
                    )




## 학습 그래프 출력

## 성능평가