In [35]:
import librosa
import numpy as np
import tensorflow as tf

In [31]:
# Load one sample clip (librosa returns the waveform and its sample rate) and
# turn it into an MFCC feature matrix that the later cells feed to the network.
sample_wav = "./data/Chicken/Chicken (1).wav"
y, sr = librosa.load(sample_wav)
mfcc = librosa.feature.mfcc(sr=sr, y=y)
# Display the feature matrix as the cell output.
mfcc

array([[-2.57598235e+02, -2.19888944e+02, -2.04487532e+02,
        -1.79927271e+02, -1.72124621e+02, -2.09054879e+02,
        -3.03378655e+02, -2.30006406e+02, -1.88592321e+02,
        -1.99354451e+02, -1.74777213e+02, -1.44819457e+02,
        -1.36620498e+02, -1.39215623e+02, -1.40550651e+02,
        -1.36541098e+02, -1.34833889e+02, -1.46329146e+02,
        -1.85799881e+02, -1.69483010e+02, -1.41947614e+02,
        -1.34602353e+02, -1.35321516e+02, -1.34221764e+02,
        -1.30133663e+02, -1.32107087e+02, -1.41659476e+02,
        -1.47893133e+02, -1.45773224e+02, -1.41618445e+02,
        -1.41033936e+02, -1.43148439e+02, -1.50935455e+02,
        -1.55518402e+02, -1.70351541e+02, -1.77251489e+02,
        -1.60863458e+02, -1.76936150e+02, -1.72643285e+02,
        -1.60582259e+02, -1.83023550e+02, -1.95645592e+02,
        -1.99216537e+02, -2.37574411e+02],
       [-5.27355876e+01, -5.84191471e+01, -6.16199599e+01,
        -6.96036903e+01, -7.42125339e+01, -6.97208792e+01,
        -4.00

In [56]:
tf.reset_default_graph()
# Fix the graph-level seed right after the reset so the randomly initialised
# weights (and therefore the printed costs and logits) are reproducible on a
# fresh Restart & Run All.
random_seed = 42
tf.set_random_seed(random_seed)

#########
# Options
######
learning_rate = 0.001
total_epoch = 30
batch_size = 1

# An RNN consumes ordered data, so we have to say how many values it receives
# per step (n_input) and how many steps make up one example (n_step).
# Here one example is the MFCC matrix: each row is one step and the row length
# is the per-step input width.
# NOTE(review): librosa.feature.mfcc returns (n_mfcc, n_frames), so as written
# the RNN steps over the coefficient index rather than over time frames.
# Feeding mfcc.T (n_step = frames, n_input = coefficients) is probably what is
# intended, but every cell that feeds X would have to change with it - confirm
# before switching.
n_input = mfcc.shape[1]
n_step = mfcc.shape[0]
n_hidden = 128
n_class = 4

#########
# Model definition
######
X = tf.placeholder(tf.float32, [None, n_step, n_input])
Y = tf.placeholder(tf.float32, [None, n_class])

# Output projection from the RNN hidden state to the class logits.
W = tf.Variable(tf.random_normal([n_hidden, n_class]))
b = tf.Variable(tf.random_normal([n_class]))

# Recurrent cell used by the network. Swapping in BasicLSTMCell or GRUCell
# changes the cell structure without touching the rest of the graph.
cell = tf.nn.rnn_cell.BasicRNNCell(n_hidden)

# tf.nn.dynamic_rnn unrolls the cell over the step axis of X for us, instead of
# a hand-written loop that calls the cell once per step and threads the state.
outputs, states = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32)

# Only the last step's output is used for classification:
# outputs : [batch_size, n_step, n_hidden] -> [batch_size, n_hidden]
outputs = outputs[:, -1, :]
model = tf.matmul(outputs, W) + b

cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=model, labels=Y))
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

#########
# Training
######
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# The "dataset" is the single clip loaded above, so one batch per epoch.
total_batch = 1

# The batch never changes, so build it once instead of on every iteration.
# X is reshaped to the [batch_size, n_step, n_input] layout the placeholder expects;
# the label is a hard-coded one-hot vector selecting class 0.
batch_xs = mfcc.reshape((batch_size, n_step, n_input))
batch_ys = np.array([[1, 0, 0, 0]], dtype='float32')

for epoch in range(total_epoch):
    total_cost = 0

    for i in range(total_batch):
        _, cost_val = sess.run([optimizer, cost],
                               feed_dict={X: batch_xs, Y: batch_ys})
        total_cost += cost_val

    print('Epoch:', '%04d' % (epoch + 1),
          'Avg. cost =', '{:f}'.format(total_cost / total_batch))

print('최적화 완료!')

#########
# Check the result
######
# Raw logits for the training clip. `model` does not depend on Y, so only X is fed.
sess.run(model, feed_dict={X: batch_xs})



Epoch: 0001 Avg. cost = 21.324923
Epoch: 0002 Avg. cost = 10.224458
Epoch: 0003 Avg. cost = 2.260748
Epoch: 0004 Avg. cost = 0.013520
Epoch: 0005 Avg. cost = 0.000075
Epoch: 0006 Avg. cost = 0.000002
Epoch: 0007 Avg. cost = 0.000000
Epoch: 0008 Avg. cost = 0.000000
Epoch: 0009 Avg. cost = 0.000000
Epoch: 0010 Avg. cost = 0.000000
Epoch: 0011 Avg. cost = 0.000000
Epoch: 0012 Avg. cost = 0.000000
Epoch: 0013 Avg. cost = 0.000000
Epoch: 0014 Avg. cost = 0.000000
Epoch: 0015 Avg. cost = 0.000000
Epoch: 0016 Avg. cost = 0.000000
Epoch: 0017 Avg. cost = 0.000000
Epoch: 0018 Avg. cost = 0.000000
Epoch: 0019 Avg. cost = 0.000000
Epoch: 0020 Avg. cost = 0.000000
Epoch: 0021 Avg. cost = 0.000000
Epoch: 0022 Avg. cost = 0.000000
Epoch: 0023 Avg. cost = 0.000000
Epoch: 0024 Avg. cost = 0.000000
Epoch: 0025 Avg. cost = 0.000000
Epoch: 0026 Avg. cost = 0.000000
Epoch: 0027 Avg. cost = 0.000000
Epoch: 0028 Avg. cost = 0.000000
Epoch: 0029 Avg. cost = 0.000000
Epoch: 0030 Avg. cost = 0.000000
최적화 완료!


array([[ 16.465954,  -8.166804, -13.614468, -14.230664]], dtype=float32)

# 모델 테스트

In [61]:
# Load a clip and compute its MFCC matrix the same way the training cell did.
# NOTE(review): this is the same file the network was trained on, so the result
# only shows the model reproduces its training logits, not that it generalises.
y, sr = librosa.load("./data/Chicken/Chicken (1).wav")
test_mfcc = librosa.feature.mfcc(y=y, sr=sr)

test_batch_size = 1
# Arrange as [batch, n_step, n_input] to match placeholder X; reshape raises if
# the clip's MFCC matrix does not have n_step * n_input values.
test_xs = test_mfcc.reshape(test_batch_size, n_step, n_input)

# Inference only needs X: `model` is computed from X alone, so no label is fed.
sess.run(model, feed_dict={X: test_xs})

array([[ 16.465954,  -8.166804, -13.614468, -14.230664]], dtype=float32)