<div style="text-align: right"> - last read : 2019. 06. 3 </div>


## Sound Source Location (방위각 추정) and VAD as Multi-task learning
- Based on STFT magnitude and phase.  
- 2 Models: Convolutional GRU & 2D-CNN then 2D-CNN

### Model 시험 함수의 Inputs 형태 : 
- `predict_utterances(model_path, X, test_idx)` 
    - model_path : 테스트할 모델의 path
    - X : ndarray. 오디오 샘플(파일)들. 가령 50개의 오디오 파일이 있으면 X 의 길이는 50
      - 각 오디오 샘플은 1.16 초 길이의 ndarray 형태의 instance 들로 구성.  
      - 각 instance는 (512, 100, 4)의 shape를 갖음.  frame 간격은 약 11.6 msec, instance 간격은 약 116 msec (10 frames).  
        - 512 : height of STFT.
        - 100 : number of frames.  About 1.16 sec duration
        - 4 : Four channels (left mag, left phase, right mag, right phase)
    - X[7] : 8번째 오디오 파일.  
    - X[7].shape == (17, 512, 100, 4) 이라면 : 8번째 오디오 파일이 17개의 instance로 구성.
- test_idx : numpy vector.  Indices of X to consider. 


### Model 시험 함수의 출력 : 
- test_idx 길이 만큼의 sample 들 내의 instance들에 대한 예측 

In [1]:
# Common imports
import numpy as np
import pandas as pd
import os, sys, glob  
import tensorflow as tf

import librosa
import librosa.display

# To plot pretty figures
# import matplotlib
# import matplotlib.pyplot as plt
# %matplotlib inline
# plt.style.use('ggplot')
# plt.rcParams['axes.labelsize'] = 14
# plt.rcParams['xtick.labelsize'] = 12
# plt.rcParams['ytick.labelsize'] = 12

def reset_graph(seed=42):
    """Clear the default TensorFlow graph and reseed TensorFlow and NumPy.

    seed : value handed to both ``tf.set_random_seed`` and ``np.random.seed``.
    """
    np.random.seed(seed)
    # Reset before seeding so the TensorFlow seed is set after the reset call.
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    
def reset_keras_session(seed=42):
    """Clear the Keras backend session and reseed TensorFlow and NumPy.

    seed : value handed to both ``tf.set_random_seed`` and ``np.random.seed``.
    """
    np.random.seed(seed)
    # Clear before seeding so the TensorFlow seed is set after the clear call.
    tf.keras.backend.clear_session()
    tf.set_random_seed(seed)

import warnings
# Install a blanket "ignore" filter: no warning of any category is shown after this line.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")   # suppress all warnings

# Resolve the per-machine base directories.
if sys.platform == 'win32':   # Windows: everything lives on the D: drive
    home = os.path.join('D:', os.sep, 'hblee')   # d:\hblee
    data_repo = os.path.join('D:', os.sep, 'Data_Repo_Win')   # d:\Data_Repo_Win
else:   # Linux and any other platform (e.g. macOS): use the user's home directory
    # Previously only "linux"/"linux2" were handled, leaving `home` undefined
    # (NameError below) on every other non-Windows platform.
    home = os.path.expanduser("~")
    data_repo = os.path.join(home, 'Data_Repo')

# Make the local helper modules importable.
sys.path.append(os.path.join(home, 'Google_Sync', 'Dev_Exercise', 'utils'))
# Wildcard import of the local `tf_utils` module; it must be reachable through sys.path.
from tf_utils import *
    
from tensorflow import keras 
# Notebook display: (Keras version, TensorFlow version)
keras.__version__, tf.VERSION

ModuleNotFoundError: No module named 'tf_utils'

In [None]:
# samples         : audio samples (wav files).  50 of them.
# samples_vad_seg : the same samples segmented into voice regions (1) and non-voice regions (0).
#
# Both lists are sorted so that samples[i] and samples_vad_seg[i] are meant to
# describe the same recording; they must stay aligned.

sample_data_repo = os.path.join('..', 'Data', 'sample_data', 't3_audio')
samples = sorted(glob.glob(os.path.join(sample_data_repo, '**', '*wav'), recursive=True))

sample_vad_seg_repo = os.path.join('..', 'Data', 'binary_segment')   # adjust to your layout
# glob has no alternation syntax: the former '*[npy|npz]' was a character class
# matching ANY file whose name ends in n, p, y, z or '|'.  Query each extension explicitly.
samples_vad_seg = sorted(
    path
    for ext in ('npy', 'npz')
    for path in glob.glob(os.path.join(sample_vad_seg_repo, '**', '*.' + ext), recursive=True)
)

#  Checking
if len(samples) != len(samples_vad_seg):
    print('WARNING: samples and samples_vad_seg differ in length; they are not aligned')

probe = 25   # arbitrary index used for a spot check; skipped when there are fewer files
print('samples: ', len(samples), samples[probe] if len(samples) > probe else None)
print('samples segmented: ', len(samples_vad_seg),
      samples_vad_seg[probe] if len(samples_vad_seg) > probe else None)

# data set 만들기

In [3]:
def mag_phase(file_path, sr=44100, n_fft=1024, hop_length=512, db=False, n_mels=50) :
    """
    Return the STFT magnitude and phase of the left and right channels of an audio file.

    file_path         : audio file to load; channel 0 is used as left, channel 1 as right
    sr                : sampling rate handed to librosa.load
    n_fft, hop_length : STFT parameters handed to librosa.stft
    db                : if True, magnitudes go through amplitude_to_db before rescaling
    n_mels            : not used by this function

    Returns ((left_mag, left_phase), (right_mag, right_phase)).
    Both magnitudes are rescaled with the LEFT channel's mean and standard deviation,
    phases are the angles given by np.angle, and the first frequency row (index 0)
    is dropped from every returned array.
    """
    stereo, sr = librosa.load(file_path, sr=sr, mono=False)   # keep the channels separate

    stft_kwargs = dict(n_fft=n_fft, hop_length=hop_length)
    left_mag, left_unit = librosa.magphase(librosa.stft(stereo[0], **stft_kwargs))
    right_mag, right_unit = librosa.magphase(librosa.stft(stereo[1], **stft_kwargs))

    if db :
        left_mag, right_mag = (librosa.core.amplitude_to_db(m) for m in (left_mag, right_mag))

    # The same offset and scale (taken from the left channel) are applied to both channels.
    mu, sigma = left_mag.mean(), left_mag.std()
    left_scaled = (left_mag - mu)/sigma
    right_scaled = (right_mag - mu)/sigma

    left = (left_scaled[1:, :], np.angle(left_unit)[1:, :])
    right = (right_scaled[1:, :], np.angle(right_unit)[1:, :])
    return (left, right)

In [4]:
def generatio_tensor_instances(array_2d, dest_path, seq_len, hop, label):
    """
    Cut a spectrogram into overlapping fixed-width instances and label each one.

    array_2d  : 2-D spectrogram indexed as [row, frame].
    dest_path : unused; kept so that existing callers keep working.
    seq_len   : number of frames in an instance (> 0).
    hop       : number of frames between the starts of consecutive instances (> 0).
    label     : sequence (ndarray or list) of 0s and 1s covering the same recording
                as array_2d.  Its length may differ from the number of frames; the
                window is mapped onto it proportionally.

    Returns (instances, labels):
      instances : ndarray of shape (n_instances, rows, seq_len, 1)
      labels    : list of n_instances ints; 1 when more than half of the label
                  values under the window are 1, otherwise 0.

    Raises ValueError when seq_len or hop is not positive, or when array_2d has
    too few frames to produce a single instance.
    """
    if seq_len <= 0:
        raise ValueError("seq_len must be positive, got {}".format(seq_len))
    if hop <= 0:
        # A non-positive hop never advances the window (previously an endless loop).
        raise ValueError("hop must be positive, got {}".format(hop))

    label = np.asarray(label)   # lets plain lists be sliced and averaged as well
    col_size = array_2d.shape[1]

    # Window starts j satisfy j <= col_size - (seq_len + 1).  This bound is kept
    # exactly as it was so instance counts do not change; note that a window
    # ending exactly on the last frame is therefore not produced.
    starts = range(0, col_size - seq_len, hop)
    if len(starts) == 0:
        raise ValueError(
            "array_2d has {} frames; more than seq_len={} are needed to build an instance"
            .format(col_size, seq_len))

    ratio = len(label)/col_size   # how many label points per frame
    threshold = 0.5               # if greater than the threshold, then speech

    stack_array = []    # 3-D windows, stacked into a 4-D tensor at the end
    label_array = []
    for j in starts:
        segment = label[int(j*ratio):int((j+seq_len)*ratio)]
        # An empty segment counts as non-speech (same outcome as the NaN mean, without the warning).
        is_speech = segment.size > 0 and segment.mean() > threshold
        label_array.append(1 if is_speech else 0)
        # Append a trailing channel axis so the instances can later be concatenated channel-wise.
        stack_array.append(array_2d[:, j:(j+seq_len), np.newaxis])

    return np.stack(stack_array, axis=0), label_array

In [5]:
no_samples = len(samples) 

# One entry per audio sample in each list.
mag_L_instances = []    # elements are ndarrays
mag_R_instances = []
phase_L_instances = []
phase_R_instances = []
label_instances = []    # elements are ndarrays of per-instance voice(1)/noise(0) labels

# Instances of 1.16 sec duration (100 frames), one every 0.116 sec (10 frames).
instance_frames, instance_hop = 100, 10

for i in range(no_samples):
    seg_path = samples_vad_seg[i]
    voice_noise_label = np.load(seg_path)
    # Decide by the real file extension.  Splitting on '/' does not isolate the
    # file name on Windows, and a substring test for 'npy' can match other parts of the path.
    if os.path.splitext(seg_path)[1].lower() == '.npy':
        label = voice_noise_label[0]        # use the left channel label.  this takes care of the 0 degree problem
    else:                                   # npz file
        label = voice_noise_label["label"]
    (mag_L, phase_L), (mag_R, phase_R) = mag_phase(samples[i], db=True)

    voice_dest_path = os.path.join("mag", "Left", str(i))
    mag_L_instances_sub, _ = generatio_tensor_instances(
        mag_L, voice_dest_path, instance_frames, instance_hop, label)

    voice_dest_path = os.path.join("mag", "Right", str(i))
    mag_R_instances_sub, _ = generatio_tensor_instances(
        mag_R, voice_dest_path, instance_frames, instance_hop, label)

    voice_dest_path = os.path.join("phase", "Left", str(i))
    phase_L_instances_sub, _ = generatio_tensor_instances(
        phase_L, voice_dest_path, instance_frames, instance_hop, label)

    # The labels are taken from this last call, as before.
    voice_dest_path = os.path.join("phase", "Right", str(i))
    phase_R_instances_sub, label_sub = generatio_tensor_instances(
        phase_R, voice_dest_path, instance_frames, instance_hop, label)

    mag_L_instances.append(mag_L_instances_sub)
    mag_R_instances.append(mag_R_instances_sub)
    phase_L_instances.append(phase_L_instances_sub)
    phase_R_instances.append(phase_R_instances_sub)

    label_instances.append(np.array(label_sub))


print(len(mag_L_instances), len(phase_R_instances), len(label_instances))

mag_L_instances[0].shape, phase_R_instances[0].shape, label_instances[0].shape
# the first sample produced 15 instances.  

50 50 50


((15, 512, 100, 1), (15, 512, 100, 1), (15,))

In [6]:
# For every sample, join the four single-channel tensors along the last axis
# in the order: left magnitude, left phase, right magnitude, right phase.
stacked_instances = [
    np.concatenate(
        [mag_L_instances[k], phase_L_instances[k], mag_R_instances[k], phase_R_instances[k]],
        axis=-1,
    )
    for k in range(no_samples)
]

len(stacked_instances), stacked_instances[0].shape    # L, R magnitudes and phases are stacked.

(50, (15, 512, 100, 4))

#### noise와 voice 방향에 따라 labeling
- noise : 0                 
- 0도 : 1                
- 60도 : 2                
- 120도 : 3             
- 180도 : 4                  

In [7]:
import copy
# Keep an independent copy of the binary voice/noise labels: the direction
# relabeling that follows overwrites entries of `label_instances` in place.
vad_label_instances = copy.deepcopy(label_instances)

In [8]:
# Voice instances (label 1) are relabeled according to the sample's position in
# the sorted sample list: 12~24 -> class 2 (60 deg), 25~37 -> class 3 (120 deg),
# 38~49 -> class 4 (180 deg).  Samples 0~11 keep label 1 (0 deg); noise stays 0.
direction_class_by_samples = (
    (range(12, 25), 2),
    (range(25, 38), 3),
    (range(38, 50), 4),
)

for sample_range, direction_class in direction_class_by_samples:
    for i in sample_range:
        sample_labels = label_instances[i]
        # In-place masked assignment: only entries equal to 1 are changed.
        sample_labels[sample_labels == 1] = direction_class
            
# label_instances

We have 822 instances. And we have labeled them into 5 classes.  Let's see how those labels are distributed.

- Note that the indices of `stacked_instances` map to classes as follows: 0~11 (Class-1), 12~24 (Class-2), 25~37 (Class-3), 38~49 (Class-4); Class-0 is assigned to noise.  


In [73]:
def _as_object_array(items):
    """Pack `items` into a 1-D object ndarray holding one element per item.

    The samples have different numbers of instances, so a regular ndarray cannot
    hold them.  Filling an object array element by element gives shape
    (len(items),) in every case, where np.array(items) raises on ragged input
    in recent NumPy and would build an N-D array if all shapes happened to match.
    """
    packed = np.empty(len(items), dtype=object)
    for k, item in enumerate(items):
        packed[k] = item
    return packed


X = _as_object_array(stacked_instances)   # per-sample input tensors

y = _as_object_array(label_instances)     # per-sample direction labels (0 = noise)

y_vad = _as_object_array(vad_label_instances)   # per-sample binary voice/noise labels

print(X.shape, y.shape, y_vad.shape )


(50,) (50,) (50,)


In [74]:
def predict_utterances(model_path, samples_instances, samples_indices) :
    """
    Predict a class label for every instance of the selected utterance samples.

    model_path        : path of a saved Keras model (loaded with keras.models.load_model).
                        The model is called with two inputs, [left, right].
    samples_instances : sequence (object ndarray or list) holding one ndarray per sample.
                        Each sample has the shape: (instances_in_sample, 512, 100, 4)
    samples_indices   : indices (or a boolean mask) of the samples to consider
                        in 'samples_instances'

    Returns a 1-D object ndarray with one entry per selected sample, in the order
    of samples_indices.  Each entry is the vector of arg-max class indices, one
    per instance of that sample.
    """
    model = keras.models.load_model(model_path)

    indices = np.asarray(samples_indices).ravel()
    if indices.dtype == bool:
        indices = np.flatnonzero(indices)

    # An object array is filled slot by slot because samples have different
    # numbers of instances; np.array() on such a ragged list fails in recent NumPy.
    labels_pred = np.empty(len(indices), dtype=object)

    # Samples are fetched one at a time.  The former
    # `samples_instances[[samples_indices]]` relied on list-in-list indexing,
    # which current NumPy treats as an index array with an extra leading axis.
    for pos, sample_idx in enumerate(indices) :
        sample = samples_instances[sample_idx]
        x_L = sample[:, :, :, :2]   # first two channels  -> first model input
        x_R = sample[:, :, :, 2:]   # remaining channels  -> second model input

        labels_pred[pos] = np.argmax(model.predict([x_L, x_R]), axis=1)

    return labels_pred

In [75]:
# Paths of the saved models (*.h5) found directly under ./models, in sorted order.
models = glob.glob(os.path.join('.', 'models', '*.h5'))
models.sort()

In [76]:
# Evaluate on every sample, in order.
# Alternative (random hold-out): build `idx` with np.random.permutation(no_samples)
# and use its last 10 entries (idx[-10:]) as `test_idx`.
idx = np.arange(no_samples)
test_idx = idx
test_idx

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49])

In [19]:
# Per-instance class predictions for every sample selected by `test_idx`.
predictions=predict_utterances(models[0], X, test_idx)  # test against the first model 

In [81]:
# Compare ground truth and prediction, instance by instance, for one test sample.
sample_index = 33
true_labels = y[test_idx][sample_index]
pred_labels = predictions[sample_index]
print(true_labels.shape, pred_labels.shape)
list(zip(true_labels, pred_labels))    # (true, predicted) pairs

(25,) (25,)


[(3, 4),
 (3, 4),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 0),
 (3, 0),
 (3, 0),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 3),
 (3, 0)]

In [82]:
def class_instances_count(sample_instances_labels) : 
    """
    Count how often each class label occurs among the given instance labels.

    sample_instances_labels : class labels for the instances
    returns a list of (class_label, count) pairs ordered by descending count;
    labels with equal counts stay in ascending label order.
    """
    labels, freqs = np.unique(sample_instances_labels, return_counts=True)
    # np.unique yields ascending labels; the stable sort keeps that order for ties.
    tally = list(zip(labels, freqs))
    tally.sort(key=lambda pair: pair[1], reverse=True)
    return tally

In [83]:
# Vote tally for the sample at `sample_index`: (class_label, count) pairs, most frequent first.
class_instances_count(predictions[sample_index])

[(3, 19), (0, 4), (4, 2)]

## Ensemble of models
- For each model, do (instance) predictions for all the sample files

In [131]:
# Pool the per-instance predictions of all models, sample by sample.
# `ensemble_pred` is a plain list holding one 1-D label array per test sample.
# The earlier in-place `ensemble_pred[k] = <longer array>` only worked when
# predict_utterances returned an object-dtype array and failed for a regular
# 2-D result (all samples having the same number of instances).
ensemble_pred = list(predict_utterances(models[0], X, test_idx))

for model in models[1:]:
    predictions = predict_utterances(model, X, test_idx)
    # Append this model's votes to the votes pooled so far for each sample.
    ensemble_pred = [np.concatenate([pooled, extra], axis=-1)
                     for pooled, extra in zip(ensemble_pred, predictions)]

0~11(Class-1), 12~24(Class-2), 25~37(Class-3), 38~49(Class-4) and Class-0 is assigned to the noise

In [148]:
# Sample-level decision: the most-voted non-noise class among the pooled instance votes.
ensemble_predictions = []

for instances_labels in ensemble_pred :
    # class_instances_count orders its (label, count) pairs by descending count,
    # so the first label different from 0 is the most-voted direction class.
    ranked_labels = (class_label for class_label, _ in class_instances_count(instances_labels))
    # Fall back to 0 (noise) when every vote is noise.  Without the fallback
    # nothing was appended for such a sample and all later entries shifted out
    # of alignment with their sample positions.
    ensemble_predictions.append(next((lab for lab in ranked_labels if lab != 0), 0))

# (position in the test set, predicted class); no hard-coded sample count.
list(enumerate(ensemble_predictions))

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 1),
 (10, 1),
 (11, 1),
 (12, 2),
 (13, 2),
 (14, 2),
 (15, 2),
 (16, 2),
 (17, 2),
 (18, 2),
 (19, 2),
 (20, 2),
 (21, 2),
 (22, 2),
 (23, 2),
 (24, 2),
 (25, 3),
 (26, 3),
 (27, 3),
 (28, 3),
 (29, 3),
 (30, 3),
 (31, 3),
 (32, 3),
 (33, 3),
 (34, 3),
 (35, 3),
 (36, 3),
 (37, 3),
 (38, 4),
 (39, 4),
 (40, 4),
 (41, 4),
 (42, 4),
 (43, 4),
 (44, 4),
 (45, 4),
 (46, 4),
 (47, 4),
 (48, 4),
 (49, 4)]