# Predict

In [1]:
# Common imports
import numpy as np
import pandas as pd
import os, sys, glob  
import tensorflow as tf

import librosa
import librosa.display

# To plot pretty figures
# import matplotlib
# import matplotlib.pyplot as plt
# %matplotlib inline
# plt.style.use('ggplot')
# plt.rcParams['axes.labelsize'] = 14
# plt.rcParams['xtick.labelsize'] = 12
# plt.rcParams['ytick.labelsize'] = 12

def reset_graph(seed=42):
    tf.reset_default_graph() 
    tf.set_random_seed(seed)
    np.random.seed(seed)
    
def reset_keras_session(seed=42):
    tf.keras.backend.clear_session()
    tf.set_random_seed(seed)
    np.random.seed(seed)

import warnings
warnings.filterwarnings("ignore")   # To rid of warnings 

if sys.platform == 'win32':   # if windows 
    home = os.path.join('D:', os.sep, 'hblee')   # d:\hblee
    data_repo = os.path.join('D:', os.sep, 'Data_Repo_Win')   # d:\Data_Repo_Win
elif sys.platform == "linux" or sys.platform == "linux2" :    # if linux 
    home = os.path.expanduser("~")   # home = os.getenv("HOME")
    data_repo = os.path.join(home, 'Data_Repo')
    
#sys.path.append(os.path.join(home, 'Google_Sync', 'Dev_Exercise', 'utils'))
#from tf_utils import *
    
from tensorflow import keras 
keras.__version__, tf.VERSION

('2.2.4-tf', '1.13.1')

In [3]:
'''
samples : audio samples(files).  50 of them 
samples_vad_seg : samples segmented as to voice region (1) and non-voice region (0) 

samples and samples_vad_seg should be aligned.  
'''

sample_data_repo = os.path.join(home, 'Downloads', 'audio')
samples = glob.glob(os.path.join(sample_data_repo, '**', '*wav'), recursive=True)
samples = sorted(samples)   # sort the samples

sample_vad_seg_repo = os.path.join(home, 'Downloads', 'binary_segment')   # 적절하게 변경 필요 
samples_vad_seg = glob.glob(os.path.join(sample_vad_seg_repo, '**', '*[npy|npz]'), recursive=True)
samples_vad_seg = sorted(samples_vad_seg)   

#  Checking 
print('samples: ', len(samples), samples[25])
print('samples segmented: ', len(samples_vad_seg), samples_vad_seg[25])    

samples:  315 /home/user4/Downloads/audio/100도/output31.wav
samples segmented:  315 /home/user4/Downloads/binary_segment/100도/output31.npz


In [4]:
def mag_phase(file_path, sr=48000, n_fft=1024, hop_length=512, db=False, n_mels=50) :
    """
    stft의 magnitude와 phase 리턴
    """
    audio, sr = librosa.load(file_path, sr=sr, mono=False)   # 원래의 sr, stereo
    DL = librosa.stft(audio[0], n_fft=n_fft, hop_length=hop_length)
    DL_mag, DL_phase = librosa.magphase(DL)
    
    DR = librosa.stft(audio[1], n_fft=n_fft, hop_length=hop_length)
    DR_mag, DR_phase = librosa.magphase(DR)
    
    if db :
        DL_mag = librosa.core.amplitude_to_db(DL_mag)
        DR_mag = librosa.core.amplitude_to_db(DR_mag)
     
    # rescale the right magnitudes w.r.t left channel magnitude 
    avg = DL_mag.mean()     
    stdv = DL_mag.std()
    DL_mag = (DL_mag - avg)/stdv
    DR_mag = (DR_mag - avg)/stdv
    
    # return( (DL_mag, np.angle(DL_phase)), (DR_mag, np.angle(DR_phase)) )
    return( (DL_mag[1:, :], np.angle(DL_phase)[1:, :]), (DR_mag[1:, :], np.angle(DR_phase)[1:, :]) )

In [5]:
def generatio_tensor_instances(array_2d, dest_path, seq_len, hop, label):
    """
    array_2d : spectrogram.
    seq_len : number of frames in a instance
    label : 0 and 1's. The same length as original numpy vector 
    """
    row_size, col_size = array_2d.shape[0], array_2d.shape[1]
    ratio = len(label)/col_size  # ratio : how many data points per frame 
    stack_array = []    # 4D tensor that will hold the instances
    label_array = []

    j=0
    while j <= (col_size - (seq_len+1)): 
        context_frame = array_2d[:, j:(j+seq_len)]
        # seg_label = round( label[int(j*ratio):int((j+seq_len)*ratio)].mean() ) 
        threshold = 0.5  # if greater than the threshold, then speech 
        seg_label = 1 if label[int(j*ratio):int((j+seq_len)*ratio)].mean() > threshold else 0

#         # store the instances
#         dest_path_ext = ''.join([dest_path, '_', str(j)])
#         os.makedirs(os.path.dirname(dest_path_ext), exist_ok=True)

#         np.savez(dest_path_ext, spectrogram = context_frame,
#                  label=seg_label)
        
        stack_array.append(context_frame[:,:,np.newaxis])   # make context_frame to 3d tensor & append 
        label_array.append(seg_label)
            
        j = j+hop
        
    return np.stack(stack_array, axis=0), label_array

In [6]:
no_samples = len(samples) 

mag_L_instances = []    # elements are ndarrays
mag_R_instances = []
phase_L_instances = []
phase_R_instances = []
label_instances = []         # elements are lists

for i in range(0, no_samples):
    voice_noise_label = np.load(samples_vad_seg[i])
    if('npy' in samples_vad_seg[i].split('/')[-1]):
        label = voice_noise_label[0]        # use the left channel label.  this take care of 0 degree problem
    else:                                   # npz file
        label = voice_noise_label["label"]    
    (mag_L, phase_L), (mag_R, phase_R) = mag_phase(samples[i], db=True)
    
    # generate instances with 1.16 sec duration (100 frames), at every 0.116 sec apart (10 hops)
    voice_dest_path = os.path.join("mag", "Left", str(i))
    mag_L_instances_sub, _ = generatio_tensor_instances(mag_L, voice_dest_path, 100, 10, label)
    
    voice_dest_path = os.path.join("mag", "Right", str(i))
    mag_R_instances_sub, _ = generatio_tensor_instances(mag_R, voice_dest_path, 100, 10, label)
    
    voice_dest_path = os.path.join("phase", "Left", str(i))
    phase_L_instances_sub, _ = generatio_tensor_instances(phase_L, voice_dest_path, 100, 10, label)
    
    voice_dest_path = os.path.join("phase", "Right", str(i))
    phase_R_instances_sub, label_sub = generatio_tensor_instances(phase_R, voice_dest_path, 100, 10, label)
    
    mag_L_instances.append(mag_L_instances_sub)
    mag_R_instances.append(mag_R_instances_sub)
    phase_L_instances.append(phase_L_instances_sub)
    phase_R_instances.append(phase_R_instances_sub)
    
    label_instances.append(np.array(label_sub))
    

print(len(mag_L_instances), len(phase_R_instances), len(label_instances))

mag_L_instances[0].shape, phase_R_instances[0].shape, label_instances[0].shape


315 315 315


((28, 512, 100, 1), (28, 512, 100, 1), (28,))

In [7]:
stacked_instances = []

for i in range(0, no_samples):
    concat_tensor = np.concatenate([mag_L_instances[i], phase_L_instances[i], 
                                    mag_R_instances[i], phase_R_instances[i]], axis = -1)
    stacked_instances.append(concat_tensor)
    
len(stacked_instances), stacked_instances[0].shape    # L, R magnitudes and phases are stacked.

(315, (28, 512, 100, 4))

In [8]:
import copy
vad_label_instances = copy.deepcopy(label_instances)

In [9]:
X = np.array(stacked_instances)  # transform the list to ndarray

y = np.array(label_instances)

y_vad = np.array(vad_label_instances)

print(X.shape, y.shape, y_vad.shape )


(315,) (315,) (315,)


In [10]:
def predict_utterances(model_path, samples_instances, samples_indices) :
    """
    samples_instances : ndarray holding 'samples' number of sample representation in ndarrays. 
                        Each sample has the shape: (instances_in_sample, 512, 100, 4)
    samples_indices : indices of the samples to consider in 'samples_instances'
    """
    labels_pred = [] 
    
    model = keras.models.load_model(model_path)
    
    X = samples_instances[[samples_indices]]
    
    for i, sample in enumerate(X) :   # for the instances in each utterance sample 
        x_L = sample[:, :, :, :2]
        x_R = sample[:, :, :, 2:]
        
        labels_pred.append( np.argmax(model.predict([x_L, x_R]), axis=1) )
        
    return np.array(labels_pred)

In [12]:
models = sorted(glob.glob(os.path.join('.', '*.h5')))

In [13]:
models

['./Best_SSL_STFT_Siamese_2DConv_1DConv.h5',
 './Best_SSL_STFT_Siamese_2DConv_RNN.h5']

In [14]:
idx = np.arange(0, no_samples)
# idx = np.random.permutation(no_samples)
# test_idx = idx[-10:]
test_idx = idx
test_idx

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [15]:
predictions=predict_utterances(models[0], X, test_idx)  # test against the first model 

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.


In [16]:
sample_index = 33
print(y[test_idx][sample_index].shape, predictions[sample_index].shape)
list(zip(y[test_idx][sample_index], predictions[sample_index]))

(28,) (28,)


[(0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (1, 0),
 (1, 0),
 (1, 0),
 (1, 0),
 (1, 0),
 (1, 0),
 (1, 0),
 (1, 0),
 (1, 0),
 (1, 0),
 (1, 0),
 (1, 0),
 (1, 0),
 (1, 0),
 (1, 0),
 (1, 0),
 (1, 0),
 (1, 0),
 (1, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0)]

In [17]:
def class_instances_count(sample_instances_labels) : 
    """
    sample_instances_labels : class labels for the instances
    returns a list where elements are : (class_label, count)
    """
    # sample_instances_labels : class labels for the instances 
    import operator
    unique, counts = np.unique(sample_instances_labels, return_counts=True)
    dict_temp = dict(zip(unique, counts))
    return sorted(dict_temp.items(), key=operator.itemgetter(1), reverse=True)    

In [18]:
class_instances_count(predictions[sample_index])

[(0, 28)]

ensemble 

In [19]:
ensemble_pred = predict_utterances(models[0], X, test_idx)

for model in models[1: ] :
    predictions=predict_utterances(model, X, test_idx)
    for sample_ind in range(0, len(test_idx)):
        ensemble_pred[sample_ind] = np.concatenate([ensemble_pred[sample_ind], predictions[sample_ind]], 
                                                   axis=-1)

In [24]:
ensemble_predictions = []

for instances_labels in ensemble_pred :
    for class_label, class_count in class_instances_count(instances_labels) :
            ensemble_predictions.append(class_label)

            

list(zip(range(0, no_samples), ensemble_predictions))    

[(0, 0),
 (1, 8),
 (2, 0),
 (3, 8),
 (4, 0),
 (5, 8),
 (6, 0),
 (7, 8),
 (8, 0),
 (9, 8),
 (10, 0),
 (11, 8),
 (12, 0),
 (13, 8),
 (14, 0),
 (15, 8),
 (16, 0),
 (17, 8),
 (18, 0),
 (19, 8),
 (20, 0),
 (21, 8),
 (22, 0),
 (23, 8),
 (24, 0),
 (25, 8),
 (26, 0),
 (27, 8),
 (28, 0),
 (29, 8),
 (30, 0),
 (31, 8),
 (32, 0),
 (33, 8),
 (34, 0),
 (35, 8),
 (36, 0),
 (37, 0),
 (38, 8),
 (39, 0),
 (40, 8),
 (41, 0),
 (42, 8),
 (43, 0),
 (44, 8),
 (45, 0),
 (46, 8),
 (47, 0),
 (48, 8),
 (49, 0),
 (50, 8),
 (51, 0),
 (52, 8),
 (53, 0),
 (54, 8),
 (55, 0),
 (56, 8),
 (57, 0),
 (58, 8),
 (59, 0),
 (60, 8),
 (61, 0),
 (62, 8),
 (63, 0),
 (64, 8),
 (65, 0),
 (66, 8),
 (67, 0),
 (68, 8),
 (69, 0),
 (70, 8),
 (71, 0),
 (72, 8),
 (73, 0),
 (74, 8),
 (75, 0),
 (76, 8),
 (77, 0),
 (78, 8),
 (79, 0),
 (80, 8),
 (81, 0),
 (82, 8),
 (83, 0),
 (84, 0),
 (85, 8),
 (86, 0),
 (87, 8),
 (88, 0),
 (89, 8),
 (90, 0),
 (91, 8),
 (92, 0),
 (93, 8),
 (94, 0),
 (95, 8),
 (96, 0),
 (97, 8),
 (98, 0),
 (99, 8),
 (100, 0),

# making .json file

In [27]:
import json
from collections import OrderedDict

In [33]:
file_data = OrderedDict()
file_data['track3_results'] = []
ids = []
for i in range(no_samples) :
    if (ensemble_predictions[i] == 0) :
        ids.append({"id":i+1, "angle":-1})
        
    elif (ensemble_predictions[i] == 1) :
        ids.append({"id":i+1, "angle":0})
        
    elif (ensemble_predictions[i] == 2) :
        ids.append({"id":i+1, "angle":20})
        
    elif (ensemble_predictions[i] == 3) :
        ids.append({"id":i+1, "angle":40})
        
    elif (ensemble_predictions[i] == 4) :
        ids.append({"id":i+1, "angle":60})
    
    elif (ensemble_predictions[i] == 5):
        ids.append({"id":i+1, "angle":80})
    
    elif (ensemble_predictions[i] == 6) :
        ids.append({"id":i+1, "angle":100})
        
    elif (ensemble_predictions[i] == 7) :
        ids.append({"id":i+1, "angle":120})
        
    elif (ensemble_predictions[i] == 8) :
        ids.append({"id":i+1, "angle":140})
        
    elif (ensemble_predictions[i] == 9) :
        ids.append({"id":i+1, "angle":160})
        
    elif (ensemble_predictions[i] == 10) :
        ids.append({"id":i+1, "angle":180})
 
    
for i in range(no_samples) :
    file_data['track3_results'].append(ids[i])

    
# check json file
# print(json.dumps(file_data, ensure_ascii=False, indent='\t'))

{
	"track3_results": [
		{
			"id": 1,
			"angle": -1
		},
		{
			"id": 2,
			"angle": 140
		},
		{
			"id": 3,
			"angle": -1
		},
		{
			"id": 4,
			"angle": 140
		},
		{
			"id": 5,
			"angle": -1
		},
		{
			"id": 6,
			"angle": 140
		},
		{
			"id": 7,
			"angle": -1
		},
		{
			"id": 8,
			"angle": 140
		},
		{
			"id": 9,
			"angle": -1
		},
		{
			"id": 10,
			"angle": 140
		},
		{
			"id": 11,
			"angle": -1
		},
		{
			"id": 12,
			"angle": 140
		},
		{
			"id": 13,
			"angle": -1
		},
		{
			"id": 14,
			"angle": 140
		},
		{
			"id": 15,
			"angle": -1
		},
		{
			"id": 16,
			"angle": 140
		},
		{
			"id": 17,
			"angle": -1
		},
		{
			"id": 18,
			"angle": 140
		},
		{
			"id": 19,
			"angle": -1
		},
		{
			"id": 20,
			"angle": 140
		},
		{
			"id": 21,
			"angle": -1
		},
		{
			"id": 22,
			"angle": 140
		},
		{
			"id": 23,
			"angle": -1
		},
		{
			"id": 24,
			"angle": 140
		},
		{
			"id": 25,
			"angle": -1
		},
		{
			"id": 26,
			"angle": 140
		},
		{
			"id"

# saving json file

In [34]:
with open('results.json', 'w', encoding='utf-8') as make_file :
    json.dump(file_data, make_file, ensure_ascii=False, indent='\t')