In [26]:
import keras
import os
import sys
import logging
from keras.layers import Dense, BatchNormalization, Activation, Dropout, InputLayer,Conv2D, MaxPool2D, Flatten
from keras.models import Sequential
from keras_diagram import ascii
from keras.models import Model
import keras.backend as K
import numpy as np 
import matplotlib.pyplot as plt 
import matplotlib

%matplotlib inline
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn import neural_network
from sklearn.model_selection import cross_val_score
import random

from sample_data_generator import SampleDataGenerator
import argparse

from deepspeaker.features.feature_extractors import LogMelFeatureExtractor, MfccFeatureExtractor

np.random.seed(42)
random.seed(42)

In [88]:
def baseline_cnn(shape, num_speakers=10, dropout=0.5):
    model = Sequential([
        InputLayer(input_shape=shape),
        Conv2D(filters=16, kernel_size=(3, 3), padding='same'),
        BatchNormalization(),
        Activation(activation='relu'),
        MaxPool2D(pool_size=(2, 2)),
        Dropout(0.5),

        Conv2D(filters=32, kernel_size=(3, 3), padding='same'),
        BatchNormalization(),
        Activation(activation='relu'),
        MaxPool2D(pool_size=(2, 2)),
        Dropout(0.5),

        Conv2D(filters=64, kernel_size=(3, 3), padding='same'),
        BatchNormalization(),
        Activation(activation='relu'),
        MaxPool2D(pool_size=(2, 2)),
        Dropout(0.5),
        Flatten(),

        Dense(512),
        BatchNormalization(),
        Activation(activation='relu'),
        Dropout(0.5),

        Dense(num_speakers, activation='softmax')
    ])
    print(ascii(model))
    return model

In [101]:
model = baseline_cnn((64,64,1), 10)
model.load_weights(
    '/home/jugs/Documents/SR Work/DeepSpeaker-dev-SEERNET3.x/deepspeaker/trainers/runs_dir/smallDSModel/models/weights.039-0.611.hdf5'
)

         InputLayer (None, 64, 64, 1)  
             Conv2D (None, 64, 64, 16) 
 BatchNormalization (None, 64, 64, 16) 
               Relu (None, 64, 64, 16) 
       MaxPooling2D (None, 32, 32, 16) 
            Dropout (None, 32, 32, 16) 
             Conv2D (None, 32, 32, 32) 
 BatchNormalization (None, 32, 32, 32) 
               Relu (None, 32, 32, 32) 
       MaxPooling2D (None, 16, 16, 32) 
            Dropout (None, 16, 16, 32) 
             Conv2D (None, 16, 16, 64) 
 BatchNormalization (None, 16, 16, 64) 
               Relu (None, 16, 16, 64) 
       MaxPooling2D (None, 8, 8, 64)   
            Dropout (None, 8, 8, 64)   
            Flatten (None, 4096)       
              Dense (None, 512)        
 BatchNormalization (None, 512)        
               Relu (None, 512)        
            Dropout (None, 512)        
              Dense (None, 10)         



In [90]:
def get_intermediate_output(model, intermediate_layer_num=4):
    return Model(inputs=model.input, outputs=model.layers[intermediate_layer_num].output)

In [91]:
# get intermediate layer output
intermediate_model_1 = get_intermediate_output(model, 12)

In [92]:
import librosa

def resample(y, sr, tr):
    y = librosa.core.resample(y=y, orig_sr=sr, target_sr=tr)
    return y

In [93]:
def _int64_feature(value):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))


def _bytes_feature(value):
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

In [None]:
def create_tfrec(x):
    writer = tf.python_io.TFRecordWriter("tfrec")
    pbar = tqdm(total=len(files), desc="Creating: {:>50}".format(filename))
    p = Pool(8)
    for i, examples in enumerate(p.imap_unordered(self.get_tfrecords, files)):
        for example in examples:
            cnt += 1
            writer.write(example)
        pbar.update()
    pbar.close()
    writer.close()
    return cnt

In [102]:
from pydub import AudioSegment
import tensorflow as tf

def featurize(x):
    features = []
    feature_extractor = LogMelFeatureExtractor(mels=64, context=64, log_mel=True, stride=3)
    audio = AudioSegment.from_file(x)
    y, sr = np.array(audio.get_array_of_samples(), dtype=float), float(audio.frame_rate)
    # y, sr = resample(y, sr, 16000.0)
    x_feat = feature_extractor(y=y, sr=sr)

    for j in range(x_feat[0].shape[0]):
        feature = tf.train.Example(features=tf.train.Features(feature={
            'x': _bytes_feature(x_feat[0][j].tostring()),
            't_dim': _int64_feature(x_feat[0][j].shape[0]),
        }))
        features.append(feature.SerializeToString())
    return features

In [105]:
# load data of speakers
x = '/home/jugs/Documents/SR Work/DeepSpeaker-dev-SEERNET3.x/deepspeaker/data_generators/LibriSpeechSamples/train-clean-100/26/496/26-496-0000.wav'
x = create_tfrec(x)
#x = featurize(x)
print(x)

model.predict(x)

'''
parser = argparse.ArgumentParser(description='get my data for eval.. ')
    parser.add_argument('file_1', type=str, help="Train TfRecord file path")
    parser.add_argument('file_2', type=str, help="Validation TfRecord file path")
    parser.add_argument('file_3', type=str, help="Test TfRecord file path")
    parser.add_argument('--batch_size', type=int, default=64, help="Batch size")
    #parser.add_argument('--model_type', type=str, default='dnn', help="Type of model to run",
    #                    choices=['dnn', 'rnn', 'cnn'])
    #parser.add_argument('--epochs', type=int, default=100, help="Number of epochs to train")
    parser.add_argument('--num_threads', type=int, default=4, help="Number of threads for enqueue operation")
    args = parser.parse_args()
'''

'''
data_generator = SampleDataGenerator(
        train_filenames=args.train_tfrecord_file,
        #validation_filenames=args.validation_tfrecord_file,
        #test_filenames=args.test_tfrecord_file,
        batch_size=args.batch_size,
        model_type=args.model_type,
        num_threads=args.num_threads
    )

x = data_generator.sample_data[0]
#y = np.load(open('/Users/venkatesh/my-github-repos/DeepSpeaker/y.npy'))
print(x.shape, y.shape)
'''

[b"\n\x9f\x02\n\x0e\n\x05t_dim\x12\x05\x1a\x03\n\x01@\n\x8c\x02\n\x01x\x12\x86\x02\n\x83\x02\n\x80\x02AZ\x8c@\x06\x1f%@o\xa3O@\x14\xad5@\x8711@\x94o\xf6>\xbf=F@h\x86'@{\xe4\x05@\xdf\x18\xeb>#\xa7\x10@\x94\xedV@H\xdd\x99@\n\x1f\xa9@\xaf\x98\x84@\x03\x8c\x8c@S\x04\xfb?\xec\xe3o@\xe5\x00\xa2@\xf6\x0c\x96@\xb0<\x96@\xb2}t@\x90\xd6^@\xbd\x86\x16@\xbfSh@\xde\x9c\x87@\xa3t\x83@\x9dKo@!n\xa0@\xf0\xa5\x9b@\x94\xcbS@\x97\xfb%@\xc03\x89@\xe6\xc9u@\xdd\xd1|@\x91\x04|@\x13\r\x86@\xb7-s@(a\x8e@\xbe7\x9f@\xce\x85\x98@w\n\x9c@\xc1\xe3\x99@T\xa0\xa6@\x97\x07\xb0@s_\xa0@LS\x9c@\xab\xbc\xb4@lE\xd3@0\xc6\xb6@%\x13\xa3@;=\xa9@u\n\xbf@Rf\xb8@\x87a\xac@\xe0[\xa8@6\xc7\x9f@\xeb\x8a\xaf@\xb51\xa9@b\xae\xb5@4\xcb\x99@R\xdf\xac@Z\xbb\xc4@\x19H\xb2@", b"\n\x9f\x02\n\x0e\n\x05t_dim\x12\x05\x1a\x03\n\x01@\n\x8c\x02\n\x01x\x12\x86\x02\n\x83\x02\n\x80\x02\xb0d/@b\x9e|@\x16\x13G@/\x01S@;>:@!\x965@|@Q@R26@\xb6n\xbf?\xbax\xae?k\xae=@E_3@\xae\x8b\x96@C\xc9\xa9@\xca\xc1\x99@\xe6\xf6\x81@#\xaa0@\xb9b\x80@\xfe\xef\x9f@>\xa1

ValueError: Error when checking : expected input_12 to have 4 dimensions, but got array with shape (64, 1)