In [1]:
import warnings
# Suppress every Python warning for the rest of the session.
# This keeps cell output short, but it also hides deprecation notices.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import os
import pickle
from itertools import combinations
import random

from keras.models import Model, Input, Sequential, load_model
from keras.applications.vgg16 import VGG16
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras import backend as K
from keras.layers import Input, Conv2D, Lambda, merge, Dense, Flatten, MaxPooling2D, ZeroPadding2D, Dropout
from keras.optimizers import SGD,Adam

from src.data import process_audio
from src.data import display_audio



Using TensorFlow backend.


In [3]:
# Pick up edits to imported project modules live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Siamese_CNN model with 10 speakers and 3 audio files per speaker.

In [149]:
# Get audio files for 10 speakers
# NOTE(review): absolute, machine-specific path — TODO make it configurable before sharing the notebook.
path = '/Users/greenapple/project5/data/raw/LibriSpeech/dev-clean' # LibriSpeech dev-clean folder used as the data source for this section
# presumably returns a collection of audio file paths for the requested number of speakers — verify in src/data/process_audio.py
files = process_audio.files_for_modeling_3_audios(path, 10)

# Number of files for analysis
len(files)

21

In [150]:
# Extract features: one observation (features + speaker label) per audio file
observations = [process_audio.one_observation(audio_file) for audio_file in files]

# Split the (features, label) pairs into two parallel lists
Xdb_3D_list = [features for features, _ in observations]
id_list = [label for _, label in observations]

# One row per audio file
data = pd.DataFrame()
data['speaker_id'] = id_list
data['features'] = Xdb_3D_list

In [151]:
# Process features for siamese model: build every unordered pair of observations
siam_features = list(combinations(data.features, 2))
siam_targets_tup = list(combinations(data.speaker_id, 2))

# Target is 1 when both members of a pair share a speaker id, otherwise 0
siam_targets = [
    1 if first_id == second_id else 0
    for first_id, second_id in siam_targets_tup
]

# One row per pair
siam_data = pd.DataFrame()
siam_data['siam_targets'] = siam_targets
siam_data['siam_features'] = siam_features

In [152]:
# Shape of the pair table: one row per pair, two columns (target and features)
siam_data.shape

(210, 2)

In [153]:
# Reshape features and target for modeling
X = np.array(siam_data.siam_features.tolist())
y = np.array(siam_data.siam_targets.tolist())

In [154]:
# One target per pair
y.shape

(210,)

In [155]:
# Axis 0 indexes the pairs, axis 1 the two members of each pair (sliced as X[:, 0] / X[:, 1] when fitting)
X.shape

(210, 2, 224, 224, 3)

In [39]:
# Siamese model 1: two weight-sharing CNN branches followed by an L1-distance head

input_shape = (224, 224, 3)

# One entry per convolutional stage: (number of filters, number of conv layers)
conv_stages = [(64, 2), (128, 2), (256, 3), (512, 3), (512, 3)]

# Build CNN branch stage by stage; every stage ends with max-pooling and dropout
CNN = Sequential(name='01_CNN')
for stage_index, (n_filters, n_convs) in enumerate(conv_stages):
    for conv_index in range(n_convs):
        if stage_index == 0 and conv_index == 0:
            # The very first layer declares the input shape of the branch
            CNN.add(ZeroPadding2D((1, 1), input_shape=input_shape))
        else:
            CNN.add(ZeroPadding2D((1, 1)))
        CNN.add(Conv2D(n_filters, (3, 3), activation='relu'))
    CNN.add(MaxPooling2D((2, 2), strides=(2, 2)))
    CNN.add(Dropout(0.2))

# Encoding layer on top of the flattened feature maps
CNN.add(Flatten())
CNN.add(Dense(4096, activation='sigmoid'))

# Define tensors for the two input audios
left_input = Input(input_shape)
right_input = Input(input_shape)

# Both inputs pass through the same CNN instance, so the branches share weights
encoded_l = CNN(left_input)
encoded_r = CNN(right_input)

# Customized layer: element-wise absolute difference between the two encodings.
# Kept as a lambda so the layer is serialized the same way when the model is saved.
L1_layer = Lambda(lambda tensors: K.abs(tensors[0] - tensors[1]))
L1_distance = L1_layer([encoded_l, encoded_r])

# Classify the pair into "same speaker" = 1 or "not the same" = 0
classification = Dense(1, activation='sigmoid')(L1_distance)

# Siamese model
siamese_model_1 = Model(inputs=[left_input, right_input], outputs=classification)

# Adam optimizer, binary cross-entropy loss, accuracy metric
siamese_model_1.compile(
    loss="binary_crossentropy",
    optimizer=Adam(lr=0.00006),
    metrics=['accuracy'],
)


In [21]:
# Train siamese model 1: X[:, 0] feeds the left input and X[:, 1] the right input.
# NOTE(review): Keras takes validation_split from the end of the arrays before shuffling —
# confirm that the pair order is acceptable for validation.
siamese_model_1.fit([X[:, 0], X[:, 1]], y, epochs=10, verbose=True, batch_size=32, validation_split=0.2)

Instructions for updating:
Use tf.cast instead.
Train on 168 samples, validate on 42 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x129b351d0>

In [41]:
# NOTE(review): `conv_net` is not defined in any visible cell of this notebook, so this cell
# fails on a fresh kernel. The recorded output suggests it was a VGG16 model — TODO confirm
# and either define it above or remove this cell.
conv_net.summary()

Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0     

In [40]:
# Layer-by-layer summary of the CNN branch shared by both inputs of siamese model 1
CNN.summary()

Model: "01_CNN"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
zero_padding2d_14 (ZeroPaddi (None, 226, 226, 3)       0         
_________________________________________________________________
conv2d_14 (Conv2D)           (None, 224, 224, 64)      1792      
_________________________________________________________________
zero_padding2d_15 (ZeroPaddi (None, 226, 226, 64)      0         
_________________________________________________________________
conv2d_15 (Conv2D)           (None, 224, 224, 64)      36928     
_________________________________________________________________
max_pooling2d_6 (MaxPooling2 (None, 112, 112, 64)      0         
_________________________________________________________________
dropout_6 (Dropout)          (None, 112, 112, 64)      0         
_________________________________________________________________
zero_padding2d_16 (ZeroPaddi (None, 114, 114, 64)      0    

# Save model

In [44]:
# Save siamese model 1 to disk.
# The file name follows the numbered convention of the other artifacts
# (02_siamese_model.h5, 01_siamese_weights.h5) and matches the path read back
# in the "Load model" section, so the notebook reloads what it saved.
siamese_model_1.save('/Users/greenapple/project5/models/01_siamese_model.h5') # save model to disc

In [52]:
# Save siamese model 2 to disk.
# NOTE(review): siamese_model_2 is built in the VGG16 section further down this notebook;
# that section has to run before this cell.
siamese_model_2.save('/Users/greenapple/project5/models/02_siamese_model.h5') # save model to disc

In [75]:
# Training on 100 speakers 1 epoch
# Saved under a separate file name, so the 10-speaker checkpoint (02_siamese_model.h5) is kept.
# NOTE(review): the 100-speaker fit it refers to is located further down this notebook.
siamese_model_2.save('/Users/greenapple/project5/models/02_2_siamese_model.h5') # save model to disc

In [53]:
# Save the VGG16-based encoder branch on its own.
# NOTE(review): VGG16_like is built in the VGG16 section further down this notebook.
VGG16_like.save('/Users/greenapple/project5/models/VGG16_like.h5') # save model to disc

In [None]:
# Save only the weights of siamese model 1 (this cell has no execution count, i.e. it was not run)
siamese_model_1.save_weights('/Users/greenapple/project5/models/01_siamese_weights.h5')

# Siamese_VGG16 model with 10 speakers and 3 audio files per speaker.

In [46]:
# Siamese model 2: VGG16-based branches followed by an L1-distance head

input_shape = (224, 224, 3)

# Load CNN branches - VGG16 with ImageNet weights, top dense layers removed
VGG16_model = VGG16(include_top=False, weights='imagenet', input_shape=input_shape)

# Freeze the convolutional layers so their weights are not updated during training
for base_layer in VGG16_model.layers:
    base_layer.trainable = False

# New encoding layer on top of the flattened VGG16 output
flattened = Flatten()(VGG16_model.output)
encodings = Dense(4096, activation='sigmoid')(flattened)

# VGG16 with dense layers replaced
VGG16_like = Model(inputs=VGG16_model.input, outputs=encodings, name='VGG16_like')
VGG16_like.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.00006))

# Define tensors for the two input audios
left_input = Input(input_shape)
right_input = Input(input_shape)

# Both inputs pass through the same VGG16_like instance, so the branches share weights
encoded_l = VGG16_like(left_input)
encoded_r = VGG16_like(right_input)

# Customized layer: element-wise absolute difference between the two encodings.
# Kept as a lambda so the layer is serialized the same way when the model is saved.
L1_layer = Lambda(lambda tensors: K.abs(tensors[0] - tensors[1]))
L1_distance = L1_layer([encoded_l, encoded_r])

# Classify the pair into "same speaker" = 1 or "not the same" = 0
classification = Dense(1, activation='sigmoid')(L1_distance)

# Siamese model
siamese_model_2 = Model(inputs=[left_input, right_input], outputs=classification)

# Adam optimizer, binary cross-entropy loss, accuracy metric
siamese_model_2.compile(
    loss='binary_crossentropy',
    optimizer=Adam(lr=0.00006),
    metrics=['accuracy'],
)

In [50]:
# Summary of siamese model 1: two inputs, the shared branch, then the distance layer
siamese_model_1.summary() # model summary

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_12 (InputLayer)           (None, 224, 224, 3)  0                                            
__________________________________________________________________________________________________
input_13 (InputLayer)           (None, 224, 224, 3)  0                                            
__________________________________________________________________________________________________
01_CNN (Sequential)             (None, 4096)         117479232   input_12[0][0]                   
                                                                 input_13[0][0]                   
__________________________________________________________________________________________________
lambda_4 (Lambda)               (None, 4096)         0           01_CNN[1][0]               

In [49]:
# Layer-by-layer summary of the VGG16-based encoder branch used by siamese model 2
VGG16_like.summary()

Model: "VGG16_like"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_19 (InputLayer)        (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0

In [51]:
# Train model with 10 speakers, 3 audios per speaker
# X[:, 0] feeds the left input and X[:, 1] the right input.
# NOTE(review): Keras takes validation_split from the end of the arrays before shuffling —
# confirm that the pair order is acceptable for validation.
siamese_model_2.fit([X[:, 0], X[:, 1]], y, epochs=10, verbose=True, batch_size=32, validation_split=0.2)

Train on 168 samples, validate on 42 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1b6f64438>

In [72]:
import src.data.process_audio

In [None]:
# Train model with 100 speakers, 3 audios per speaker
# NOTE(review): this rebinds X and y, replacing the 10-speaker arrays built earlier in the notebook.
train_folder = '/Users/greenapple/project5/data/raw/LibriSpeech/train-clean-360'
# presumably returns pair features and same-speaker targets in the layout the fit() call expects —
# verify in src/data/process_audio.py
X, y = process_audio.audio_to_features(train_folder, 100)

In [74]:
# Continue training siamese model 2 for one epoch on the pairs currently held in X / y
# (X[:, 0] feeds the left input, X[:, 1] the right input; 20% of the pairs are used for validation).
siamese_model_2.fit([X[:, 0], X[:, 1]], y, epochs=1, verbose=True, batch_size=32, validation_split=0.2)

Train on 32376 samples, validate on 8094 samples
Epoch 1/1


<keras.callbacks.History at 0x18631a3c8>

 # Load model

In [29]:
# Load a previously saved siamese model from disk; this rebinds siamese_model_1,
# replacing any model of that name built earlier in the session.
siamese_model_1 = load_model('/Users/greenapple/project5/models/01_siamese_model.h5') # load model

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.


In [30]:
# Summary of the model that was just loaded from disk
siamese_model_1.summary() # model summary

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 224, 224, 3)  0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 224, 224, 3)  0                                            
__________________________________________________________________________________________________
sequential_4 (Sequential)       (None, 4096)         117479232   input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, 4096)         0           sequential_4[1][0]         

In [None]:
# Evaluate the siamese model on the pairs currently held in X / y.
# The model has two inputs (left and right audio), so the pair axis of X is split
# into two arrays, exactly as in the fit() calls; passing X as a single array does
# not match the model's two inputs.
# presumably `score` holds the compiled loss followed by the accuracy metric — confirm for the loaded model
score = siamese_model_1.evaluate([X[:, 0], X[:, 1]], y, verbose=0) # evaluate model