<a href="https://colab.research.google.com/github/kerenalli/MyExamplePython/blob/main/LipReading_Beginning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
pip install sk-video

Collecting sk-video
[?25l  Downloading https://files.pythonhosted.org/packages/dd/3f/ce848b8b2062ad1ccf1449094a740c775f6c761339f411e44f1e090f23a7/sk_video-1.1.10-py2.py3-none-any.whl (2.3MB)
[K     |████████████████████████████████| 2.3MB 5.2MB/s 
Installing collected packages: sk-video
Successfully installed sk-video-1.1.10


In [5]:
import numpy as np
import cv2
import dlib
import math
import sys
import pickle
import argparse
import os
import skvideo.io

PART1: Construct the argument parse and parse the arguments

In [6]:
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--input", required=True,
                help="path to input video file")
ap.add_argument("-o", "--output", required=True,
                help="path to output video file")
ap.add_argument("-f", "--fps", type=int, default=30,
                help="FPS of output video")
ap.add_argument("-c", "--codec", type=str, default="MJPG",
                help="codec of output video")
args = vars(ap.parse_args())

usage: ipykernel_launcher.py [-h] -i INPUT -o OUTPUT [-f FPS] [-c CODEC]
ipykernel_launcher.py: error: argument -f/--fps: invalid int value: '/root/.local/share/jupyter/runtime/kernel-2a2e324f-4e4b-4fec-b146-91470d9c1798.json'


SystemExit: ignored

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


PART2: Calling and defining required parameters for:

       1 - Processing video for extracting each frame.
       2 - Lip extraction from frames.

In [None]:
# Dlib requirements.
predictor_path = '/content/shape_predictor_68_face_landmarks.dat'
detector = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor(predictor_path)
mouth_destination_path = os.path.dirname(args["output"]) + '/' + 'mouth'
if not os.path.exists(mouth_destination_path):
    os.makedirs(mouth_destination_path)

inputparameters = {}
outputparameters = {}
reader = skvideo.io.FFmpegReader(args["input"],
                inputdict=inputparameters,
                outputdict=outputparameters)
video_shape = reader.getShape()
(num_frames, h, w, c) = video_shape
print(num_frames, h, w, c)


In [None]:
# The required parameters
activation = []
max_counter = 150
total_num_frames = int(video_shape[0])
num_frames = min(total_num_frames,max_counter)
counter = 0
font = cv2.FONT_HERSHEY_SIMPLEX

In [None]:
# Define the writer
writer = skvideo.io.FFmpegWriter(args["output"])

In [None]:
# Required parameters for mouth extraction.
width_crop_max = 0
height_crop_max = 0

'''
Processing parameters.

    activation: set to one if the full mouth can be extracted and set to zero otherwise.
    max_counter: How many frames will be processed.
    total_num_frames: Total number of frames for the video.
    num_frames: The number of frames which are subjected to be processed.
    counter: The frame counter.
'''

"""

In [None]:
"""
PART3: Processing the video.

Procedure:
     1 - Extracting each frame.
     2 - Detect the mouth in the frame.
     3 - Define a boarder around the mouth.
     4 - Crop and save the mouth.

Technical considerations:
     * - For the first frame the mouth is detected and by using a boarder the mouth is extracted and cropped.
     * - After the first frame the size of the cropped windows remains fixed unless for the subsequent frames
          a bigger windows is required. In such a case the windows size will be increased and it will be held
          fixed again unless increasing the size becoming necessary again too.
"""
# Loop over all frames.
for frame in reader.nextFrame():
    print('frame_shape:', frame.shape)

    # Process the video and extract the frames up to a certain number and then stop processing.
    if counter > num_frames:
        break

    # Detection of the frame
    frame.setflags(write=True)
    detections = detector(frame, 1)

    # 20 mark for mouth
    marks = np.zeros((2, 20))

    # All unnormalized face features.
    Features_Abnormal = np.zeros((190, 1))

    # If the face is detected.
    print(len(detections))
    if len(detections) > 0:
        for k, d in enumerate(detections):

            # Shape of the face.
            shape = predictor(frame, d)

            co = 0
            # Specific for the mouth.
            for ii in range(48, 68):
                """
                This for loop is going over all mouth-related features.
                X and Y coordinates are extracted and stored separately.
                """
                X = shape.part(ii)
                A = (X.x, X.y)
                marks[0, co] = X.x
                marks[1, co] = X.y
                co += 1

            # Get the extreme points(top-left & bottom-right)
            X_left, Y_left, X_right, Y_right = [int(np.amin(marks, axis=1)[0]), int(np.amin(marks, axis=1)[1]),
                                                int(np.amax(marks, axis=1)[0]),
                                                int(np.amax(marks, axis=1)[1])]

            # Find the center of the mouth.
            X_center = (X_left + X_right) / 2.0
            Y_center = (Y_left + Y_right) / 2.0

            # Make a boarder for cropping.
            border = 30
            X_left_new = X_left - border
            Y_left_new = Y_left - border
            X_right_new = X_right + border
            Y_right_new = Y_right + border

            # Width and height for cropping(before and after considering the border).
            width_new = X_right_new - X_left_new
            height_new = Y_right_new - Y_left_new
            width_current = X_right - X_left
            height_current = Y_right - Y_left

            # Determine the cropping rectangle dimensions(the main purpose is to have a fixed area).
            if width_crop_max == 0 and height_crop_max == 0:
                width_crop_max = width_new
                height_crop_max = height_new
            else:
                width_crop_max += 1.5 * np.maximum(width_current - width_crop_max, 0)
                height_crop_max += 1.5 * np.maximum(height_current - height_crop_max, 0)

            # # # Uncomment if the lip area is desired to be rectangular # # # #
            #########################################################
            # Find the cropping points(top-left and bottom-right).
            X_left_crop = int(X_center - width_crop_max / 2.0)
            X_right_crop = int(X_center + width_crop_max / 2.0)
            Y_left_crop = int(Y_center - height_crop_max / 2.0)
            Y_right_crop = int(Y_center + height_crop_max / 2.0)
            #########################################################

            # # # # # Uncomment if the lip area is desired to be rectangular # # # #
            # #######################################
            # # Use this part if the cropped area should look like a square.
            # crop_length_max = max(width_crop_max, height_crop_max) / 2
            #
            # # Find the cropping points(top-left and bottom-right).
            # X_left_crop = int(X_center - crop_length_max)
            # X_right_crop = int(X_center + crop_length_max)
            # Y_left_crop = int(Y_center - crop_length_max)
            # Y_right_crop = int(Y_center + crop_length_max)
            #########################################

            if X_left_crop >= 0 and Y_left_crop >= 0 and X_right_crop < w and Y_right_crop < h:
                mouth = frame[Y_left_crop:Y_right_crop, X_left_crop:X_right_crop, :]

                # Save the mouth area.
                mouth_gray = cv2.cvtColor(mouth, cv2.COLOR_RGB2GRAY)
                cv2.imwrite(mouth_destination_path + '/' + 'frame' + '_' + str(counter) + '.png', mouth_gray)

                print("The cropped mouth is detected ...")
                activation.append(1)
            else:
                cv2.putText(frame, 'The full mouth is not detectable. ', (30, 30), font, 1, (0, 255, 255), 2)
                print("The full mouth is not detectable. ...")
                activation.append(0)

    else:
        cv2.putText(frame, 'Mouth is not detectable. ', (30, 30), font, 1, (0, 0, 255), 2)
        print("Mouth is not detectable. ...")
        activation.append(0)


    if activation[counter] == 1:
        # Demonstration of face.
        cv2.rectangle(frame, (X_left_crop, Y_left_crop), (X_right_crop, Y_right_crop), (0, 255, 0), 2)

    # cv2.imshow('frame', frame)
    print('frame number %d of %d' % (counter, num_frames))

    # write the output frame to file
    print("writing frame %d with activation %d" % (counter + 1, activation[counter]))
    writer.writeFrame(frame)
    counter += 1

writer.close()

In [None]:

"""
PART4: Save the activation vector as a list.

The python script for loading a list:
    with open(the_filename, 'rb') as f:
        my_list = pickle.load(f)
"""

the_filename = os.path.dirname(args["output"]) + '/' + 'activation'
my_list = activation
with open(the_filename, 'wb') as f:
    pickle.dump(my_list, f)

# **Audio**

######################################
####### Define the dataset class #####
######################################
class AudioDataset():
    """Audio dataset."""

    def __init__(self, files_path, audio_dir, transform=None):
        """
        Args:
            files_path (string): Path to the .txt file which the address of files are saved in it.
            root_dir (string): Directory with all the audio files.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """

In [None]:
import os
from scipy.io.wavfile import read
import scipy.io.wavfile as wav
import subprocess as sp
import numpy as np
import argparse
import random
import os
import sys
from random import shuffle
import speechpy
import datetime


In [None]:
class AudioDataset():
    """Audio dataset."""

    def __init__(self, files_path, audio_dir, transform=None):
        """
        Args:
            files_path (string): Path to the .txt file which the address of files are saved in it.
            root_dir (string): Directory with all the audio files.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """

        # self.sound_files = [x.strip() for x in content]
        self.audio_dir = audio_dir
        self.transform = transform

        # Open the .txt file and create a list from each line.
        with open(files_path, 'r') as f:
            content = f.readlines()
        # you may also want to remove whitespace characters like `\n` at the end of each line
        list_files = []
        for x in content:
            sound_file_path = os.path.join(self.audio_dir, x.strip().split()[1])
            try:
                with open(sound_file_path, 'rb') as f:
                    riff_size, _ = wav._read_riff_chunk(f)
                    file_size = os.path.getsize(sound_file_path)

                # Assertion error.
                assert riff_size == file_size and os.path.getsize(sound_file_path) > 1000, "Bad file!"

                # Add to list if file is OK!
                list_files.append(x.strip())
            except OSError as err:
                print("OS error: {0}".format(err))
            except ValueError:
                print('file %s is corrupted!' % sound_file_path)
            # except:
            #     print("Unexpected error:", sys.exc_info()[0])
            #     raise

        # Save the correct and healthy sound files to a list.
        self.sound_files = list_files

    def __len__(self):
        return len(self.sound_files)

    def __getitem__(self, idx):
        # Get the sound file path
        sound_file_path = os.path.join(self.audio_dir, self.sound_files[idx].split()[1])

        ##############################
        ### Reading and processing ###
        ##############################

        # Reading .wav file
        fs, signal = wav.read(sound_file_path)

        # Reading .wav file
        import soundfile as sf
        signal, fs = sf.read(sound_file_path)

        ###########################
        ### Feature Extraction ####
        ###########################

        # DEFAULTS:
        num_coefficient = 40

        # Staching frames
        frames = speechpy.processing.stack_frames(signal, sampling_frequency=fs, frame_length=0.02,
                                                  frame_stride=0.02,
                                                  zero_padding=True)

        # # Extracting power spectrum (choosing 3 seconds and elimination of DC)
        power_spectrum = speechpy.processing.power_spectrum(frames, fft_points=2 * num_coefficient)[:, 1:]

        logenergy = speechpy.feature.lmfe(signal, sampling_frequency=fs, frame_length=0.02, frame_stride=0.02,
                                          num_filters=num_coefficient, fft_length=1024, low_frequency=0,
                                          high_frequency=None)
        

        ########################
        ### Handling sample ####
        ########################

        # Label extraction
        label = int(self.sound_files[idx].split()[0])

        sample = {'feature': logenergy, 'label': label}

        ########################
        ### Post Processing ####
        ########################
        if self.transform:
            sample = self.transform(sample)
        else:
            feature, label = sample['feature'], sample['label']
            sample = feature, label

        return sample
        # return sample


class CMVN(object):
    """Cepstral mean variance normalization.

    """

    def __call__(self, sample):
        feature, label = sample['feature'], sample['label']

        # Mean variance normalization of the spectrum.
        # The following line should be Uncommented if cepstral mean variance normalization is desired!
        feature = speechpy.processing.cmvn(feature, variance_normalization=True)

        return {'feature': feature, 'label': label}

In [None]:
class Extract_Derivative(object):
    """
    Extract derivative features.

    """

    def __call__(self, sample):
        feature, label = sample['feature'], sample['label']

        # Extract derivative features
        feature = speechpy.feature.extract_derivative_feature(feature)

        return {'feature': feature, 'label': label}

In [None]:
class Feature_Cube(object):
    """Return a feature cube of desired size.

    Args:
        cube_shape (tuple): The shape of the feature cube.
        ex: cube_shape=(15,40,3)
    """

    def __init__(self, cube_shape):
        
        self.cube_shape = cube_shape
        if self.cube_shape != None:
            self.num_frames = cube_shape[0]
            self.num_features = cube_shape[1]
            self.num_channels = cube_shape[2]


    def __call__(self, sample):
        feature, label = sample['feature'], sample['label']         

        if self.cube_shape != None:
            feature_cube = np.zeros((self.num_frames, self.num_features, self.num_channels), dtype=np.float32)
            feature_cube = feature[0:self.num_frames, :, :]
        else:
            feature_cube = feature
                 
        
        # return {'feature': feature_cube, 'label': label}
        return {'feature': feature_cube[None, :, :, :], 'label': label}

In [None]:
class ToOutput(object):
    """Return the output.

    """

    def __call__(self, sample):
        feature, label = sample['feature'], sample['label']

        return feature, label

In [None]:
class Compose(object):
    """Composes several transforms together.
    Args:
        transforms (list of ``Transform`` objects): list of transforms to compose.
    Example:
        >>> Compose([
        >>>     CMVN(),
        >>>     Feature_Cube(cube_shape=(20, 80, 40),
        >>>     augmentation=True), ToOutput(),
        >>>        ])
        If necessary, for the details of this class, please refer to Pytorch documentation.
    """

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, img):
        for t in self.transforms:
            img = t(img)
        return img

    def __repr__(self):
        format_string = self.__class__.__name__ + '('
        for t in self.transforms:
            format_string += '\n'
            format_string += '    {0}'.format(t)
        format_string += '\n)'
        return format_string


if __name__ == '__main__':
    # add parser
    parser = argparse.ArgumentParser(description='Input pipeline')

    # The text file in which the paths to the audio files are available.
    # The path are relative to the directory of the audio files
    # Format of each line of the txt file is "class_label subject_dir/sound_file_name.ext"
    # Example of each line: 0 subject/sound.wav
    parser.add_argument('--file_path',
                        default=os.path.expanduser(
                            '~/github/3D-convolutional-speaker-recognition/code/0-input/file_path.txt'),
                        help='The file names for development phase')

    # The directory of the audio files separated by subject
    parser.add_argument('--audio_dir',
                        default=os.path.expanduser('~/github/lip-reading-deeplearning/code/speech-input/Audio'),
                        help='Location of sound files')
    args = parser.parse_args()

    dataset = AudioDataset(files_path=args.file_path, audio_dir=args.audio_dir,
                           transform=Compose([Extract_Derivative(), Feature_Cube(cube_shape=None), ToOutput()]))
    idx = 0
    feature, label = dataset.__getitem__(idx)
    print(feature.shape)
    print(label)

# **Train**

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

import sys
import numpy as np
from sklearn.model_selection import KFold
from tensorflow.python.ops import control_flow_ops
from nets import nets_factory
from auxiliary import losses
from roc_curve import calculate_roc
import os

slim = tf.contrib.slim

######################
# Train Directory #
######################

tf.app.flags.DEFINE_string(
    'train_dir', os.path.expanduser('~/results/TRAIN_CNN_3D'),
    'Directory where checkpoints and event logs are written to.')

tf.app.flags.DEFINE_integer('num_clones', 1,
                            'Number of model clones to deploy.')

tf.app.flags.DEFINE_boolean('clone_on_cpu', False,
                            'Use CPUs to deploy clones.')

tf.app.flags.DEFINE_integer(
    'log_every_n_steps', 1,
    'The frequency with which logs are print.')


######################
# Optimization Flags #
######################

tf.app.flags.DEFINE_float(
    'weight_decay', 0.00004, 'The weight decay on the model weights.')

tf.app.flags.DEFINE_string(
    'optimizer', 'adam',
    'The name of the optimizer, one of "adadelta", "adagrad", "adam",'
    '"ftrl", "momentum", "sgd" or "rmsprop".')


tf.app.flags.DEFINE_float(
    'adam_beta1', 0.9,
    'The exponential decay rate for the 1st moment estimates.')

tf.app.flags.DEFINE_float(
    'adam_beta2', 0.999,
    'The exponential decay rate for the 2nd moment estimates.')

tf.app.flags.DEFINE_float('opt_epsilon', 1.0, 'Epsilon term for the optimizer.')

tf.app.flags.DEFINE_float(
    'momentum', 0.9,
    'The momentum for the MomentumOptimizer and RMSPropOptimizer.')

#######################
# Learning Rate Flags #
#######################

tf.app.flags.DEFINE_string(
    'learning_rate_decay_type',
    'exponential',
    'Specifies how the learning rate is decayed. One of "fixed", "exponential",'
    ' or "polynomial"')

tf.app.flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')

tf.app.flags.DEFINE_float(
    'end_learning_rate', 0.0001,
    'The minimal end learning rate used by a polynomial decay learning rate.')

tf.app.flags.DEFINE_float(
    'label_smoothing', 0.0, 'The amount of label smoothing.')

tf.app.flags.DEFINE_float(
    'learning_rate_decay_factor', 0.94, 'Learning rate decay factor.')

tf.app.flags.DEFINE_float(
    'num_epochs_per_decay', 5.0,
    'Number of epochs after which learning rate decays.')

tf.app.flags.DEFINE_bool(
    'sync_replicas', False,
    'Whether or not to synchronize the replicas during training.')

tf.app.flags.DEFINE_integer(
    'replicas_to_aggregate', 1,
    'The Number of gradients to collect before updating params.')

tf.app.flags.DEFINE_float(
    'moving_average_decay', None,
    'The decay to use for the moving average.'
    'If left as None, then moving averages are not used.')

#######################
# Dataset Flags #
#######################

tf.app.flags.DEFINE_string(
    'model_speech_name', 'lipread_speech', 'The name of the architecture to train.')

tf.app.flags.DEFINE_string(
    'model_mouth_name', 'lipread_mouth', 'The name of the architecture to train.')


tf.app.flags.DEFINE_integer(
    'batch_size', 32, 'The number of samples in each batch.')

tf.app.flags.DEFINE_integer(
    'num_epochs', 1, 'The number of epochs for training.')


#####################
# Fine-Tuning Flags #
#####################

tf.app.flags.DEFINE_string(
    'checkpoint_path', None,
    'The path to a checkpoint from which to fine-tune. ex:/home/sina/TRAIN_CASIA/train_logs/vgg_19.cpkt')

tf.app.flags.DEFINE_string(
    'checkpoint_exclude_scopes', None,
    'Comma-separated list of scopes of variables to exclude when restoring'
    'from a checkpoint. ex: vgg_19/fc8/biases,vgg_19/fc8/weights')

tf.app.flags.DEFINE_string(
    'trainable_scopes', None,
    'Comma-separated list of scopes to filter the set of variables to train.'
    'By default, None would train all the variables.')

tf.app.flags.DEFINE_boolean(
    'ignore_missing_vars', False,
    'When restoring a checkpoint would ignore missing variables.')

# Store all elemnts in FLAG structure!
FLAGS = tf.app.flags.FLAGS


def _configure_learning_rate(num_samples_per_epoch, global_step):
    """Configures the learning rate.

    Args:
      num_samples_per_epoch: The number of samples in each epoch of training.
      global_step: The global_step tensor.

    Returns:
      A `Tensor` representing the learning rate.

    Raises:
      ValueError: if
    """
    decay_steps = int(num_samples_per_epoch / FLAGS.batch_size *
                      FLAGS.num_epochs_per_decay)
    if FLAGS.sync_replicas:
        decay_steps /= FLAGS.replicas_to_aggregate

    if FLAGS.learning_rate_decay_type == 'exponential':
        return tf.train.exponential_decay(FLAGS.learning_rate,
                                          global_step,
                                          decay_steps,
                                          FLAGS.learning_rate_decay_factor,
                                          staircase=True,
                                          name='exponential_decay_learning_rate')
    elif FLAGS.learning_rate_decay_type == 'fixed':
        return tf.constant(FLAGS.learning_rate, name='fixed_learning_rate')
    elif FLAGS.learning_rate_decay_type == 'polynomial':
        return tf.train.polynomial_decay(FLAGS.learning_rate,
                                         global_step,
                                         decay_steps,
                                         FLAGS.end_learning_rate,
                                         power=1.0,
                                         cycle=False,
                                         name='polynomial_decay_learning_rate')
    else:
        raise ValueError('learning_rate_decay_type [%s] was not recognized',
                         FLAGS.learning_rate_decay_type)


def _configure_optimizer(learning_rate):
    """Configures the optimizer used for training.

    Args:
      learning_rate: A scalar or `Tensor` learning rate.

    Returns:
      An instance of an optimizer.

    Raises:
      ValueError: if FLAGS.optimizer is not recognized.
    """

    if FLAGS.optimizer == 'adam':
        optimizer = tf.train.AdamOptimizer(
            learning_rate,
            beta1=FLAGS.adam_beta1,
            beta2=FLAGS.adam_beta2,
            epsilon=FLAGS.opt_epsilon)
    elif FLAGS.optimizer == 'sgd':
        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    else:
        raise ValueError('Optimizer [%s] was not recognized', FLAGS.optimizer)
    return optimizer


def average_gradients(tower_grads):
    """Calculate the average gradient for each shared variable across all towers.

    Note that this function provides a synchronization point across all towers.

    Args:
      tower_grads: List of lists of (gradient, variable) tuples. The outer list
        is over individual gradients. The inner list is over the gradient
        calculation for each tower.
    Returns:
       List of pairs of (gradient, variable) where the gradient has been averaged
       across all towers.
    """
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # Note that each grad_and_vars looks like the following:
        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
        grads = []
        for g, _ in grad_and_vars:
            # Add 0 dimension to the gradients to represent the tower.
            expanded_g = tf.expand_dims(g, 0)

            # Append on a 'tower' dimension which we will average over below.
            grads.append(expanded_g)

        # Average over the 'tower' dimension.
        grad = tf.concat(axis=0, values=grads)
        grad = tf.reduce_mean(grad, 0)

        # Keep in mind that the Variables are redundant because they are shared
        # across towers. So .. we will just return the first tower's pointer to
        # the Variable.
        v = grad_and_vars[0][1]
        grad_and_var = (grad, v)
        average_grads.append(grad_and_var)
    return average_grads


def _get_variables_to_train():
    """Returns a list of variables to train.

    Returns:
      A list of variables to train by the optimizer.
    """
    if FLAGS.trainable_scopes is None:
        return tf.trainable_variables()
    else:
        scopes = [scope.strip() for scope in FLAGS.trainable_scopes.split(',')]

    variables_to_train = []
    for scope in scopes:
        variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
        variables_to_train.extend(variables)
    return variables_to_train


# Definign arbitrary data
num_training_samples = 1000
num_testing_samples = 1000
train_data = {}

train_data = {'mouth': np.random.random_sample(size=(num_training_samples, 9, 60, 100, 1)),
              'speech': np.random.random_sample(size=(num_training_samples, 15, 40, 1, 3))}
test_data = {'mouth': np.random.random_sample(size=(num_testing_samples, 9, 60, 100, 1)),
             'speech': np.random.random_sample(size=(num_testing_samples, 15, 40, 1, 3))}

train_label = np.random.randint(2, size=(num_training_samples, 1))
test_label = np.random.randint(2, size=(num_testing_samples, 1))


# # Uncomment if data standardalization is required and the mean and std vectors have been calculated.
# ############ Get the mean vectors ####################
#
# # mean mouth
# mean_mouth = np.load('/path/to/mean/file/mouth.npy')
# # mean_mouth = np.tile(mean_mouth.reshape(47, 73, 1), (1, 1, 9))
# mean_mouth = mean_mouth[None, :]
# mean_channel_mouth = np.mean(mean_mouth)
#
# # mean speech
# mean_speech = np.load('/path/to/mean/file/speech.npy')
# mean_speech = mean_speech[None, :]
# # mean_channel_speech = np.hstack((
# #     [np.mean(mean_speech[:, :, :, 0])], [np.mean(mean_speech[:, :, :, 1])], [np.mean(mean_speech[:, :, :, 2])]))
#
# ############ Get the std vectors ####################
#
# # mean std
# std_mouth = np.load('/path/to/std/file/mouth.npy')
# std_mouth = np.tile(std_mouth.reshape(60, 100, 1), (1, 1, 9))
# std_mouth = std_mouth[None, :]
#
# # mean speech
# std_speech = np.load('/path/to/std/file/speech.npy')
# std_speech = std_speech[None, :]




def main(_):


    tf.logging.set_verbosity(tf.logging.INFO)

    graph = tf.Graph()
    with graph.as_default(), tf.device('/cpu:0'):
        ######################
        # Config model_deploy#
        ######################

        # required from data
        num_samples_per_epoch = train_data['mouth'].shape[0]
        num_batches_per_epoch = int(num_samples_per_epoch / FLAGS.batch_size)

        num_samples_per_epoch_test = test_data['mouth'].shape[0]
        num_batches_per_epoch_test = int(num_samples_per_epoch_test / FLAGS.batch_size)

        # Create global_step
        global_step = tf.Variable(0, name='global_step', trainable=False)

        #########################################
        # Configure the larning rate. #
        #########################################
        learning_rate = _configure_learning_rate(num_samples_per_epoch, global_step)
        opt = _configure_optimizer(learning_rate)

        ######################
        # Select the network #
        ######################
        is_training = tf.placeholder(tf.bool)

        network_speech_fn = nets_factory.get_network_fn(
            FLAGS.model_speech_name,
            num_classes=2,
            weight_decay=FLAGS.weight_decay,
            is_training=is_training)

        network_mouth_fn = nets_factory.get_network_fn(
            FLAGS.model_mouth_name,
            num_classes=2,
            weight_decay=FLAGS.weight_decay,
            is_training=is_training)

        #####################################
        # Select the preprocessing function #
        #####################################

        # TODO: Do some preprocessing if necessary.

        ##############################################################
        # Create a dataset provider that loads data from the dataset #
        ##############################################################
        # with tf.device(deploy_config.inputs_device()):
        """
        Define the place holders and creating the batch tensor.
        """

        # Mouth spatial set
        INPUT_SEQ_LENGTH = 9
        INPUT_HEIGHT = 60
        INPUT_WIDTH = 100
        INPUT_CHANNELS = 1
        batch_mouth = tf.placeholder(tf.float32, shape=(
            [None, INPUT_SEQ_LENGTH, INPUT_HEIGHT, INPUT_WIDTH, INPUT_CHANNELS]))

        # Speech spatial set
        INPUT_SEQ_LENGTH_SPEECH = 15
        INPUT_HEIGHT_SPEECH = 40
        INPUT_WIDTH_SPEECH = 1
        INPUT_CHANNELS_SPEECH = 3
        batch_speech = tf.placeholder(tf.float32, shape=(
            [None, INPUT_SEQ_LENGTH_SPEECH, INPUT_HEIGHT_SPEECH, INPUT_WIDTH_SPEECH, INPUT_CHANNELS_SPEECH]))

        # Label
        batch_labels = tf.placeholder(tf.uint8, (None, 1))
        margin_imp_tensor = tf.placeholder(tf.float32, ())

        ################################
        ## Feed forwarding to network ##
        ################################
        tower_grads = []
        with tf.variable_scope(tf.get_variable_scope()):
            for i in range(FLAGS.num_clones):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('%s_%d' % ('tower', i)) as scope:
                        """
                        Two distance metric are defined:
                           1 - distance_weighted: which is a weighted average of the distance between two structures.
                           2 - distance_l2: which is the regular l2-norm of the two networks outputs.
                        Place holders

                        """
                        ########################################
                        ######## Outputs of two networks #######
                        ########################################

                        logits_speech, end_points_speech = network_speech_fn(batch_speech)
                        logits_mouth, end_points_mouth = network_mouth_fn(batch_mouth)

                        # # Uncomment if the output embedding is desired to be as |f(x)| = 1
                        # logits_speech = tf.nn.l2_normalize(logits_speech, dim=1, epsilon=1e-12, name=None)
                        # logits_mouth = tf.nn.l2_normalize(logits_mouth, dim=1, epsilon=1e-12, name=None)

                        #################################################
                        ########### Loss Calculation ####################
                        #################################################

                        # ##### Weighted distance using a fully connected layer #####
                        # distance_vector = tf.subtract(logits_speech, logits_mouth,  name=None)
                        # distance_weighted = slim.fully_connected(distance_vector, 1, activation_fn=tf.nn.sigmoid,
                        #                                          normalizer_fn=None,
                        #                                          scope='fc_weighted')

                        ##### Euclidean distance ####
                        distance_l2 = tf.sqrt(
                            tf.reduce_sum(tf.pow(tf.subtract(logits_speech, logits_mouth), 2), 1, keepdims=True))

                        ##### Contrastive loss ######
                        loss = losses.contrastive_loss(batch_labels, distance_l2, margin_imp=margin_imp_tensor,
                                                       scope=scope)

                        # ##### call the optimizer ######
                        # # TODO: call optimizer object outside of this gpu environment
                        #
                        # Reuse variables for the next tower.
                        tf.get_variable_scope().reuse_variables()

                        # Calculate the gradients for the batch of data on this CIFAR tower.
                        grads = opt.compute_gradients(loss)

                        # Keep track of the gradients across all towers.
                        tower_grads.append(grads)


        # Calculate the mean of each gradient.
        grads = average_gradients(tower_grads)

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # Track the moving averages of all trainable variables.
        MOVING_AVERAGE_DECAY = 0.9999
        variable_averages = tf.train.ExponentialMovingAverage(
            MOVING_AVERAGE_DECAY, global_step)
        variables_averages_op = variable_averages.apply(tf.trainable_variables())

        # Group all updates to into a single train op.
        train_op = tf.group(apply_gradient_op, variables_averages_op)

        #################################################
        ########### Summary Section #####################
        #################################################

        # Gather initial summaries.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        # Add summaries for all end_points.
        for end_point in end_points_speech:
            x = end_points_speech[end_point]
            # summaries.add(tf.summary.histogram('activations_speech/' + end_point, x))
            summaries.add(tf.summary.scalar('sparsity_speech/' + end_point,
                                            tf.nn.zero_fraction(x)))

        for end_point in end_points_mouth:
            x = end_points_mouth[end_point]
            # summaries.add(tf.summary.histogram('activations_mouth/' + end_point, x))
            summaries.add(tf.summary.scalar('sparsity_mouth/' + end_point,
                                            tf.nn.zero_fraction(x)))

        # Add summaries for variables.
        for variable in slim.get_model_variables():
            summaries.add(tf.summary.histogram(variable.op.name, variable))

        # Add to parameters to summaries
        summaries.add(tf.summary.scalar('learning_rate', learning_rate))
        summaries.add(tf.summary.scalar('global_step', global_step))
        summaries.add(tf.summary.scalar('eval/Loss', loss))
        summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        # Merge all summaries together.
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

    ###########################
    ######## Training #########
    ###########################

    with tf.Session(graph=graph, config=tf.ConfigProto(allow_soft_placement=True)) as sess:

        # Initialization of the network.
        variables_to_restore = slim.get_variables_to_restore()
        saver = tf.train.Saver(variables_to_restore, max_to_keep=20)
        coord = tf.train.Coordinator()
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())

        # # Restore the model
        # saver.restore(sess, '/home/sina/TRAIN_LIPREAD/train_logs-1366')

        # op to write logs to Tensorboard
        summary_writer = tf.summary.FileWriter(FLAGS.train_dir, graph=graph)

        #####################################
        ############## TRAIN ################
        #####################################

        step = 1
        for epoch in range(FLAGS.num_epochs):

            # Loop over all batches

            for batch_num in range(num_batches_per_epoch):
                step += 1
                start_idx = batch_num * FLAGS.batch_size
                end_idx = (batch_num + 1) * FLAGS.batch_size
                speech_train, mouth_train, label_train = train_data['speech'][start_idx:end_idx], train_data['mouth'][
                                                                                                  start_idx:end_idx], train_label[
                                                                                                                      start_idx:end_idx]

                # # # Standardalization for speech if necessary
                # speech_train = (speech_train - mean_speech) / std_speech
                #
                # # # Standardalization  for visual if necessary
                # mouth_train = (mouth_train - mean_mouth) / std_mouth

                #########################################################################
                ################## Online Pair Selection Algorithm ######################
                #########################################################################
                online_pair_selection = True
                if online_pair_selection:
                    distance = sess.run(
                        distance_l2,
                        feed_dict={is_training: False, batch_speech: speech_train,
                                   batch_mouth: mouth_train,
                                   batch_labels: label_train.reshape([FLAGS.batch_size, 1])})
                    label_keep = []

                    ###############################
                    hard_margin = 10

                    # Max-Min distance in genuines
                    max_gen = 0
                    min_gen = 100
                    for j in range(label_train.shape[0]):
                        if label_train[j] == 1:
                            if max_gen < distance[j, 0]:
                                max_gen = distance[j, 0]
                            if min_gen > distance[j, 0]:
                                min_gen = distance[j, 0]

                    # Min-Max distance in impostors
                    min_imp = 100
                    max_imp = 0
                    for k in range(label_train.shape[0]):
                        if label_train[k] == 0:
                            if min_imp > distance[k, 0]:
                                min_imp = distance[k, 0]
                            if max_imp < distance[k, 0]:
                                max_imp = distance[k, 0]

                    ### Keeping hard impostors and genuines
                    for i in range(label_train.shape[0]):
                        # imposter
                        if label_train[i] == 0:
                            if distance[i, 0] < max_gen + hard_margin:
                                label_keep.append(i)
                        elif label_train[i] == 1:
                            # if distance[i, 0] > min_imp - hard_margin:
                            label_keep.append(i)

                    #### Choosing the pairs ######
                    speech_train = speech_train[label_keep]
                    mouth_train = mouth_train[label_keep]
                    label_train = label_train[label_keep]

                ############################################
                #### Running the training operation ########
                _, loss_value, score_dissimilarity, summary, training_step, _ = sess.run(
                    [train_op, loss, distance_l2, summary_op, global_step, is_training],
                    feed_dict={is_training: True, margin_imp_tensor: 100,
                               batch_speech: speech_train, batch_mouth: mouth_train,
                               batch_labels: label_train.reshape([label_train.shape[0], 1])})
                summary_writer.add_summary(summary, epoch * num_batches_per_epoch + i)

                # try and error method is used to handle the error due to ROC calculation
                try:
                    # Calculation of ROC
                    EER, AUC, AP, fpr, tpr = calculate_roc.calculate_eer_auc_ap(label_train, score_dissimilarity)

                    if (batch_num + 1) % FLAGS.log_every_n_steps == 0:
                        print("Epoch " + str(epoch + 1) + ", Minibatch " + str(
                            batch_num + 1) + " of %d " % num_batches_per_epoch + ", Minibatch Loss= " + \
                              "{:.6f}".format(loss_value) + ", EER= " + "{:.5f}".format(EER) + ", AUC= " + "{:.5f}".format(
                            AUC) + ", AP= " + "{:.5f}".format(AP) + ", contrib = %d pairs" % label_train.shape[0])
                except:
                    print("Error: " ,sys.exc_info()[0])
                    print("No contributing impostor pair!")

            # Save the model
            saver.save(sess, FLAGS.train_dir, global_step=training_step)

            ###################################################
            ############## TEST PER EACH EPOCH ################
            ###################################################
            score_dissimilarity_vector = np.zeros((FLAGS.batch_size * num_batches_per_epoch_test, 1))
            label_vector = np.zeros((FLAGS.batch_size * num_batches_per_epoch_test, 1))

            # Loop over all batches
            for i in range(num_batches_per_epoch_test):
                start_idx = i * FLAGS.batch_size
                end_idx = (i + 1) * FLAGS.batch_size
                speech_test, mouth_test, label_test = test_data['speech'][start_idx:end_idx], test_data['mouth'][
                                                                                              start_idx:end_idx], test_label[
                                                                                                                  start_idx:end_idx]

                # # # Uncomment if standardalization is needed
                # # mean subtraction if necessary
                # speech_test = (speech_test - mean_speech) / std_speech
                # mouth_test = (mouth_test - mean_mouth) / std_mouth

                # Evaluation phase
                # WARNING: margin_imp_tensor has no effect here but it needs to be there because its tensor required a value to feed in!!
                loss_value, score_dissimilarity, _ = sess.run([loss, distance_l2, is_training],
                                                              feed_dict={is_training: False,
                                                                         margin_imp_tensor: 50,
                                                                         batch_speech: speech_test,
                                                                         batch_mouth: mouth_test,
                                                                         batch_labels: label_test.reshape(
                                                                             [FLAGS.batch_size, 1])})
                if (i + 1) % FLAGS.log_every_n_steps == 0:
                    print("TESTING: Epoch " + str(epoch + 1) + ", Minibatch " + str(
                        i + 1) + " of %d " % num_batches_per_epoch_test)
                score_dissimilarity_vector[start_idx:end_idx] = score_dissimilarity
                label_vector[start_idx:end_idx] = label_test

            ##############################
            ##### K-fold validation ######
            ##############################
            K = 10
            EER = np.zeros((K, 1))
            AUC = np.zeros((K, 1))
            AP = np.zeros((K, 1))
            batch_k_validation = int(label_vector.shape[0] / float(K))

            for i in range(K):
                EER[i, :], AUC[i, :], AP[i, :], fpr, tpr = calculate_roc.calculate_eer_auc_ap(
                    label_vector[i * batch_k_validation:(i + 1) * batch_k_validation],
                    score_dissimilarity_vector[i * batch_k_validation:(i + 1) * batch_k_validation])

            # Printing Equal Error Rate(EER), Area Under the Curve(AUC) and Average Precision(AP)
            print("TESTING: Epoch " + str(epoch + 1) + ", EER= " + str(np.mean(EER, axis=0)) + ", AUC= " + str(
                np.mean(AUC, axis=0)) + ", AP= " + str(np.mean(AP, axis=0)))


if __name__ == '__main__':
    tf.app.run()


# **Testing**

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

import sys
import numpy as np
from sklearn.model_selection import KFold
from tensorflow.python.ops import control_flow_ops
from nets import nets_factory
from auxiliary import losses
from roc_curve import calculate_roc
import os
# import matplotlib.pyplot as plt
slim = tf.contrib.slim

######################
# Train Directory #
######################

tf.app.flags.DEFINE_string(
    'test_dir', 'results/TRAIN_CNN_3D/test_logs',
    'Directory where checkpoints and event logs are written to.')

tf.app.flags.DEFINE_string(
    'checkpoint_dir', os.path.expanduser('~/results/'),
    'Directory where checkpoints and event logs are written to.')


tf.app.flags.DEFINE_integer('num_clones', 1,
                            'Number of model clones to deploy.')

tf.app.flags.DEFINE_boolean('clone_on_cpu', False,
                            'Use CPUs to deploy clones.')

tf.app.flags.DEFINE_integer(
    'log_every_n_steps', 1,
    'The frequency with which logs are print.')


######################
# Optimization Flags #
######################

tf.app.flags.DEFINE_float(
    'weight_decay', 0.00004, 'The weight decay on the model weights.')

tf.app.flags.DEFINE_string(
    'optimizer', 'adam',
    'The name of the optimizer, one of "adadelta", "adagrad", "adam",'
    '"ftrl", "momentum", "sgd" or "rmsprop".')


tf.app.flags.DEFINE_float(
    'adam_beta1', 0.9,
    'The exponential decay rate for the 1st moment estimates.')

tf.app.flags.DEFINE_float(
    'adam_beta2', 0.999,
    'The exponential decay rate for the 2nd moment estimates.')

tf.app.flags.DEFINE_float('opt_epsilon', 1.0, 'Epsilon term for the optimizer.')

tf.app.flags.DEFINE_float(
    'momentum', 0.9,
    'The momentum for the MomentumOptimizer and RMSPropOptimizer.')

#######################
# Learning Rate Flags #
#######################

tf.app.flags.DEFINE_string(
    'learning_rate_decay_type',
    'exponential',
    'Specifies how the learning rate is decayed. One of "fixed", "exponential",'
    ' or "polynomial"')
 
tf.app.flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')

tf.app.flags.DEFINE_float(
    'end_learning_rate', 0.0001,
    'The minimal end learning rate used by a polynomial decay learning rate.')

tf.app.flags.DEFINE_float(
    'label_smoothing', 0.0, 'The amount of label smoothing.')

tf.app.flags.DEFINE_float(
    'learning_rate_decay_factor', 0.94, 'Learning rate decay factor.')

tf.app.flags.DEFINE_float(
    'num_epochs_per_decay', 5.0,
    'Number of epochs after which learning rate decays.')

tf.app.flags.DEFINE_bool(
    'sync_replicas', False,
    'Whether or not to synchronize the replicas during training.')

tf.app.flags.DEFINE_integer(
    'replicas_to_aggregate', 1,
    'The Number of gradients to collect before updating params.')

tf.app.flags.DEFINE_float(
    'moving_average_decay', None,
    'The decay to use for the moving average.'
    'If left as None, then moving averages are not used.')

#######################
# Dataset Flags #
#######################

tf.app.flags.DEFINE_string(
    'model_speech_name', 'lipread_speech', 'The name of the architecture to train.')

tf.app.flags.DEFINE_string(
    'model_mouth_name', 'lipread_mouth', 'The name of the architecture to train.')


tf.app.flags.DEFINE_integer(
    'batch_size', 128, 'The number of samples in each batch.')

tf.app.flags.DEFINE_integer(
    'num_epochs', 20, 'The number of epochs for training.')


#####################
# Fine-Tuning Flags #
#####################

tf.app.flags.DEFINE_string(
    'checkpoint_path', None,
    'The path to a checkpoint from which to fine-tune. ex:/home/user/TRAIN/train_logs')

tf.app.flags.DEFINE_string(
    'checkpoint_exclude_scopes', None,
    'Comma-separated list of scopes of variables to exclude when restoring'
    'from a checkpoint. ex: vgg_19/fc8/biases,vgg_19/fc8/weights')

tf.app.flags.DEFINE_string(
    'trainable_scopes', None,
    'Comma-separated list of scopes to filter the set of variables to train.'
    'By default, None would train all the variables.')

tf.app.flags.DEFINE_boolean(
    'ignore_missing_vars', False,
    'When restoring a checkpoint would ignore missing variables.')

# Store all elemnts in FLAG structure!
FLAGS = tf.app.flags.FLAGS


def _configure_learning_rate(num_samples_per_epoch, global_step):
    """Configures the learning rate.

    Args:
      num_samples_per_epoch: The number of samples in each epoch of training.
      global_step: The global_step tensor.

    Returns:
      A `Tensor` representing the learning rate.

    Raises:
      ValueError: if
    """
    decay_steps = int(num_samples_per_epoch / FLAGS.batch_size *
                      FLAGS.num_epochs_per_decay)
    if FLAGS.sync_replicas:
        decay_steps /= FLAGS.replicas_to_aggregate

    if FLAGS.learning_rate_decay_type == 'exponential':
        return tf.train.exponential_decay(FLAGS.learning_rate,
                                          global_step,
                                          decay_steps,
                                          FLAGS.learning_rate_decay_factor,
                                          staircase=True,
                                          name='exponential_decay_learning_rate')
    elif FLAGS.learning_rate_decay_type == 'fixed':
        return tf.constant(FLAGS.learning_rate, name='fixed_learning_rate')
    elif FLAGS.learning_rate_decay_type == 'polynomial':
        return tf.train.polynomial_decay(FLAGS.learning_rate,
                                         global_step,
                                         decay_steps,
                                         FLAGS.end_learning_rate,
                                         power=1.0,
                                         cycle=False,
                                         name='polynomial_decay_learning_rate')
    else:
        raise ValueError('learning_rate_decay_type [%s] was not recognized',
                         FLAGS.learning_rate_decay_type)


def _configure_optimizer(learning_rate):
    """Configures the optimizer used for training.

    Args:
      learning_rate: A scalar or `Tensor` learning rate.

    Returns:
      An instance of an optimizer.

    Raises:
      ValueError: if FLAGS.optimizer is not recognized.
    """

    if FLAGS.optimizer == 'adam':
        optimizer = tf.train.AdamOptimizer(
            learning_rate,
            beta1=FLAGS.adam_beta1,
            beta2=FLAGS.adam_beta2,
            epsilon=FLAGS.opt_epsilon)
    elif FLAGS.optimizer == 'sgd':
        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    else:
        raise ValueError('Optimizer [%s] was not recognized', FLAGS.optimizer)
    return optimizer


def average_gradients(tower_grads):
    """Calculate the average gradient for each shared variable across all towers.

    Note that this function provides a synchronization point across all towers.

    Args:
      tower_grads: List of lists of (gradient, variable) tuples. The outer list
        is over individual gradients. The inner list is over the gradient
        calculation for each tower.
    Returns:
       List of pairs of (gradient, variable) where the gradient has been averaged
       across all towers.
    """
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # Note that each grad_and_vars looks like the following:
        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
        grads = []
        for g, _ in grad_and_vars:
            # Add 0 dimension to the gradients to represent the tower.
            expanded_g = tf.expand_dims(g, 0)

            # Append on a 'tower' dimension which we will average over below.
            grads.append(expanded_g)

        # Average over the 'tower' dimension.
        grad = tf.concat(axis=0, values=grads)
        grad = tf.reduce_mean(grad, 0)

        # Keep in mind that the Variables are redundant because they are shared
        # across towers. So .. we will just return the first tower's pointer to
        # the Variable.
        v = grad_and_vars[0][1]
        grad_and_var = (grad, v)
        average_grads.append(grad_and_var)
    return average_grads


def _get_variables_to_train():
    """Returns a list of variables to train.

    Returns:
      A list of variables to train by the optimizer.
    """
    if FLAGS.trainable_scopes is None:
        return tf.trainable_variables()
    else:
        scopes = [scope.strip() for scope in FLAGS.trainable_scopes.split(',')]

    variables_to_train = []
    for scope in scopes:
        variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
        variables_to_train.extend(variables)
    return variables_to_train


# Definign arbitrary data
num_training_samples = 1000
num_testing_samples = 1000
train_data = {}

train_data = {'mouth': np.random.random_sample(size=(num_training_samples, 9, 60, 100, 1)),
              'speech': np.random.random_sample(size=(num_training_samples, 15, 40, 1, 3))}
test_data = {'mouth': np.random.random_sample(size=(num_testing_samples, 9, 60, 100, 1)),
             'speech': np.random.random_sample(size=(num_testing_samples, 15, 40, 1, 3))}

train_label = np.random.randint(2, size=(num_training_samples, 1))
test_label = np.random.randint(2, size=(num_testing_samples, 1))


# # Uncomment if data standardalization is required and the mean and std vectors have been calculated.
# ############ Get the mean vectors ####################
#
# # mean mouth
# mean_mouth = np.load('/path/to/mean/file/mouth.npy')
# # mean_mouth = np.tile(mean_mouth.reshape(47, 73, 1), (1, 1, 9))
# mean_mouth = mean_mouth[None, :]
# mean_channel_mouth = np.mean(mean_mouth)
#
# # mean speech
# mean_speech = np.load('/path/to/mean/file/speech.npy')
# mean_speech = mean_speech[None, :]
# # mean_channel_speech = np.hstack((
# #     [np.mean(mean_speech[:, :, :, 0])], [np.mean(mean_speech[:, :, :, 1])], [np.mean(mean_speech[:, :, :, 2])]))
#
# ############ Get the std vectors ####################
#
# # mean std
# std_mouth = np.load('/path/to/std/file/mouth.npy')
# std_mouth = np.tile(std_mouth.reshape(60, 100, 1), (1, 1, 9))
# std_mouth = std_mouth[None, :]
#
# # mean speech
# std_speech = np.load('/path/to/std/file/speech.npy')
# std_speech = std_speech[None, :]




def main(_):


    tf.logging.set_verbosity(tf.logging.INFO)

    graph = tf.Graph()
    with graph.as_default(), tf.device('/cpu:0'):
        ######################
        # Config model_deploy#
        ######################

        # required from data
        num_samples_per_epoch = train_data['mouth'].shape[0]
        num_batches_per_epoch = int(num_samples_per_epoch / FLAGS.batch_size)

        num_samples_per_epoch_test = test_data['mouth'].shape[0]
        num_batches_per_epoch_test = int(num_samples_per_epoch_test / FLAGS.batch_size)

        # Create global_step
        global_step = tf.Variable(0, name='global_step', trainable=False)

        #########################################
        # Configure the larning rate. #
        #########################################
        learning_rate = _configure_learning_rate(num_samples_per_epoch, global_step)
        opt = _configure_optimizer(learning_rate)

        ######################
        # Select the network #
        ######################
        is_training = tf.placeholder(tf.bool)

        network_speech_fn = nets_factory.get_network_fn(
            FLAGS.model_speech_name,
            num_classes=2,
            weight_decay=FLAGS.weight_decay,
            is_training=is_training)

        network_mouth_fn = nets_factory.get_network_fn(
            FLAGS.model_mouth_name,
            num_classes=2,
            weight_decay=FLAGS.weight_decay,
            is_training=is_training)

        #####################################
        # Select the preprocessing function #
        #####################################

        # TODO: Do some preprocessing if necessary.

        ##############################################################
        # Create a dataset provider that loads data from the dataset #
        ##############################################################
        # with tf.device(deploy_config.inputs_device()):
        """
        Define the place holders and creating the batch tensor.
        """

        # Mouth spatial set
        INPUT_SEQ_LENGTH = 9
        INPUT_HEIGHT = 60
        INPUT_WIDTH = 100
        INPUT_CHANNELS = 1
        batch_mouth = tf.placeholder(tf.float32, shape=(
            [None, INPUT_SEQ_LENGTH, INPUT_HEIGHT, INPUT_WIDTH, INPUT_CHANNELS]))

        # Speech spatial set
        INPUT_SEQ_LENGTH_SPEECH = 15
        INPUT_HEIGHT_SPEECH = 40
        INPUT_WIDTH_SPEECH = 1
        INPUT_CHANNELS_SPEECH = 3
        batch_speech = tf.placeholder(tf.float32, shape=(
            [None, INPUT_SEQ_LENGTH_SPEECH, INPUT_HEIGHT_SPEECH, INPUT_WIDTH_SPEECH, INPUT_CHANNELS_SPEECH]))

        # Label
        batch_labels = tf.placeholder(tf.uint8, (None, 1))
        margin_imp_tensor = tf.placeholder(tf.float32, ())

        ################################
        ## Feed forwarding to network ##
        ################################
        tower_grads = []
        with tf.variable_scope(tf.get_variable_scope()):
            with tf.device('/gpu:%d' % 0):
                with tf.name_scope('%s_%d' % ('tower', 0)) as scope:
                    """
                    Two distance metric are defined:
                       1 - distance_weighted: which is a weighted average of the distance between two structures.
                       2 - distance_l2: which is the regular l2-norm of the two networks outputs.
                    Place holders

                    """
                    ########################################
                    ######## Outputs of two networks #######
                    ########################################

                    logits_speech, end_points_speech = network_speech_fn(batch_speech)
                    logits_mouth, end_points_mouth = network_mouth_fn(batch_mouth)

                    # # Uncomment if the output embedding is desired to be as |f(x)| = 1
                    # logits_speech = tf.nn.l2_normalize(logits_speech, dim=1, epsilon=1e-12, name=None)
                    # logits_mouth = tf.nn.l2_normalize(logits_mouth, dim=1, epsilon=1e-12, name=None)

                    #################################################
                    ########### Loss Calculation ####################
                    #################################################

                    # ##### Weighted distance using a fully connected layer #####
                    # distance_vector = tf.subtract(logits_speech, logits_mouth,  name=None)
                    # distance_weighted = slim.fully_connected(distance_vector, 1, activation_fn=tf.nn.sigmoid,
                    #                                          normalizer_fn=None,
                    #                                          scope='fc_weighted')

                    ##### Euclidean distance ####
                    distance_l2 = tf.sqrt(
                        tf.reduce_sum(tf.pow(tf.subtract(logits_speech, logits_mouth), 2), 1, keep_dims=True))

                    ##### Contrastive loss ######
                    loss = losses.contrastive_loss(batch_labels, distance_l2, margin_imp=margin_imp_tensor,
                                                   scope=scope)

                    # ##### call the optimizer ######
                    # # TODO: call optimizer object outside of this gpu environment
                    #
                    # Reuse variables for the next tower.
                    tf.get_variable_scope().reuse_variables()

                    # Calculate the gradients for the batch of data on this CIFAR tower.
                    grads = opt.compute_gradients(loss)

                    # Keep track of the gradients across all towers.
                    tower_grads.append(grads)


        # Calculate the mean of each gradient.
        grads = average_gradients(tower_grads)

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # Track the moving averages of all trainable variables.
        MOVING_AVERAGE_DECAY = 0.9999
        variable_averages = tf.train.ExponentialMovingAverage(
            MOVING_AVERAGE_DECAY, global_step)
        variables_averages_op = variable_averages.apply(tf.trainable_variables())

        # Group all updates to into a single train op.
        train_op = tf.group(apply_gradient_op, variables_averages_op)

        #################################################
        ########### Summary Section #####################
        #################################################

        # Gather initial summaries.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        # Add summaries for all end_points.
        for end_point in end_points_speech:
            x = end_points_speech[end_point]
            # summaries.add(tf.summary.histogram('activations_speech/' + end_point, x))
            summaries.add(tf.summary.scalar('sparsity_speech/' + end_point,
                                            tf.nn.zero_fraction(x)))

        for end_point in end_points_mouth:
            x = end_points_mouth[end_point]
            # summaries.add(tf.summary.histogram('activations_mouth/' + end_point, x))
            summaries.add(tf.summary.scalar('sparsity_mouth/' + end_point,
                                            tf.nn.zero_fraction(x)))

        # Add summaries for variables.
        for variable in slim.get_model_variables():
            summaries.add(tf.summary.histogram(variable.op.name, variable))

        # Add to parameters to summaries
        summaries.add(tf.summary.scalar('learning_rate', learning_rate))
        summaries.add(tf.summary.scalar('eval/Loss', loss))
        summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        # Merge all summaries together.
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

    ###########################
    ######## Training #########
    ###########################

    with tf.Session(graph=graph, config=tf.ConfigProto(allow_soft_placement=True)) as sess:

        # Initialization of the network.
        variables_to_restore = slim.get_variables_to_restore()
        saver = tf.train.Saver(variables_to_restore, max_to_keep=20)
        coord = tf.train.Coordinator()
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())

        # Restore the model
        print('Loading from:',FLAGS.checkpoint_dir)
        latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir=FLAGS.checkpoint_dir)
        saver.restore(sess, latest_checkpoint)

        # op to write logs to Tensorboard
        summary_writer = tf.summary.FileWriter(FLAGS.test_dir, graph=graph)

        ###################################################
        ############################ TEST  ################
        ###################################################
        score_dissimilarity_vector = np.zeros((FLAGS.batch_size * num_batches_per_epoch_test, 1))
        label_vector = np.zeros((FLAGS.batch_size * num_batches_per_epoch_test, 1))

        # Loop over all batches
        for i in range(num_batches_per_epoch_test):
            start_idx = i * FLAGS.batch_size
            end_idx = (i + 1) * FLAGS.batch_size
            speech_test, mouth_test, label_test = test_data['speech'][start_idx:end_idx], test_data['mouth'][
                                                                                          start_idx:end_idx], test_label[
                                                                                                              start_idx:end_idx]

            # # # Uncomment if standardalization is needed
            # # mean subtraction if necessary
            # speech_test = (speech_test - mean_speech) / std_speech
            # mouth_test = (mouth_test - mean_mouth) / std_mouth

            # Evaluation phase
            # WARNING: margin_imp_tensor has no effect here but it needs to be there because its tensor required a value to feed in!!
            loss_value, score_dissimilarity, _ = sess.run([loss, distance_l2, is_training],
                                                          feed_dict={is_training: False,
                                                                     margin_imp_tensor: 50,
                                                                     batch_speech: speech_test,
                                                                     batch_mouth: mouth_test,
                                                                     batch_labels: label_test.reshape(
                                                                         [FLAGS.batch_size, 1])})
            if (i + 1) % FLAGS.log_every_n_steps == 0:
                print("TESTING:" + ", Minibatch " + str(
                    i + 1) + " of %d " % num_batches_per_epoch_test)
            score_dissimilarity_vector[start_idx:end_idx] = score_dissimilarity
            label_vector[start_idx:end_idx] = label_test

        ##############################
        ##### K-fold validation ######
        ##############################
        K = 10
        EER = np.zeros((K, 1))
        AUC = np.zeros((K, 1))
        AP = np.zeros((K, 1))
        batch_k_validation = int(label_vector.shape[0] / float(K))

        for i in range(K):
            EER[i, :], AUC[i, :], AP[i, :], fpr, tpr = calculate_roc.calculate_eer_auc_ap(
                label_vector[i * batch_k_validation:(i + 1) * batch_k_validation],
                score_dissimilarity_vector[i * batch_k_validation:(i + 1) * batch_k_validation])

        # Printing Equal Error Rate(EER), Area Under the Curve(AUC) and Average Precision(AP)
        print("TESTING:" +", EER= " + str(np.mean(EER, axis=0)) + ", AUC= " + str(
            np.mean(AUC, axis=0)) + ", AP= " + str(np.mean(AP, axis=0)))


if __name__ == '__main__':
    tf.app.run()


# **Losses**

In [None]:
"""
Contrastive cost
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from tensorflow.contrib.framework.python.ops import add_arg_scope
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn
from tensorflow.python.ops import nn_ops
from tensorflow.python.util.deprecation import deprecated

import tensorflow as tf

# def contrastive_loss(onehot_labels, logits, margin=1, scope=None):
#     """With this definition the loss will be calculated.
#         Args:
#           y: The labels.
#           distance: The distance vector between the output features..
#           batch_size: the batch size is necessary because the loss calculation would be over each batch.
#         Returns:
#           The total loss.
#     """
#     with ops.name_scope(scope, "contrastive_loss", [onehot_labels, logits]) as scope:
#         # logits.get_shape().assert_is_compatible_with(onehot_labels.get_shape())
#
#         onehot_labels = math_ops.cast(onehot_labels, logits.dtype)
#
#         term_1 = tf.multiply(onehot_labels, tf.square(logits))[:,0:1]
#         term_2 = tf.multiply(onehot_labels, tf.square(tf.maximum((margin - logits), 0)))[:,1:]
#
#         # Contrastive
#         Contrastive_Loss = tf.add(term_1, term_2) / 2
#         loss = tf.losses.compute_weighted_loss(Contrastive_Loss, scope=scope)
#
#         return tf.losses.compute_weighted_loss(Contrastive_Loss, scope=scope)

def contrastive_loss(labels, logits, margin_gen=0, margin_imp=1, scope=None):
    """With this definition the loss will be calculated.
        Args:
          y: The labels.
          distance: The distance vector between the output features..
          batch_size: the batch size is necessary because the loss calculation would be over each batch.
        Returns:
          The total loss.
    """
    with ops.name_scope(scope, "contrastive_loss", [labels, logits]) as scope:
        # logits.get_shape().assert_is_compatible_with(onehot_labels.get_shape())

        labels = math_ops.cast(labels, logits.dtype)

        # term_1 = tf.multiply(labels, tf.square(logits))
        term_1 = tf.multiply(labels, tf.square(tf.maximum((logits - margin_gen), 0)))
        term_2 = tf.multiply(1 - labels, tf.square(tf.maximum((margin_imp - logits), 0)))

        # Contrastive
        Contrastive_Loss = tf.add(term_1, term_2) / 2
        loss = tf.losses.compute_weighted_loss(Contrastive_Loss, scope=scope)

        return loss


# def contrastive_loss(onehot_labels, logits, batch_size, margin=1):
#     """With this definition the loss will be calculated.
#         Args:
#           y: The labels.
#           distance: The distance vector between the output features..
#           batch_size: the batch size is necessary because the loss calculation would be over each batch.
#         Returns:
#           The total loss.
#     """
#     with ops.name_scope(scope, "contrastive_loss", [onehot_labels, logits]) as scope:
#         logits.get_shape().assert_is_compatible_with(onehot_labels.get_shape())
#
#         onehot_labels = math_ops.cast(onehot_labels, logits.dtype)
#
#         term_1 = tf.multiply(onehot_labels, tf.square(distance))[:,0:1]
#         term_2 = tf.multiply(onehot_labels, tf.square(tf.maximum((margin - distance), 0)))[:,1:]
#
#         # Contrastive
#         Contrastive_Loss = tf.add(term_1, term_2) / batch_size / 2
#         tf.add_to_collection('losses', Contrastive_Loss)
#
#         return tf.add_n(tf.get_collection('losses'), name='total_loss')



# **Lipread mouth**

In [None]:
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains model definitions for versions of the Oxford VGG network.

These model definitions were introduced in the following technical report:

  Very Deep Convolutional Networks For Large-Scale Image Recognition
  Karen Simonyan and Andrew Zisserman
  arXiv technical report, 2015
  PDF: http://arxiv.org/pdf/1409.1556.pdf
  ILSVRC 2014 Slides: http://www.robots.ox.ac.uk/~karen/pdf/ILSVRC_2014.pdf
  CC-BY-4.0

More information can be obtained from the VGG website:
www.robots.ox.ac.uk/~vgg/research/very_deep/

Usage:
  with slim.arg_scope(vgg.vgg_arg_scope()):
    outputs, end_points = vgg.vgg_a(inputs)

  with slim.arg_scope(vgg.vgg_arg_scope()):
    outputs, end_points = vgg.vgg_16(inputs)

@@vgg_a
@@vgg_16
@@vgg_19
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

slim = tf.contrib.slim
LSTM_status = False


def lipread_mouth_arg_scope(is_training, weight_decay=0.0005):
  """Defines the VGG arg scope.

  Args:
    weight_decay: The l2 regularization coefficient.

  Returns:
    An arg_scope.
  """
  # Add normalizer_fn=slim.batch_norm if Batch Normalization is required!
  with slim.arg_scope([slim.conv3d, slim.fully_connected],
                      activation_fn=None,
                      weights_initializer=tf.contrib.layers.variance_scaling_initializer(factor=1.0, mode='FAN_AVG'),
                      weights_regularizer=slim.l2_regularizer(weight_decay),
                      normalizer_fn=slim.batch_norm,
                      biases_initializer=tf.zeros_initializer()):
    with slim.arg_scope([slim.conv3d], padding='SAME') as arg_sc:
      return arg_sc

def PReLU(input,scope):
  """
  Similar to TFlearn implementation
  :param input: input of the PReLU which is output of a layer.
  :return: The output.
  """
  alphas = tf.get_variable(scope, input.get_shape()[-1],
                       initializer=tf.constant_initializer(0.0),
                        dtype=tf.float32)

  return tf.nn.relu(input) + alphas * (input - abs(input)) * 0.5


def mouth_cnn_lstm(inputs,
          num_classes=1000,
          is_training=True,
          dropout_keep_prob=0.8,
          spatial_squeeze=True,
          scope='mouth_cnn'):
  """Oxford Net VGG 11-Layers version A Example.

  Note: All the fully_connected layers have been transformed to conv3d layers.
        To use in classification mode, resize input to 224x224.

  Args:
    inputs: a tensor of size [batch_size, height, width, channels].
    num_classes: number of predicted classes.
    is_training: whether or not the model is being trained.
    dropout_keep_prob: the probability that activations are kept in the dropout
      layers during training.
    spatial_squeeze: whether or not should squeeze the spatial dimensions of the
      outputs. Useful to remove unnecessary dimensions for classification.
    scope: Optional scope for the variables.

  Returns:
    the last op containing the log predictions and end_points dict.
  """

  end_points = {}
  with tf.variable_scope(scope, 'mouth_cnn', [inputs]) as sc:
    # end_points_collection = sc.name + '_end_points'
    # # Collect outputs for conv3d, fully_connected and max_pool2d.
    # with slim.arg_scope([slim.conv3d, slim.max_pool2d],
    #                     outputs_collections=end_points_collection):

    ##### Convolution Section #####
    # Tensor("batch:1", shape=(?, 9, 60, 100, 1), dtype=float32, device=/device:CPU:0)
    inputs = tf.to_float(inputs)
    net = slim.repeat(inputs, 1, slim.conv3d, 16, [1, 3, 3], scope='conv1')
    net = PReLU(net, 'conv1_activation')
    net = tf.nn.max_pool3d(net, strides=[1, 1, 2, 2, 1], ksize=[1, 1, 3, 3, 1],padding='VALID', name='pool1')
    # net = slim.max_pool2d(net, [3, 3], scope='pool1')
    net = slim.repeat(net, 1, slim.conv3d, 32, [1, 3, 3], scope='conv2')
    net = PReLU(net, 'conv2_activation')
    net = tf.nn.max_pool3d(net, strides=[1, 1, 2, 2, 1], ksize=[1, 1, 3, 3, 1], padding='VALID', name='pool2')

    net = slim.conv3d(net, 64, [1, 3, 3], scope='conv31')
    net = PReLU(net, 'conv31_activation')
    net = slim.conv3d(net, 64, [1, 3, 3], scope='conv32')
    net = PReLU(net, 'conv32_activation')

    net = tf.nn.max_pool3d(net, strides=[1, 1, 2, 2, 1], ksize=[1, 1, 3, 3, 1], padding='VALID', name='pool3')
    net = slim.repeat(net, 1, slim.conv3d, 128, [1, 3, 3], scope='conv4')
    net = PReLU(net, 'conv4_activation')
    net = tf.nn.max_pool3d(net, strides=[1, 1, 2, 2, 1], ksize=[1, 1, 3, 3, 1], padding='VALID', name='pool4')

    ##### FC section #####
    # Use conv3d instead of fully_connected layers.
    net = slim.repeat(net, 1, slim.conv3d, 256, [1, 2, 5], padding='VALID', scope='fc5')
    net = PReLU(net, 'fc5_activation')
    # net = PReLU(net)
    # net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
    #                    scope='dropout5')


    if LSTM_status:

      net = slim.conv3d(net, 64, [1, 1, 1], padding='VALID', activation_fn=None, normalizer_fn=None, scope='fc6')
      net = PReLU(net, 'fc6_activation')

      # Tensor("tower_0/speech_cnn/fc6/squeezed:0", shape=(?, 9, 128), dtype=float32, device=/device:GPU:0)
      net = tf.squeeze(net, [2, 3], name='fc6/squeezed')

    else:
      net = slim.conv3d(net, 64, [9, 1, 1],padding='VALID', activation_fn=None, normalizer_fn=None, scope='fc5')

      # Tensor("tower_0/speech_cnn/fc6/squeezed:0", shape=(?, 9, 128), dtype=float32, device=/device:GPU:0)
      net = tf.squeeze(net, [1, 2, 3], name='fc6/squeezed')

    if LSTM_status:
      ##### LSTM-1 #####
      # use sequence_length=X_lengths argument in tf.nn.dynamic_rnn if necessary.
      cell_1 = tf.contrib.rnn.core_rnn_cell.LSTMCell(num_units=128, state_is_tuple=True)
      outputs, last_states = tf.nn.dynamic_rnn(
        cell=cell_1,
        dtype=tf.float32,
        inputs=net,
        scope='LSTM-mouth')
      net = last_states.h

    return net, end_points

# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains model definitions for versions of the Oxford VGG network.

These model definitions were introduced in the following technical report:

  Very Deep Convolutional Networks For Large-Scale Image Recognition
  Karen Simonyan and Andrew Zisserman
  arXiv technical report, 2015
  PDF: http://arxiv.org/pdf/1409.1556.pdf
  ILSVRC 2014 Slides: http://www.robots.ox.ac.uk/~karen/pdf/ILSVRC_2014.pdf
  CC-BY-4.0

More information can be obtained from the VGG website:
www.robots.ox.ac.uk/~vgg/research/very_deep/

Usage:
  with slim.arg_scope(vgg.vgg_arg_scope()):
    outputs, end_points = vgg.vgg_a(inputs)

  with slim.arg_scope(vgg.vgg_arg_scope()):
    outputs, end_points = vgg.vgg_16(inputs)

@@vgg_a
@@vgg_16
@@vgg_19
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import sys

slim = tf.contrib.slim
LSTM_status = False



def lipread_speech_arg_scope(is_training, weight_decay=0.0005,):
  """Defines the VGG arg scope.

  Args:
    weight_decay: The l2 regularization coefficient.

  Returns:
    An arg_scope.
  """
  # Add normalizer_fn=slim.batch_norm if Batch Normalization is required!
  with slim.arg_scope([slim.conv3d, slim.fully_connected],
                      activation_fn=None,
                      weights_initializer=tf.contrib.layers.variance_scaling_initializer(factor=1.0, mode='FAN_AVG'),
                      weights_regularizer=slim.l2_regularizer(weight_decay),
                      normalizer_fn=slim.batch_norm,
                      biases_initializer=tf.zeros_initializer()):
    with slim.arg_scope([slim.conv3d], padding='VALID') as arg_sc:
      return arg_sc

def PReLU(input,scope):
  """
  Similar to TFlearn implementation
  :param input: input of the PReLU which is output of a layer.
  :return: The output.
  """
  alphas = tf.get_variable(scope, input.get_shape()[-1],
                       initializer=tf.constant_initializer(0.0),
                        dtype=tf.float32)

  return tf.nn.relu(input) + alphas * (input - abs(input)) * 0.5

end_points = {}

def speech_cnn_lstm(inputs,
          num_classes=1000,
          is_training=True,
          dropout_keep_prob=0.8,
          spatial_squeeze=True,
          scope='speech_cnn'):
  """Oxford Net VGG 11-Layers version A Example.

  Note: All the fully_connected layers have been transformed to conv3d layers.
        To use in classification mode, resize input to 224x224.

  Args:
    inputs: a tensor of size [batch_size, height, width, channels].
    num_classes: number of predicted classes.
    is_training: whether or not the model is being trained.
    dropout_keep_prob: the probability that activations are kept in the dropout
      layers during training.
    spatial_squeeze: whether or not should squeeze the spatial dimensions of the
      outputs. Useful to remove unnecessary dimensions for classification.
    scope: Optional scope for the variables.

  Returns:
    the last op containing the log predictions and end_points dict.
  """
  with tf.variable_scope(scope, 'speech_cnn', [inputs]) as sc:
    ##### CNN part #####
    # Tensor("batch:0", shape=(?, 15, 40, 1, 3), dtype=float32, device=/device:CPU:0)
    inputs = tf.to_float(inputs)
    net = slim.repeat(inputs, 1, slim.conv3d, 16, [1, 5, 1], scope='conv1')
    net = PReLU(net, 'conv1_activation')
    net = tf.nn.max_pool3d(net, strides=[1, 1, 2, 1, 1], ksize=[1, 1, 2, 1, 1], padding='VALID', name='pool1')

    net = slim.conv3d(net, 32, [1, 4, 1], scope='conv21')
    net = PReLU(net, 'conv21_activation')
    net = slim.conv3d(net, 32, [1, 4, 1], scope='conv22')
    net = PReLU(net, 'conv22_activation')
    net = tf.nn.max_pool3d(net, strides=[1, 1, 2, 1, 1], ksize=[1, 1, 2, 1, 1], padding='VALID', name='pool2')

    net = slim.conv3d(net, 64, [1, 3, 1], scope='conv31')
    net = PReLU(net, 'conv31_activation')
    net = slim.conv3d(net, 64, [1, 3, 1], scope='conv32')
    net = PReLU(net, 'conv32_activation')

    ##### FC part #####
    # Use conv3d instead of fully_connected layers.
    net = slim.conv3d(net, 128, [1, 2, 1], padding='VALID', scope='fc4')
    net = PReLU(net, 'fc4_activation')
    # net = PReLU(net)
    # net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
    #                    scope='dropout4')

    if LSTM_status:

      net = slim.conv3d(net, 64, [1, 1, 1], padding='VALID', activation_fn=None, normalizer_fn=None, scope='fc5')
      net = PReLU(net, 'fc5_activation')

      # Tensor("tower_0/speech_cnn/fc6/squeezed:0", shape=(?, 9, 128), dtype=float32, device=/device:GPU:0)
      net = tf.squeeze(net, [2, 3], name='fc5/squeezed')

    else:
      net = slim.conv3d(net, 64, [15, 1, 1],padding='VALID', activation_fn=None, normalizer_fn=None, scope='fc5')

      # Tensor("tower_0/speech_cnn/fc6/squeezed:0", shape=(?, 9, 128), dtype=float32, device=/device:GPU:0)
      net = tf.squeeze(net, [1, 2, 3], name='fc5/squeezed')

    if LSTM_status:
      ##### LSTM-1 #####
      # use sequence_length=X_lengths argument in tf.nn.dynamic_rnn if necessary.
      cell_1 = tf.contrib.rnn.core_rnn_cell.LSTMCell(num_units=128, state_is_tuple=True)
      outputs, last_states = tf.nn.dynamic_rnn(
        cell=cell_1,
        dtype=tf.float32,
        inputs=net,
        scope='LSTM-speech')
      net = last_states.h

    return net, end_points

# **Lipread Speech**

In [None]:
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains model definitions for versions of the Oxford VGG network.

These model definitions were introduced in the following technical report:

  Very Deep Convolutional Networks For Large-Scale Image Recognition
  Karen Simonyan and Andrew Zisserman
  arXiv technical report, 2015
  PDF: http://arxiv.org/pdf/1409.1556.pdf
  ILSVRC 2014 Slides: http://www.robots.ox.ac.uk/~karen/pdf/ILSVRC_2014.pdf
  CC-BY-4.0

More information can be obtained from the VGG website:
www.robots.ox.ac.uk/~vgg/research/very_deep/

Usage:
  with slim.arg_scope(vgg.vgg_arg_scope()):
    outputs, end_points = vgg.vgg_a(inputs)

  with slim.arg_scope(vgg.vgg_arg_scope()):
    outputs, end_points = vgg.vgg_16(inputs)

@@vgg_a
@@vgg_16
@@vgg_19
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import sys

slim = tf.contrib.slim
LSTM_status = False



def lipread_speech_arg_scope(is_training, weight_decay=0.0005,):
  """Defines the VGG arg scope.

  Args:
    weight_decay: The l2 regularization coefficient.

  Returns:
    An arg_scope.
  """
  # Add normalizer_fn=slim.batch_norm if Batch Normalization is required!
  with slim.arg_scope([slim.conv3d, slim.fully_connected],
                      activation_fn=None,
                      weights_initializer=tf.contrib.layers.variance_scaling_initializer(factor=1.0, mode='FAN_AVG'),
                      weights_regularizer=slim.l2_regularizer(weight_decay),
                      normalizer_fn=slim.batch_norm,
                      biases_initializer=tf.zeros_initializer()):
    with slim.arg_scope([slim.conv3d], padding='VALID') as arg_sc:
      return arg_sc

def PReLU(input,scope):
  """
  Similar to TFlearn implementation
  :param input: input of the PReLU which is output of a layer.
  :return: The output.
  """
  alphas = tf.get_variable(scope, input.get_shape()[-1],
                       initializer=tf.constant_initializer(0.0),
                        dtype=tf.float32)

  return tf.nn.relu(input) + alphas * (input - abs(input)) * 0.5

end_points = {}

def speech_cnn_lstm(inputs,
          num_classes=1000,
          is_training=True,
          dropout_keep_prob=0.8,
          spatial_squeeze=True,
          scope='speech_cnn'):
  """Oxford Net VGG 11-Layers version A Example.

  Note: All the fully_connected layers have been transformed to conv3d layers.
        To use in classification mode, resize input to 224x224.

  Args:
    inputs: a tensor of size [batch_size, height, width, channels].
    num_classes: number of predicted classes.
    is_training: whether or not the model is being trained.
    dropout_keep_prob: the probability that activations are kept in the dropout
      layers during training.
    spatial_squeeze: whether or not should squeeze the spatial dimensions of the
      outputs. Useful to remove unnecessary dimensions for classification.
    scope: Optional scope for the variables.

  Returns:
    the last op containing the log predictions and end_points dict.
  """
  with tf.variable_scope(scope, 'speech_cnn', [inputs]) as sc:
    ##### CNN part #####
    # Tensor("batch:0", shape=(?, 15, 40, 1, 3), dtype=float32, device=/device:CPU:0)
    inputs = tf.to_float(inputs)
    net = slim.repeat(inputs, 1, slim.conv3d, 16, [1, 5, 1], scope='conv1')
    net = PReLU(net, 'conv1_activation')
    net = tf.nn.max_pool3d(net, strides=[1, 1, 2, 1, 1], ksize=[1, 1, 2, 1, 1], padding='VALID', name='pool1')

    net = slim.conv3d(net, 32, [1, 4, 1], scope='conv21')
    net = PReLU(net, 'conv21_activation')
    net = slim.conv3d(net, 32, [1, 4, 1], scope='conv22')
    net = PReLU(net, 'conv22_activation')
    net = tf.nn.max_pool3d(net, strides=[1, 1, 2, 1, 1], ksize=[1, 1, 2, 1, 1], padding='VALID', name='pool2')

    net = slim.conv3d(net, 64, [1, 3, 1], scope='conv31')
    net = PReLU(net, 'conv31_activation')
    net = slim.conv3d(net, 64, [1, 3, 1], scope='conv32')
    net = PReLU(net, 'conv32_activation')

    ##### FC part #####
    # Use conv3d instead of fully_connected layers.
    net = slim.conv3d(net, 128, [1, 2, 1], padding='VALID', scope='fc4')
    net = PReLU(net, 'fc4_activation')
    # net = PReLU(net)
    # net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
    #                    scope='dropout4')

    if LSTM_status:

      net = slim.conv3d(net, 64, [1, 1, 1], padding='VALID', activation_fn=None, normalizer_fn=None, scope='fc5')
      net = PReLU(net, 'fc5_activation')

      # Tensor("tower_0/speech_cnn/fc6/squeezed:0", shape=(?, 9, 128), dtype=float32, device=/device:GPU:0)
      net = tf.squeeze(net, [2, 3], name='fc5/squeezed')

    else:
      net = slim.conv3d(net, 64, [15, 1, 1],padding='VALID', activation_fn=None, normalizer_fn=None, scope='fc5')

      # Tensor("tower_0/speech_cnn/fc6/squeezed:0", shape=(?, 9, 128), dtype=float32, device=/device:GPU:0)
      net = tf.squeeze(net, [1, 2, 3], name='fc5/squeezed')

    if LSTM_status:
      ##### LSTM-1 #####
      # use sequence_length=X_lengths argument in tf.nn.dynamic_rnn if necessary.
      cell_1 = tf.contrib.rnn.core_rnn_cell.LSTMCell(num_units=128, state_is_tuple=True)
      outputs, last_states = tf.nn.dynamic_rnn(
        cell=cell_1,
        dtype=tf.float32,
        inputs=net,
        scope='LSTM-speech')
      net = last_states.h

    return net, end_points



# **Nets factory**

In [None]:
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains a factory for building various models."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools

import tensorflow as tf
from nets import lipread_mouth
from nets import lipread_speech

slim = tf.contrib.slim

networks_map = {'lipread_mouth':lipread_mouth.mouth_cnn_lstm,
                'lipread_speech':lipread_speech.speech_cnn_lstm,

               }

arg_scopes_map = {'lipread_mouth':lipread_mouth.lipread_mouth_arg_scope,
                  'lipread_speech':lipread_speech.lipread_speech_arg_scope,
                 }


def get_network_fn(name, num_classes, weight_decay=0.0, is_training=False):
  """Returns a network_fn such as `logits, end_points = network_fn(images)`.

  Args:
    name: The name of the network.
    num_classes: The number of classes to use for classification.
    weight_decay: The l2 coefficient for the model weights.
    is_training: `True` if the model is being used for training and `False`
      otherwise.

  Returns:
    network_fn: A function that applies the model to a batch of images. It has
      the following signature:
        logits, end_points = network_fn(images)
  Raises:
    ValueError: If network `name` is not recognized.
  """
  if name not in networks_map:
    raise ValueError('Name of network unknown %s' % name)

  func = networks_map[name]
  @functools.wraps(func)
  def network_fn(images):
    arg_scope = arg_scopes_map[name](is_training, weight_decay=weight_decay)
    with slim.arg_scope(arg_scope):
      return func(images, num_classes, is_training=is_training)
  if hasattr(func, 'default_image_size'):
    network_fn.default_image_size = func.default_image_size

  return network_fn


# **Siamese Architecture for face recognition**

In [None]:
# Siamese Architecture for face recognition

import random
import numpy as np
import time
import tensorflow as tf
import math
import pdb
import sys
import scipy.io as sio
from sklearn import *

def calculate_eer_auc_ap(label,distance):

    fpr, tpr, thresholds = metrics.roc_curve(label, -distance, pos_label=1)
    AUC = metrics.roc_auc_score(label, -distance, average='macro', sample_weight=None)
    AP = metrics.average_precision_score(label, -distance, average='macro', sample_weight=None)

    # Calculating EER
    intersect_x = fpr[np.abs(fpr - (1 - tpr)).argmin(0)]
    EER = intersect_x

    return EER,AUC,AP,fpr, tpr



In [None]:
# Siamese Architecture for face recognition

import random
import numpy as np
import time
import tensorflow as tf
import math
import pdb
import sys
import scipy.io as sio
from sklearn import *
import matplotlib.pyplot as plt

def Plot_HIST_Fn(label,distance, phase, num_bins = 50):

    dissimilarity = distance[:]
    gen_dissimilarity_original = []
    imp_dissimilarity_original = []
    for i in range(len(label)):
        if label[i] == 1:
            gen_dissimilarity_original.append(dissimilarity[i])
        else:
            imp_dissimilarity_original.append(dissimilarity[i])

    bins = np.linspace(np.amin(distance), np.amax(distance), num_bins)
    fig = plt.figure()
    plt.hist(gen_dissimilarity_original, bins, alpha=0.5, facecolor='blue', normed=False, label='gen_dist_original')
    plt.hist(imp_dissimilarity_original, bins, alpha=0.5, facecolor='red', normed=False, label='imp_dist_original')
    plt.legend(loc='upper right')
    plt.title(phase + '_' + 'OriginalFeatures_Histogram.jpg')
    plt.show()
    fig.savefig(phase + '_' + 'OriginalFeatures_Histogram.jpg')


In [None]:
# Siamese Architecture for face recognition

import random
import numpy as np
import time
import tensorflow as tf
import math
import pdb
import sys
import scipy.io as sio
from sklearn import *
import matplotlib.pyplot as plt

def Plot_PR_Fn(label,distance,phase):

    precision, recall, thresholds = metrics.precision_recall_curve(label, -distance, pos_label=1, sample_weight=None)
    AP = metrics.average_precision_score(label, -distance, average='macro', sample_weight=None)

    # AP(average precision) calculation.
    # This score corresponds to the area under the precision-recall curve.
    print("AP = ", float(("{0:.%ie}" % 1).format(AP)))

    # Plot the ROC
    fig = plt.figure()
    ax = fig.gca()
    lines = plt.plot(recall, precision, label='ROC Curve')
    plt.setp(lines, linewidth=2, color='r')
    ax.set_xticks(np.arange(0, 1.1, 0.1))
    ax.set_yticks(np.arange(0, 1.1, 0.1))
    plt.title(phase + '_' + 'PR.jpg')
    plt.xlabel('Recall')
    plt.ylabel('Precision')

    # Cutting the floating number
    AP = '%.2f' % AP

    # Setting text to plot
    # plt.text(0.5, 0.5, 'AP = ' + str(AP), fontdict=None)
    plt.grid()
    plt.show()
    fig.savefig(phase + '_' + 'PR.jpg')



In [None]:
# Siamese Architecture for face recognition

import random
import numpy as np
import time
import tensorflow as tf
import math
import pdb
import sys
import scipy.io as sio
from sklearn import *
import matplotlib.pyplot as plt


def Plot_ROC_Fn(label,distance,phase):

    fpr, tpr, thresholds = metrics.roc_curve(label, -distance, pos_label=1)
    AUC = metrics.roc_auc_score(label, -distance, average='macro', sample_weight=None)
    # AP = metrics.average_precision_score(label, -distance, average='macro', sample_weight=None)

    # Calculating EER
    intersect_x = fpr[np.abs(fpr - (1 - tpr)).argmin(0)]
    EER = intersect_x
    print("EER = ", float(("{0:.%ie}" % 1).format(intersect_x)))

    # AUC(area under the curve) calculation
    print("AUC = ", float(("{0:.%ie}" % 1).format(AUC)))

    # # AP(average precision) calculation.
    # # This score corresponds to the area under the precision-recall curve.
    # print("AP = ", float(("{0:.%ie}" % 1).format(AP)))

    # Plot the ROC
    fig = plt.figure()
    ax = fig.gca()
    lines = plt.plot(fpr, tpr, label='ROC Curve')
    plt.setp(lines, linewidth=2, color='r')
    ax.set_xticks(np.arange(0, 1.1, 0.1))
    ax.set_yticks(np.arange(0, 1.1, 0.1))
    plt.title(phase + '_' + 'ROC.jpg')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

    # # Cutting the floating number
    # AUC = '%.2f' % AUC
    # EER = '%.2f' % EER
    # # AP = '%.2f' % AP
    #
    # # Setting text to plot
    # # plt.text(0.5, 0.6, 'AP = ' + str(AP), fontdict=None)
    # plt.text(0.5, 0.5, 'AUC = ' + str(AUC), fontdict=None)
    # plt.text(0.5, 0.4, 'EER = ' + str(EER), fontdict=None)
    plt.grid()
    plt.show()
    fig.savefig(phase + '_' + 'ROC.jpg')