In [1]:
import tensorflow as tf

import numpy as np
import pathlib
import pandas as pd
from PIL import Image
from matplotlib import pyplot as plt

import multiprocessing
import tables
import tqdm
import math

from keras.preprocessing.sequence import pad_sequences
from keras_radam.training import RAdamOptimizer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_distances

from transformer.model import transformer as transformer_main
from transformer.model import model_params

from IPython.core.display import display, HTML

#3rd party tool for exporting best models on validation
from best_checkpoint_copier import *

from aggregator_utils.helpers import *
from aggregator_utils.losses import *
from aggregator_utils import config 
from aggregator_utils.plotting_utlis import *
from aggregator_utils.aggregator import model_fn_transformer

# Switch TensorFlow to eager execution for this notebook session.
tf.enable_eager_execution()
# Show INFO-level TensorFlow log messages in the cell output.
tf.logging.set_verbosity(tf.logging.INFO)

# Render matplotlib figures inline, using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'


# Inject CSS that stretches the notebook container to the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))

Using TensorFlow backend.
W1122 14:05:37.809302 140390855927680 deprecation_wrapper.py:119] From /home/igorprotsenko/utils/models/official/transformer/model/attention_layer.py:24: The name tf.layers.Layer is deprecated. Please use tf.compat.v1.layers.Layer instead.



# Configuration settings

In [2]:
from transformer.model import transformer as transformer_main
from transformer.model import model_params

# Take a private (shallow) copy of the library defaults. The notebook overrides
# several encoder entries afterwards; without the copy those writes would land
# in the shared `model_params.BASE_PARAMS` object itself and survive re-running
# this cell.
encoder_params = model_params.BASE_PARAMS.copy()
# Aggregator hyper-parameters and batch-sampling settings from the project config.
aggregator_params = config.MIRSAAN_PARAMS
batching_params = config.MULTI_SAMPLER_PARAMS

In [6]:
# Replace the stock encoder settings with the values configured for the aggregator.
encoder_params.update({
    'num_hidden_layers': aggregator_params["encoder_num_hidden_layers"],
    'return_attention_scores': aggregator_params["encoder_return_attention_scores"],
    'attention_dropout': aggregator_params["encoder_self_attention_dropout"],
    'relu_dropout': aggregator_params["encoder_relu_dropout"],
    'hidden_size': aggregator_params["embeddings_dim"],
    'use_positional_encoding': aggregator_params["encoder_use_positional_encoding"],
})

In [7]:
# Folder holding the unioned multi-identity dataset, plus the two files read
# from it below: the annotation table and the HDF5 embeddings store.
# NOTE(review): the '.clsv' suffix looks unusual — confirm it matches the file on disk.
MULTIIDENTITY_PATH = pathlib.Path('../../datasets/multiIdentity/annotations/')
MANNOTATIONS_PATH = MULTIIDENTITY_PATH.joinpath('unioned_annotations.clsv')
MEMBEDDINGS_PATH = MULTIIDENTITY_PATH.joinpath('unioned.hdf5')

In [8]:
# Read the annotation table; its 'index' column becomes the DataFrame index.
metadata = pd.read_csv(
    MANNOTATIONS_PATH,
    index_col='index',
)

  interactivity=interactivity, compiler=compiler, result=result)
  mask |= (ar1 == a)


In [9]:
# Copy the whole `data` node of the HDF5 file into memory; the file is closed
# again as soon as the `with` block exits.
with tables.open_file(MEMBEDDINGS_PATH, mode='r') as file:
    data = file.root.data[:]

In [10]:
# The dataset namespace is the fourth '/'-separated component of each file path.
metadata['dataset_namespace'] = metadata['file_path'].str.split('/').str.get(3)
# Session ids are handled as strings from here on.
metadata['session_id'] = metadata['session_id'].astype(str)

In [11]:
# Number of distinct sessions recorded for every identity.
stats = (
    metadata
    .groupby('identity_name')
    .agg({'session_id': 'nunique'})
)

# Names of the identities that appear in at least two sessions.
identities_filtered = stats.loc[stats['session_id'] >= 2].index

In [12]:
# Keep only rows of identities that passed the session-count filter, then
# collect the distinct identity names that remain.
metadata = metadata.loc[metadata['identity_name'].isin(identities_filtered)]
unique_identities = metadata['identity_name'].unique()

In [13]:
# Hold out 10% of the identities for evaluation; the fixed seed keeps the split stable.
train_identities, test_identities = train_test_split(
    unique_identities,
    test_size=0.1,
    random_state=42,
)

In [14]:
# Number of identities in the training split; passed to get_arcface_logits as
# the class count when the training loss is built.
NUM_TRAIN_CLASSES = len(train_identities)

In [15]:
# Build the identity_name -> integer label lookup for both splits.
train_names_pd = pd.DataFrame(train_identities,
                              columns=['identity_name'])
eval_names_pd = pd.DataFrame(test_identities,
                             columns=['identity_name'])

for df in [train_names_pd, eval_names_pd]:
    # `cat.codes` comes back in the narrowest integer dtype that fits the number
    # of categories (e.g. int8). Widen it to int32 up front so that the offset
    # added to the evaluation labels below cannot overflow that narrow dtype and
    # the label dtype no longer depends on how many identities a split has.
    df['identity_reference'] = (df['identity_name']
                                .astype('category')
                                .cat.codes
                                .astype(np.int32))

# Shift evaluation labels past the training label range so the two never collide.
eval_names_pd['identity_reference'] += len(train_identities)
names_pd = pd.concat((train_names_pd, eval_names_pd))

In [16]:
# Attach the integer identity label to every metadata row. The index is moved
# into a column for the merge and restored afterwards so row ids are preserved.
metadata = (
    metadata
    .reset_index()
    .merge(names_pd, how="left", on='identity_name')
    .set_index('index')
)

In [17]:
# One row per (identity, session) pair; `file_path` holds the number of
# non-null file paths recorded for that pair.
sessions_stats = (
    metadata
    .groupby(['identity_name', 'session_id'])
    .agg({'file_path': 'count'})
    .reset_index()
)

In [18]:
# Split the per-session statistics along the identity-level train/test split.
train_sessions_stats = sessions_stats.loc[sessions_stats['identity_name'].isin(train_identities)]
eval_session_stats = sessions_stats.loc[sessions_stats['identity_name'].isin(test_identities)]

# Helper functions to create multi-identity video

In [23]:
def sample_sessions_per_identity(stats,
                                 num_identities = batching_params['identities_limit'],
                                 num_frames = batching_params['generated_session_len'],
                                 metadata = metadata):
    """
    Sample the frames of one synthetic multi-identity video.

    A random number of identities is drawn, one session is picked for each of
    them, and `num_frames` frames are sampled from the union of those sessions.

    Args:
        stats: DataFrame with `identity_name`, `session_id` and `file_path`
            columns, where `file_path` holds the frame count of the session.
        num_identities: upper bound passed to `np.random.randint`.
            NOTE(review): that bound is exclusive, so between 1 and
            `num_identities - 1` identities are drawn and a value <= 1 raises
            ValueError — confirm this is the intended meaning of the limit.
        num_frames: number of frames in the generated video.
        metadata: frame-level table with `identity_name` and `session_id`
            columns. The default is bound when this function is defined.

    Returns:
        DataFrame with `num_frames` rows of `metadata` plus a `position` column
        (running frame number within each identity, in index order), sorted by
        `identity_name` and `position`.
    """
    num_identities = np.random.randint(1,
                                       num_identities)
    # Minimum session length that guarantees the chosen sessions together hold
    # at least `num_frames` frames.
    identity_lim = np.ceil(num_frames/num_identities)

    sessions_filtered = stats[stats.file_path>=identity_lim]

    sample_identities = np.random.choice(sessions_filtered.identity_name.unique(),
                                         size=num_identities,
                                         replace=False)

    filtered_sessions = sessions_filtered[sessions_filtered.identity_name.isin(sample_identities)]

    # One random session per sampled identity.
    sampled_sessions = filtered_sessions.groupby('identity_name',
                                                 group_keys=False).apply(lambda x: x.sample(1))

    # Cheap pre-filter on the session id alone ...
    candidates = metadata[metadata.session_id.isin(sampled_sessions.session_id)]

    # ... followed by an exact (identity, session) match. Session ids are only
    # grouped per identity in `stats`, so the same id may belong to several
    # identities; matching on the id alone would pull frames of identities that
    # were never sampled into the video.
    sampled_pairs = set(zip(sampled_sessions.identity_name,
                            sampled_sessions.session_id))
    pair_mask = np.array([pair in sampled_pairs
                          for pair in zip(candidates.identity_name,
                                          candidates.session_id)],
                         dtype=bool)
    retreived_indecies = candidates.loc[pair_mask]

    retreived_indecies = retreived_indecies.sample(num_frames).sort_index()

    retreived_indecies["position"] = retreived_indecies.groupby('identity_name',
                                                                group_keys=False).cumcount()

    # (identity_name, position) pairs are unique, so this sort fully determines
    # the row order; no additional shuffle is needed beforehand.
    return retreived_indecies.sort_values(['identity_name','position'])

In [24]:
def generate_batch(stats, data=data):
    """
    Endlessly yield `(features, labels)` pairs, one synthetic video per item.

    Args:
        stats: per-session statistics forwarded to `sample_sessions_per_identity`.
        data: embeddings container indexed with the sampled rows' index values.
            The default is bound when this function is defined.

    Yields:
        A dict with the keys "embeddings", "label_mask" and "file_path",
        together with the identity labels cast to int32.
    """
    while True:
        session_frames = sample_sessions_per_identity(stats)
        identity_labels = session_frames.identity_reference.values
        features = {
            "embeddings": data[session_frames.index.values],
            "label_mask": generate_flattened_label_mask(identity_labels),
            "file_path": session_frames.file_path.values,
        }
        yield features, identity_labels.astype(np.int32)

In [25]:
def input_fn(gen,
             video_length,
             embeddings_dim):
    """
    Wrap an already-created batch generator in a `tf.data.Dataset`.

    Args:
        gen: generator object yielding `(features, labels)` pairs where
            `features` has the keys 'embeddings', 'label_mask' and 'file_path'.
        video_length: number of frames in every generated video.
        embeddings_dim: size of a single frame embedding.

    Returns:
        A `tf.data.Dataset` producing the generator's items with fully
        specified static shapes.
    """
    # One mask entry per unordered pair of frames. n * (n - 1) is always even,
    # so floor division is exact and keeps the dimension an int; true division
    # would produce a float, which is not a valid tensor dimension type.
    binary_mask_size = video_length * (video_length - 1) // 2
    return tf.data.Dataset.from_generator(
        lambda: gen,
        output_types=({'embeddings': tf.float32,
                       'label_mask': tf.uint8,
                       'file_path': tf.string},
                      tf.int32),
        output_shapes=({'embeddings': (video_length, embeddings_dim),
                        'label_mask': (binary_mask_size,),
                        'file_path': (video_length,)},
                       (video_length,)))

In [26]:
# Root folder for logs, and the sub-folder used as the estimator's model directory.
LOGS_PATH = pathlib.Path("../../logs")
MODEL_DIR = LOGS_PATH.joinpath("misaan")

# Multi-identity model function

In [31]:
def misaan_model_fn(features, labels, mode, params):
    """
    Model function to implement Estimator API.

    Inside the "misaan_body" scope a predicted mask is derived from the
    embeddings, mixed with the ground-truth mask (only built in TRAIN/EVAL),
    reduced to unique rows, converted to face tracks and aggregated by
    `model_fn_transformer`.

    Args:
        features: dict with an "embeddings" entry and, for TRAIN/EVAL, a
            "label_mask" entry holding the flattened ground-truth mask. An
            optional "binarization_threshold" entry overrides
            `aggregator_params['binarization_threshold']`.
        labels: identity labels of the frames; not read in PREDICT mode.
        mode: a `tf.estimator.ModeKeys` value.
        params: accepted for Estimator API compatibility; not used.

    Returns:
        `tf.estimator.EstimatorSpec` for PREDICT, EVAL or TRAIN. Any other
        mode falls through and returns None.

    Raises:
        TypeError: if `features` is not a dict.
    """
    # Use tf.estimator.ModeKeys throughout (the same string values as the
    # deprecated tf.contrib.learn.ModeKeys that was mixed in before).
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    is_eval = (mode == tf.estimator.ModeKeys.EVAL)

    # Fail early and clearly: with any other input type the tensors below would
    # never be bound and graph construction would die later with a NameError.
    if not isinstance(features, dict):
        raise TypeError("misaan_model_fn expects `features` to be a dict, got {}"
                        .format(type(features).__name__))

    embeddings = features.get("embeddings")
    binary_mask = features.get("label_mask")
    binarization_threshold = features.get('binarization_threshold',
                                          aggregator_params['binarization_threshold'])

    with tf.name_scope("misaan_body"):

        if is_training or is_eval:
            # The ground-truth mask is only built in EVAL and TRAIN modes.
            binary_mask_full = tf_flattened_vector_to_mask(binary_mask,
                                                           num_elements=tf.shape(embeddings)[0])

            binary_mask_full = tf.cast(binary_mask_full, tf.int32)
        else:
            binary_mask_full = None

        # Flattened mask predicted from the embeddings with the given threshold.
        classes_predicted_raw = tf_rule_based_class_predictor(embeddings,
                                                              distance_threshold=binarization_threshold)

        classes_predicted = tf_flattened_vector_to_mask(classes_predicted_raw,
                                                        num_elements=tf.shape(embeddings)[0])

        with tf.name_scope("mask_mixing"):

            # `probability_mix` is logged as 'teacher_forcing_prob' during training.
            mask_mixed, probability_mix = componentwise_mask_mixing(binary_mask_full,
                                                                    classes_predicted,
                                                                    is_training)

        with tf.name_scope("mask_postprocessing"):

            norm_logits = tf.linalg.norm(embeddings,
                                         axis=1)

            indecies_unique, mask_unique = tf_greedy_search_for_connected_components(mask_mixed,
                                                                                     norm_logits)

            gt_mask_unique_rows = tf.gather(mask_unique,
                                            indecies_unique)

            face_tracks = convert_mask_to_face_tracks(embeddings,
                                                      gt_mask_unique_rows)

            # Pin the mask shape to (number of unique rows, number of frames).
            gt_mask_unique_rows = tf.reshape(gt_mask_unique_rows,
                                             (tf.shape(indecies_unique)[0],
                                              tf.shape(embeddings)[0]))

            # Number of non-zero entries in every mask row.
            clique_size = tf.count_nonzero(gt_mask_unique_rows,
                                           axis=1)

            # (A check that every column of the mask sums to exactly 1 used to
            # live here and was disabled.)

            num_identities_predicted = tf.size(indecies_unique)

        with tf.name_scope("aggregation"):

            aggregated_embeddings, softmax_attention_values = model_fn_transformer(
                face_tracks,
                is_training,
                encoder_params,
                soft_attention = not aggregator_params['sparse_attention'],
                component_wise = aggregator_params['component_wise_attention'])

            softmax_attention_values = tf.cast(softmax_attention_values,
                                               tf.float64)

            aggregated_embeddings = tf.cast(aggregated_embeddings,
                                            tf.float32)

    if mode == tf.estimator.ModeKeys.PREDICT:

        predictions = {'aggregated_embeddings': aggregated_embeddings,
                       'binary_mask': gt_mask_unique_rows,
                       'attention_distribution': softmax_attention_values,
                       'num_identities': num_identities_predicted,
                       'clique_size': clique_size}

        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    # Labels of the rows selected by `indecies_unique`, and the number of
    # distinct identities present in the labels.
    labels_gt = tf.gather(labels, indecies_unique)
    num_identities_real = tf.size(tf.unique(labels)[0])

    if mode == tf.estimator.ModeKeys.EVAL:

        # Mask metrics compare the flattened ground-truth mask with the
        # flattened raw prediction.
        classes_predicted_for_acc = classes_predicted_raw

        metrics_ops = {'mask_accuracy': tf.metrics.accuracy(binary_mask,
                                                            classes_predicted_for_acc),
                       'mask_recall': tf.metrics.recall(binary_mask,
                                                        classes_predicted_for_acc),

                       'mask_precision': tf.metrics.precision(binary_mask,
                                                              classes_predicted_for_acc),

                       'num_identities_mae': tf.metrics.mean_absolute_error(num_identities_real,
                                                                            num_identities_predicted),
                       'num_identities_predicted': tf.contrib.metrics.streaming_mean(num_identities_predicted),
                       'num_identities_real': tf.contrib.metrics.streaming_mean(num_identities_real),
                       }

        # No loss is computed in evaluation; a constant zero is reported.
        eval_loss = tf.constant(0.)

        evaluation_hooks = None

        return tf.estimator.EstimatorSpec(mode, loss=eval_loss,
                                          eval_metric_ops = metrics_ops,
                                          evaluation_hooks = evaluation_hooks)

    if mode == tf.estimator.ModeKeys.TRAIN:

        with tf.name_scope("aggregation_loss"):

            arcface_logits, positive_angles, negative_angles = get_arcface_logits(
                aggregated_embeddings,
                labels_gt,
                NUM_TRAIN_CLASSES,
                s=aggregator_params['arcface_radius'],
                m=aggregator_params['arcface_margin'])

            aggregation_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels = labels_gt,
                                                                              logits = arcface_logits)

            aggregation_loss = tf.reduce_mean(aggregation_loss)

        loss_overall = aggregation_loss

        tf.summary.scalar('aggregation_loss', aggregation_loss)
        tf.summary.scalar('teacher_forcing_prob', probability_mix)

        tf.summary.histogram('train_positive_angles', positive_angles)
        tf.summary.histogram('train_negative_angles', negative_angles)
        tf.summary.histogram('num_identities_real', num_identities_real)
        tf.summary.histogram('num_identities_predicted', num_identities_predicted)
        tf.summary.histogram('predicted_class_distribution', classes_predicted)
        tf.summary.histogram('real_class_distribution', binary_mask)

        optimizer = RAdamOptimizer(learning_rate = 1e-3)

        train_op = optimizer.minimize(
                loss_overall, global_step=tf.train.get_global_step())

        return tf.estimator.EstimatorSpec(mode, loss=loss_overall, train_op=train_op)

In [32]:
def serving_input_receiver_fn(embeddings_dims):
    """
    Declare the placeholders an exported model is fed through.

    Args:
        embeddings_dims: static shape of the 'input_embeddings' placeholder.

    Returns:
        `tf.estimator.export.ServingInputReceiver` whose features and receiver
        tensors both map 'embeddings' and 'binarization_threshold' to the
        corresponding placeholders.
    """
    embeddings_input = tf.placeholder(tf.float32,
                                      shape=embeddings_dims,
                                      name='input_embeddings')
    # Scalar threshold supplied by the caller with every request.
    threshold_input = tf.placeholder(tf.float32,
                                     shape=(),
                                     name='binarization_threshold')

    def _named_inputs():
        # A fresh dict per call: one is handed to the model_fn as `features`,
        # the other names the tensors a client has to feed.
        return {'embeddings': embeddings_input,
                'binarization_threshold': threshold_input}

    return tf.estimator.export.ServingInputReceiver(features=_named_inputs(),
                                                    receiver_tensors=_named_inputs())

# Training configurations

In [33]:
# Checkpointing / summary cadence and the graph-level random seed.
run_config = tf.estimator.RunConfig(model_dir=MODEL_DIR,
                                    tf_random_seed=42,
                                    save_summary_steps=50,
                                    save_checkpoints_steps=250)


def train_generator():
    """Build the prefetched training dataset from a fresh batch generator."""
    dataset = input_fn(generate_batch(train_sessions_stats),
                       batching_params['generated_session_len'],
                       aggregator_params['embeddings_dim'])
    return dataset.prefetch(batching_params['prefetch_buffer'])


def eval_generator():
    """Build the prefetched evaluation dataset from a fresh batch generator."""
    dataset = input_fn(generate_batch(eval_session_stats),
                       batching_params['generated_session_len'],
                       aggregator_params['embeddings_dim'])
    return dataset.prefetch(batching_params['prefetch_buffer'])


train_spec = tf.estimator.TrainSpec(input_fn=train_generator,
                                    max_steps=10000)


def serv_func():
    """Serving receiver for a variable number of embeddings of fixed size."""
    return serving_input_receiver_fn(embeddings_dims=(None, aggregator_params['embeddings_dim']))

# Keep a copy of the single best checkpoint seen during evaluation.
# The score is a mean absolute error, so LOWER is better: a new checkpoint wins
# when its score is smaller, and kept checkpoints are sorted ascending so the
# worst (largest) score is last. The previous `>` / `sort_reverse=True` setup
# is the configuration for a higher-is-better metric and preserved the
# checkpoint with the largest error instead.
best_exporter = BestCheckpointCopier(
   name='best_misaan', # directory within model directory to copy checkpoints to
   checkpoints_to_keep=1, # number of checkpoints to keep
   score_metric='num_identities_mae', # metric to use to determine "best"
   compare_fn=lambda x,y: x.score < y.score, # x is the current checkpoint; y is the previously copied checkpoint with the worst score
   sort_key_fn=lambda x: x.score,
   sort_reverse=False)

# Export a servable model for the most recent checkpoints (up to 20 kept).
final_exporter = tf.estimator.LatestExporter(name='misaan_exporter',
                                             serving_input_receiver_fn=serv_func,
                                             exports_to_keep=20)

exporters = (best_exporter, final_exporter)

# 100 evaluation steps per run, with at least 30 seconds between runs.
eval_spec = tf.estimator.EvalSpec(input_fn=eval_generator,
                                  steps=100,
                                  throttle_secs=30,
                                  exporters=exporters)

misaan_estimator = tf.estimator.Estimator(model_fn=misaan_model_fn,
                                          config=run_config,
                                          params=None)

I1122 14:08:40.383785 140390855927680 estimator.py:209] Using config: {'_model_dir': '../../logs/misaan', '_tf_random_seed': 42, '_save_summary_steps': 50, '_save_checkpoints_steps': 250, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7faddf796438>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
W1122 14:08:40.385029 140390855927680 model_fn.py:630] Estimator's model_fn (<function misaan_mo

In [None]:
# Run training interleaved with evaluation, as configured by train_spec and eval_spec.
tf.estimator.train_and_evaluate(misaan_estimator, train_spec, eval_spec)