# Particle Categorical

In [1]:
import tensorflow as tf

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    # Currently, memory growth needs to be the same across GPUs
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
    print(e)

import tensorflow.keras as keras
import tensorflow.keras.backend as K
# from tensorflow.keras.layers import Input, Dense, Activation, BatchNormalization
# from tensorflow.keras.layers import Conv1D
# from tensorflow.keras.layers import Flatten, Reshape, Lambda
# from tensorflow.keras.utils import plot_model
# from tensorflow.keras import Model

import os
import os.path as osp
import sys

import numpy as np
#from scipy import linalg as LA

import matplotlib
import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

from utils.tf_sinkhorn import ground_distance_tf_nograd, sinkhorn_knopp_tf_scaling_stabilized_class
import utils.VAE_model_tools
from utils.VAE_model_tools import build_and_compile_annealing_vae, betaVAEModel, reset_metrics

import pandas
import matplotlib.pyplot as plt

import h5py
import pickle


1 Physical GPUs, 1 Logical GPUs


In [2]:
def create_dir(dir_path):
    ''' Creates a directory (or nested directories) if they don't exist.
    '''
    if not osp.exists(dir_path):
        os.makedirs(dir_path)

    return dir_path

output_dir = './data/'

## Generate training data

Input data is 2D, generated in the shape of a banana (plotted below) defined by two gaussians with widths 1 and 0.1. The VAE will be tasked with reconstructing the 2D location of the input points, using euclidean distance as the reconstruction error. The latent space is 2D, so can in principle easily encode everything about the input. This will be regulated by the variational latent space

In [3]:
# path to file
fn =  '/home/jcollins/projects/EMD_VAE/in_data/monoW-data.h5'

In [4]:
# Option 1: Load everything into memory
df = pandas.read_hdf(fn,stop=1000000)
print(df.shape)
print("Memory in GB:",sum(df.memory_usage(deep=True)) / (1024**3)+sum(df.memory_usage(deep=True)) / (1024**3))

(1000000, 150)
Memory in GB: 2.250075340270996


In [5]:
data = df.values.reshape((-1,50,3))

HT = np.sum(data[:,:,0],axis=-1)
data[:,:,0] = data[:,:,0]/HT[:,None]

sig_input = np.zeros((len(data),50,5))
sig_input[:,:,:2] = data[:,:,:2]
sig_input[:,:,2] = np.cos(data[:,:,-1])
sig_input[:,:,3] = np.sin(data[:,:,-1])
sig_input[:,:,4] = np.log(data[:,:,0]+1e-8)

data_x = sig_input
data_y = data


train_x = data_x[:300000]
train_y = data_y[:300000]
valid_x = data_x[300000:400000]
valid_y = data_y[300000:400000]

  data[:,:,0] = data[:,:,0]/HT[:,None]


In [12]:
experiment_name = 'W-test'
train_output_dir = create_dir(osp.join(output_dir, experiment_name))
vae, encoder, decoder = build_and_compile_annealing_vae(optimizer=keras.optimizers.Adam(lr=0.001,clipnorm=0.1),
                                    encoder_conv_layers = [512,512,512,512],
                                    dense_size = [512,512,512,512],
                                    decoder = [512,512,512,512],
                                    numItermaxinner = 10,
                                    numIter=10,
                                    reg_init = 1.,
                                    reg_final = 0.01,
                                    stopThr=1e-3,
                                    num_inputs=5,
                                    num_particles_in=50)

batch_size=100
save_period=2

reduceLR = keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.1, patience=2, verbose=1, mode='auto', min_delta=1e-4, cooldown=0, min_lr=0)
modelcheckpoint = keras.callbacks.ModelCheckpoint(train_output_dir + '/model_weights_{epoch:02d}.hdf5', save_freq = save_period*5000, save_weights_only=True)
reset_metrics_inst = reset_metrics()

callbacks=[tf.keras.callbacks.CSVLogger(train_output_dir + '/log.csv', separator=",", append=True),
            reduceLR,
            modelcheckpoint,
            reset_metrics_inst]

Model: "VAE"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
inputs (InputLayer)             [(None, 50, 5)]      0                                            
__________________________________________________________________________________________________
conv1d_8 (Conv1D)               (None, 50, 512)      3072        inputs[0][0]                     
__________________________________________________________________________________________________
re_lu_24 (ReLU)                 (None, 50, 512)      0           conv1d_8[0][0]                   
__________________________________________________________________________________________________
conv1d_9 (Conv1D)               (None, 50, 512)      262656      re_lu_24[0][0]                   
________________________________________________________________________________________________

In [13]:
encoder(train_x[:10])

[<tf.Tensor: shape=(10, 128), dtype=float32, numpy=
 array([[-1.9130168 ,  4.810379  , -1.438453  , ...,  2.8302999 ,
          1.9264292 ,  3.5940464 ],
        [-1.9889731 ,  4.472971  , -1.3010547 , ...,  2.5869153 ,
          1.6785284 ,  3.4286377 ],
        [-1.8745244 ,  4.7355375 , -2.0221474 , ...,  2.5585344 ,
          0.6960923 ,  3.0919118 ],
        ...,
        [-2.045337  ,  5.587791  , -2.5175946 , ...,  2.9405284 ,
          0.8171    ,  3.5535364 ],
        [-1.9019165 ,  5.482372  , -2.4579124 , ...,  2.7703192 ,
          0.8760087 ,  3.499468  ],
        [-1.5380244 ,  2.8940487 , -0.65773904, ...,  1.8671614 ,
          1.1265392 ,  2.7047758 ]], dtype=float32)>,
 <tf.Tensor: shape=(10, 128), dtype=float32, numpy=
 array([[ 0.33607703, -2.8207383 ,  0.9409974 , ..., -4.487803  ,
          0.6027744 , -0.18636078],
        [ 0.29102552, -2.637267  ,  0.8082115 , ..., -4.1327386 ,
          0.3605845 , -0.2608899 ],
        [-0.10659985, -2.7913814 ,  1.2643719 , .

In [14]:
batch_size=100
save_period=2

reduceLR = keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.1, patience=2, verbose=1, mode='auto', min_delta=1e-4, cooldown=0, min_lr=0)
modelcheckpoint = keras.callbacks.ModelCheckpoint(train_output_dir + '/model_weights_{epoch:02d}.hdf5', save_freq = save_period*5000, save_weights_only=True)
reset_metrics_inst = reset_metrics()

callbacks=[tf.keras.callbacks.CSVLogger(train_output_dir + '/log.csv', separator=",", append=True),
            reduceLR,
            modelcheckpoint,
            reset_metrics_inst]

vae.beta.assign(0.01)
numbatches = 5000

K.set_value(vae.optimizer.lr,1e-4)
epochs = 1000


history = vae.fit(x=train_x[:numbatches*batch_size], y=train_y[:numbatches*batch_size], batch_size=batch_size,
                epochs=epochs,verbose=1,#initial_epoch=int(vae.optimizer.iterations/numbatches),
                validation_data = (valid_x[:10*batch_size],valid_y[:10*batch_size]),
                callbacks = callbacks
              )

# tf.saved_model.save(vae, train_output_dir + '/mymodel.hdf5')

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
 200/3000 [=>............................] - ETA: 18:25 - loss: 142.9666 - recon_loss: 0.0115 - KL loss: 27.5043 - beta: 0.0100

KeyboardInterrupt: 

In [17]:
batch_size=100
save_period=2
beta = 0.03
vae.beta.assign(beta)
numbatches = 3000

reduceLR = keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.1, patience=2, verbose=1, mode='auto', min_delta=1e-4, cooldown=0, min_lr=0)
modelcheckpoint = keras.callbacks.ModelCheckpoint(train_output_dir + '/model_weights_{epoch:02d}_' + str(beta) + '.hdf5', save_freq = save_period*numbatches, save_weights_only=True)
reset_metrics_inst = reset_metrics()

callbacks=[tf.keras.callbacks.CSVLogger(train_output_dir + '/log.csv', separator=",", append=True),
            reduceLR,
            modelcheckpoint,
            reset_metrics_inst]




K.set_value(vae.optimizer.lr,1e-4)
epochs = 1000


history = vae.fit(x=train_x[:numbatches*batch_size], y=train_y[:numbatches*batch_size], batch_size=batch_size,
                epochs=epochs,verbose=1,#initial_epoch=int(vae.optimizer.iterations/numbatches),
                validation_data = (valid_x[:10*batch_size],valid_y[:10*batch_size]),
                callbacks = callbacks
              )

# tf.saved_model.save(vae, train_output_dir + '/mymodel.hdf5')

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
 456/3000 [===>..........................] - ETA: 1:28 - loss: 25.2957 - recon_loss: 0.0132 - KL loss: 10.6055 - beta: 0.0300

KeyboardInterrupt: 

In [18]:
batch_size=100
save_period=2
beta = 0.1
vae.beta.assign(beta)
numbatches = 3000

reduceLR = keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.1, patience=2, verbose=1, mode='auto', min_delta=1e-4, cooldown=0, min_lr=0)
modelcheckpoint = keras.callbacks.ModelCheckpoint(train_output_dir + '/model_weights_{epoch:02d}_' + str(beta) + '.hdf5', save_freq = save_period*numbatches, save_weights_only=True)
reset_metrics_inst = reset_metrics()

callbacks=[tf.keras.callbacks.CSVLogger(train_output_dir + '/log.csv', separator=",", append=True),
            reduceLR,
            modelcheckpoint,
            reset_metrics_inst]




K.set_value(vae.optimizer.lr,1e-4)
epochs = 1000


my_history = vae.fit(x=train_x[:numbatches*batch_size], y=train_y[:numbatches*batch_size], batch_size=batch_size,
                epochs=epochs,verbose=1,#initial_epoch=int(vae.optimizer.iterations/numbatches),
                validation_data = (valid_x[:10*batch_size],valid_y[:10*batch_size]),
                callbacks = callbacks
              )

# tf.saved_model.save(vae, train_output_dir + '/mymodel.hdf5')

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
 238/3000 [=>............................] - ETA: 3:21 - loss: 9.1392 - recon_loss: 0.0271 - KL loss: 6.4277 - beta: 0.1000

KeyboardInterrupt: 

In [20]:
batch_size=100
save_period=2
beta = 0.3
vae.beta.assign(beta)
numbatches = 3000

reduceLR = keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.1, patience=2, verbose=1, mode='auto', min_delta=1e-4, cooldown=0, min_lr=0)
modelcheckpoint = keras.callbacks.ModelCheckpoint(train_output_dir + '/model_weights_{epoch:02d}_' + str(beta) + '.hdf5', save_freq = save_period*numbatches, save_weights_only=True)
reset_metrics_inst = reset_metrics()

callbacks=[tf.keras.callbacks.CSVLogger(train_output_dir + '/log.csv', separator=",", append=True),
            reduceLR,
            modelcheckpoint,
            reset_metrics_inst]




K.set_value(vae.optimizer.lr,1e-4)
epochs = 1000

my_history = vae.fit(x=train_x[:numbatches*batch_size], y=train_y[:numbatches*batch_size], batch_size=batch_size,
                epochs=epochs,verbose=1,#initial_epoch=int(vae.optimizer.iterations/numbatches),
                validation_data = (valid_x[:10*batch_size],valid_y[:10*batch_size]),
                callbacks = callbacks,initial_epoch=7
              )

Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 00018: ReduceLROnPlateau reducing learning rate to 9.999999747378752e-06.
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
 646/3000 [=====>........................] - ETA: 2:48 - loss: 4.9963 - recon_loss: 0.1110 - KL loss: 3.7630 - beta: 0.3000

KeyboardInterrupt: 

In [21]:
my_history

NameError: name 'my_history' is not defined

In [22]:
batch_size=100
save_period=2
beta = 1.0
vae.beta.assign(beta)
numbatches = 3000

reduceLR = keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.1, patience=2, verbose=1, mode='auto', min_delta=1e-4, cooldown=0, min_lr=0)
modelcheckpoint = keras.callbacks.ModelCheckpoint(train_output_dir + '/model_weights_{epoch:02d}_' + str(beta) + '.hdf5', save_freq = save_period*numbatches, save_weights_only=True)
reset_metrics_inst = reset_metrics()

callbacks=[tf.keras.callbacks.CSVLogger(train_output_dir + '/log.csv', separator=",", append=True),
            reduceLR,
            modelcheckpoint,
            reset_metrics_inst]




K.set_value(vae.optimizer.lr,1e-4)
epochs = 1000

my_history = vae.fit(x=train_x[:numbatches*batch_size], y=train_y[:numbatches*batch_size], batch_size=batch_size,
                epochs=epochs,verbose=1,#initial_epoch=int(vae.optimizer.iterations/numbatches),
                validation_data = (valid_x[:10*batch_size],valid_y[:10*batch_size]),
                callbacks = callbacks,initial_epoch=22
              )

Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 00026: ReduceLROnPlateau reducing learning rate to 9.999999747378752e-06.
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 00030: ReduceLROnPlateau reducing learning rate to 9.999999747378752e-07.
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000

KeyboardInterrupt: 

In [None]:
batch_size=100
save_period=2
beta = 1.5
vae.beta.assign(beta)
numbatches = 3000

reduceLR = keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.1, patience=2, verbose=1, mode='auto', min_delta=1e-4, cooldown=0, min_lr=0)
modelcheckpoint = keras.callbacks.ModelCheckpoint(train_output_dir + '/model_weights_{epoch:02d}_' + str(beta) + '.hdf5', save_freq = save_period*numbatches, save_weights_only=True)
reset_metrics_inst = reset_metrics()

callbacks=[tf.keras.callbacks.CSVLogger(train_output_dir + '/log.csv', separator=",", append=True),
            reduceLR,
            modelcheckpoint,
            reset_metrics_inst]




K.set_value(vae.optimizer.lr,1e-4)
epochs = 1000

my_history = vae.fit(x=train_x[:numbatches*batch_size], y=train_y[:numbatches*batch_size], batch_size=batch_size,
                epochs=epochs,verbose=1,#initial_epoch=int(vae.optimizer.iterations/numbatches),
                validation_data = (valid_x[:10*batch_size],valid_y[:10*batch_size]),
                callbacks = callbacks,initial_epoch=33
              )

Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000