Name: Hoda Akl <br> 

Note: Preprocessing the data is key to my result. I scale the data before training, and the test data is scaled in the same way. To do the scaling, the x_train data must be loaded in this notebook; alternatively, the scaler can be loaded from file. 

I have included in the email the model file, please put it in the same directory as this notebook.

Below you will find 

Part 1: The Model architecture

Part 2: Two functions with loaded weights to be used for the evaluation metric. 

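Part 3: The reconstruction error on the test data set (0.0022086372084942126 for the submitted model; the value printed at the end of this notebook, 0.0022919134355735547, differs slightly, presumably because the training cell was re-run)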

--------------------------------------------------------------------------------------------------
Notes on my scaling : 

1. First, I denoise the data by imposing a threshold of 0.9, above which I consider all values to be one. 
2. Second, I apply StandardScaler from sklearn.preprocessing to the denoised data. 

To keep this notebook concise, I do not define the scaler from sklearn here, but load it instead. The file is provided in the email. 

In [1]:
## importing packages and defining the reconstruction function 
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
import h5py
import matplotlib.pyplot as plt
####
from tensorflow import keras
from keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras import models 
from tensorflow.keras import layers 
from tensorflow.keras.models import load_model
#
from sklearn.preprocessing import MinMaxScaler
import os


from pickle import load

def reconstr_loss(original_spectra, reconstructed_spectra, latent_dim):
    '''Function to calculate the penalized reconstruction loss.

    The loss is the mean squared error between the original and the
    reconstructed spectra, plus 0.00003 per latent dimension, plus an extra
    flat 5*0.00003 when more than 6 latent dimensions are used.

    Inputs:
    - original_spectra (np.array): original spectra.
    - reconstructed_spectra (np.array): reconstruction of the original spectra from the latent representation.
    - latent_dim (integer): size of the latent space.

    Returns:
    - reconstruction loss with added penalty for the latent space size
    '''

    penalty = 0.00003
    penalty2 = 5*0.00003

    # mean_squared_error returns the plain (non-root) MSE by default. The
    # `squared` keyword was deprecated in scikit-learn 1.4 and removed in
    # 1.6, so it is deliberately not passed here.
    mse_loss = mean_squared_error(original_spectra, reconstructed_spectra)

    # (latent_dim > 6) is a bool, i.e. 0 or 1 in arithmetic: the second
    # penalty is only added for latent spaces larger than 6.
    loss_penalized = mse_loss + latent_dim*penalty + penalty2*(latent_dim > 6)

    return loss_penalized


In [2]:
#loading the data 

# load the data to define some dimensions

# download if data is not downloaded
# url_train = 'https://drive.google.com/u/0/uc?export=download&confirm=F_-k&id=1sdx-m9PLLKjPQ8J2g7H2zw2FUSMV6jYz'
# url_val = 'https://drive.google.com/u/0/uc?export=download&confirm=QL45&id=1qymhB00l4wy_Ql4A3DRxguxytFzda7_0'
# url_test = 'https://drive.google.com/u/0/uc?export=download&confirm=nAB1&id=1lhJl_6lWCxNpOSxJd9_d_qssgUKEli7M'

# # Using this links may be faster (but can fail if many users are connected)
# # url_train = 'https://drive.google.com/uc?id=1sdx-m9PLLKjPQ8J2g7H2zw2FUSMV6jYz'
# # url_val = 'https://drive.google.com/uc?id=1qymhB00l4wy_Ql4A3DRxguxytFzda7_0'
# # url_test = 'https://drive.google.com/uc?id=1lhJl_6lWCxNpOSxJd9_d_qssgUKEli7M'


# data_train = gdown.download(url_train, 'train_set.hdf5', quiet=False)
# data_val = gdown.download(url_val, 'val_set.hdf5', quiet=False)
# data_test = gdown.download(url_test, 'test_set.hdf5', quiet=False)

# Read the 'spectra' dataset of each split fully into memory. np.array()
# copies the data out of the file, so every HDF5 handle can be closed as
# soon as its `with` block ends (data_train / data_val / data_test remain
# bound, but to closed files).
with h5py.File('train_set.hdf5', 'r') as data_train:
    x_train = np.array(data_train['spectra'])

with h5py.File('val_set.hdf5', 'r') as data_val:
    x_val = np.array(data_val['spectra'])

with h5py.File('test_set.hdf5', 'r') as data_test:
    x_test = np.array(data_test['spectra'])

### preprocess and define the Scaler 
from sklearn.preprocessing import StandardScaler


# Denoise a copy of the training spectra: every value above 0.9 is set to 1.
x_Train_den = x_train.copy()
x_Train_den[x_Train_den > .9] = 1

# Fit the scaler on the denoised training data. The fitted Scaler is what
# Preprocess_fn / DeProcess_fn use as their default scaler.
Scaler = StandardScaler()
x_data_sc = Scaler.fit_transform(x_Train_den)



Part 1: The model 

In [3]:
# model functions 
def make_encoder(hidden_nodes_list, activation_functions_list, n_inputs=None):
    """Build the encoder as a Sequential stack of Dense layers.

    Inputs:
    - hidden_nodes_list (list of int): number of units of each Dense layer,
      in order. The last entry is the size of the latent space.
    - activation_functions_list (list): activation of each Dense layer; must
      have the same length as hidden_nodes_list.
    - n_inputs (int, optional): number of input features. When omitted, the
      module-level ``input_dim`` is used, which must then be defined before
      this function is called.

    Returns:
    - encoder: the Sequential encoder model.
    - latent_dim: the number of units of the last layer.

    Raises:
    - ValueError: if the two lists differ in length or are empty.
    """
    if len(hidden_nodes_list) != len(activation_functions_list):
        raise ValueError("length of hidden nodes list should be equal length of activation_functions_list")
    # Without at least one layer there is no latent dimension to return.
    if len(hidden_nodes_list) == 0:
        raise ValueError("hidden_nodes_list must contain at least one layer")

    if n_inputs is None:
        # Fall back to the global defined by the notebook.
        n_inputs = input_dim

    encoder = models.Sequential()
    for i, (n_nodes, act) in enumerate(zip(hidden_nodes_list, activation_functions_list)):
        if i == 0:
            # Only the first layer declares the input shape.
            encoder.add(layers.Dense(n_nodes, activation=act, input_shape=(n_inputs,)))
        else:
            encoder.add(layers.Dense(n_nodes, activation=act))

    latent_dim = hidden_nodes_list[-1]
    return encoder, latent_dim

def make_decoder(hidden_nodes_list, activation_functions_list, latent_dim):
    """Build the decoder as a Sequential stack of Dense layers.

    Inputs:
    - hidden_nodes_list (list of int): number of units of each Dense layer, in order.
    - activation_functions_list (list): activation of each Dense layer; must
      have the same length as hidden_nodes_list.
    - latent_dim (int): size of the latent vector fed to the first layer.

    Returns:
    - the Sequential decoder model.

    Raises:
    - ValueError: if the two lists differ in length.
    """
    if len(hidden_nodes_list) != len(activation_functions_list):
        raise ValueError("length of hidden nodes list should be equal length of activation_functions_list")

    decoder = models.Sequential()
    layer_specs = zip(hidden_nodes_list, activation_functions_list)
    for position, (width, act_name) in enumerate(layer_specs):
        # Only the first layer declares the input shape.
        first_layer_kwargs = {"input_shape": (latent_dim,)} if position == 0 else {}
        decoder.add(layers.Dense(width, activation=act_name, **first_layer_kwargs))
    return decoder



If you would like to run the fitting, run the following two cells. Make sure to have the file StandardScaler.pkl in the same directory

In [4]:
# from pickle import load

from sklearn.preprocessing import StandardScaler


# Scaler = StandardScaler()
# x_Train_den = x_train.copy()
# xidx, yidx = np.where(x_Train_den>.9)
# x_Train_den[xidx, yidx] = np.ones(len(xidx))
# Scaler = StandardScaler()
# x_data_sc = Scaler.fit_transform(x_Train_den)
def Preprocess_fn(data, Scaler = Scaler ):
    """Denoise and scale spectra before they are fed to the network.

    Inputs:
    - data (np.array): 2-D array of spectra; it is not modified.
    - Scaler: fitted scaler exposing ``transform``. The default is the
      module-level ``Scaler`` as it exists when this function is defined.

    Returns:
    - the denoised data after ``Scaler.transform``.
    """
    # Work on a copy so the caller's array is left untouched.
    clipped = data.copy()
    # Denoising step: every entry above 0.9 is set to one.
    rows, cols = np.nonzero(clipped > .9)
    clipped[rows, cols] = 1.0
    return Scaler.transform(clipped)

def DeProcess_fn(recons, Scaler = Scaler ):
    """Undo the scaling applied by Preprocess_fn.

    Inputs:
    - recons: reconstructions expressed in the scaled space.
    - Scaler: fitted scaler exposing ``inverse_transform``. The default is
      the module-level ``Scaler`` as it exists when this function is defined.

    Returns:
    - the result of ``Scaler.inverse_transform(recons)``.
    """
    return Scaler.inverse_transform(recons)

In [5]:

# Number of features per spectrum: the encoder input width and the decoder
# output width.
input_dim = x_train.shape[1]

# Layer widths; the last encoder entry (6) is the latent dimension.
hidden_nodes = [264, 128, 6]
hidden_nodes_dec = [128, 264, input_dim]

# One activation per layer, in the same order as the widths above.
activation_e1 = ['linear', 'relu', 'linear']
activation_d1 = ['relu', 'relu', 'linear']

# File name of the saved model (used for checkpointing and reloading).
model_path = 'mcheckp_2.h5'

encoder, ld = make_encoder(hidden_nodes, activation_e1)
decoder = make_decoder(hidden_nodes_dec, activation_d1, latent_dim=ld)

# Autoencoder = encoder followed by decoder. They stay layers 0 and 1 of the
# model, so each half can be retrieved by index later.
network = models.Sequential([encoder, decoder])
network.compile(optimizer='adam', loss='mse', metrics=['mse'])
network.summary()




Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sequential (Sequential)      (None, 6)                 10588094  
_________________________________________________________________
sequential_1 (Sequential)    (None, 39974)             10628062  
Total params: 21,216,156
Trainable params: 21,216,156
Non-trainable params: 0
_________________________________________________________________


2021-11-30 18:28:00.020674: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-11-30 18:28:00.020725: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2021-11-30 18:28:00.020756: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (c23b-s36.ufhpc): /proc/driver/nvidia/version does not exist
2021-11-30 18:28:00.021201: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
# Running this cell (re)trains the network and writes checkpoints to
# model_path.

# Training hyper-parameters.
epochs = 100
batch_size = 128

# Stop once val_loss has not improved for 10 epochs, and only write a
# checkpoint when val_loss improves.
keras_callbacks = [
    EarlyStopping(monitor='val_loss', patience=10),
    ModelCheckpoint(model_path, monitor='val_loss', save_best_only=True),
]

# Denoise and scale the training and validation spectra.
x_tr_pp = Preprocess_fn(x_train)
x_val_pp = Preprocess_fn(x_val)

# Autoencoder training: the input is also the target.
history = network.fit(
    x_tr_pp,
    x_tr_pp,
    epochs=epochs,
    batch_size=batch_size,
    callbacks=keras_callbacks,
    validation_data=(x_val_pp, x_val_pp),
)

2021-11-30 18:28:14.919957: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100


Part 2: Functions to get to the latents and the reconstructions 

These functions use the scaler file and the model file attached to the same email. 
Please download them to the same directory. 


In [7]:
### preprocess function 
import os

# if you can not load the scaler here is how to define it 
### UNcomment the following lines 


# from sklearn.preprocessing import StandardScaler


# Scaler = StandardScaler()
# x_Train_den = x_train.copy()
# xidx, yidx = np.where(x_Train_den>.9)
# x_Train_den[xidx, yidx] = np.ones(len(xidx))
# Scaler = StandardScaler()
# x_data_sc = Scaler.fit_transform(x_Train_den)



# from pickle import load
def Preprocess_fn(data, Scaler = Scaler ):
    """Denoise spectra and scale them with the fitted scaler.

    Inputs:
    - data (np.array): 2-D array of spectra; it is not modified.
    - Scaler: fitted scaler exposing ``transform``. The default is the
      module-level ``Scaler`` as it exists when this function is defined.

    Returns:
    - the denoised data after ``Scaler.transform``.
    """
    # Copy first so the input array is never altered.
    denoised = data.copy()
    # Denoising step: every entry above 0.9 is set to one.
    row_idx, col_idx = np.nonzero(denoised > .9)
    denoised[row_idx, col_idx] = 1.0
    return Scaler.transform(denoised)

def DeProcess_fn(recons, Scaler = Scaler ):
    """Bring scaled reconstructions back to the original data scale.

    Inputs:
    - recons: reconstructions expressed in the scaled space.
    - Scaler: fitted scaler exposing ``inverse_transform``. The default is
      the module-level ``Scaler`` as it exists when this function is defined.

    Returns:
    - the result of ``Scaler.inverse_transform(recons)``.
    """
    return Scaler.inverse_transform(recons)
    

def reduce_dimensionality(data, model_path = model_path): 
    """Encode spectra into the latent space using the saved model's encoder.

    Inputs:
    - data (np.array): raw (unscaled) spectra; they are passed through
      Preprocess_fn before encoding.
    - model_path (str): path of the saved Keras model. The default is the
      module-level ``model_path`` as it exists when this function is defined.

    Returns:
    - the encoder output for the preprocessed data (the latent encoding).

    Raises:
    - ValueError: if no file exists at model_path.
    """
    # Fail fast: check for the model file before preprocessing the data.
    if not os.path.exists(model_path):
        raise ValueError("Model path does not exists, please make sure it is in the right place")

    data_pp = Preprocess_fn(data)

    # The saved network is Sequential([encoder, decoder]); layer 0 is the encoder.
    mod = load_model(model_path)
    encoder = mod.get_layer(index=0)

    test_set_latent_encoding = encoder(data_pp)
    return test_set_latent_encoding
    
    

def calculate_reconstructions(latents, model_path = model_path): 
    """Decode latent vectors and map the result back to the original scale.

    Inputs:
    - latents: latent encodings, as produced by the encoder of the saved model.
    - model_path (str): path of the saved Keras model. The default is the
      module-level ``model_path`` as it exists when this function is defined.

    Returns:
    - the decoder output after inverse scaling via DeProcess_fn.

    Raises:
    - ValueError: if no file exists at model_path.
    """
    if not os.path.exists(model_path):
        raise ValueError("Model path does not exists, please make sure it is in the right place")

    # The decoder is the second layer (index 1) of the saved network.
    saved_network = load_model(model_path)
    scaled_output = saved_network.get_layer(index=1)(latents)

    # Revert the scaling so the reconstructions are comparable to the raw spectra.
    return DeProcess_fn(scaled_output)

Part 3: Test set reconstruction loss 

In [8]:
# Encode the test spectra into their latent representation using the saved model.
lats = reduce_dimensionality(x_test, model_path = model_path)

In [9]:
# Decode the latents and undo the scaling to obtain the reconstructed test spectra.
test_set_reconstructions = calculate_reconstructions(lats, model_path = model_path)

In [10]:
# Penalized reconstruction loss on the test set; the latent size is read from
# the second axis of lats.
reconstr_loss(x_test, test_set_reconstructions, latent_dim = lats.shape[1])

0.0022919134355735547