<a href="https://colab.research.google.com/github/ipeirotis/autoencoders_census/blob/main/hyperparameter_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Autoencoders and Data Quality for Tabular Data**

This notebook searches the hyperparameter space to find the best-performing architecture for our autoencoder, trains the final model, and saves the resulting model file to a Google Cloud Storage bucket.

In [1]:
# Start from a clean checkout: delete any previous clone, re-clone the project
# repository, and make the clone the working directory for the rest of the
# notebook (presumably so the project notebooks imported below can be found).
!rm -rf autoencoders_census
!git clone https://github.com/ipeirotis/autoencoders_census.git
%cd autoencoders_census

Cloning into 'autoencoders_census'...
remote: Enumerating objects: 220, done.
remote: Counting objects: 100% (97/97), done.
remote: Compressing objects: 100% (97/97), done.
remote: Total 220 (delta 57), reused 0 (delta 0), pack-reused 123
Receiving objects: 100% (220/220), 7.38 MiB | 11.07 MiB/s, done.
Resolving deltas: 100% (123/123), done.
/content/autoencoders_census


In [2]:
from google.colab import auth

# Authenticate this Colab session. Log in with the account that has access to
# the Google project so that later cells can reach the project's resources
# (the model is uploaded to a Google Cloud Storage bucket at the end).
auth.authenticate_user()

In [3]:
!pip install -q import_ipynb keras-tuner
import import_ipynb
from pandas2vector import Table2Vector
from data_loader import DataLoader
from autoencoder import AutoencoderModel

     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 176.1/176.1 kB 9.4 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.6/1.6 MB 72.3 MB/s eta 0:00:00
importing Jupyter notebook from pandas2vector.ipynb
importing Jupyter notebook from data_loader.ipynb
importing Jupyter notebook from autoencoder.ipynb


# Data Source

The data comes from the Youth Risk Behavior Surveillance System (YRBSS), run by the CDC (Centers for Disease Control and Prevention). It consists of a set of surveys that track behaviors that can lead to poor health among students in grades 9 through 12. The dataset is available [here](https://www.cdc.gov/healthyyouth/data/yrbs/data.htm).

Note: The dataset is updated every two years. We use the 2017 version of the national high school YRBS dataset.

## Libraries

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras

# Load Original Dataset

In [5]:
# Data Source
# Load the original survey table, then derive the subset used in this project.
# prepare_original_dataset returns two values: the prepared table and
# `variable_types`, which is handed to Table2Vector in the transformation step.
# In the recorded run the table goes from (14765, 305) to (14765, 108).
data_loader = DataLoader()
original_df = data_loader.load_original_data()
project_data, variable_types = data_loader.prepare_original_dataset(original_df)

In [6]:
# (rows, columns) of the table returned by load_original_data()
original_df.shape

(14765, 305)

In [7]:
# (rows, columns) of the table returned by prepare_original_dataset()
project_data.shape

(14765, 108)

# Transforming the original data set

In [8]:
# Data Transformation
# Build the vectorizer from the variable types produced by the data loader.
vectorizer = Table2Vector(variable_types)


# Without indicator variables for missingness
# In the recorded run, the 108 prepared columns expand to 563 vector columns.
vectorized_df = vectorizer.vectorize_table(project_data)

In [9]:
# (rows, columns) of the vectorized table fed to the autoencoder
vectorized_df.shape

(14765, 563)

In [10]:
# Round-trip sanity check: converting the vectors back to tabular form should
# give the same shape as project_data ((14765, 108) in the recorded run).
vectorizer.tabularize_vector(vectorized_df).shape

(14765, 108)

# Autoencoder training

## Hyperparameter search and model training

This section searches for the best hyperparameters and then trains the final autoencoder model, while handling missing data with mask variables.

In [11]:
# Create the model wrapper and override its search/training settings
model = AutoencoderModel()
model.EXECUTIONS_PER_TRIAL = 5
model.EPOCHS = 50
model.MAX_TRIALS = 100

# Split the vectorized table into the train and test sets used below
X_train, X_test = model.split_train_test(vectorized_df)

In [None]:
# Define the tuner
# (presumably a Keras Tuner object configured from the model.* settings set
# above; keras-tuner is installed at the top of the notebook)
tuner = model.define_tuner()

# Perform hyperparameter search
# The autoencoder reconstructs its own input, so X_train / X_test serve as both
# features and targets.
tuner.search(X_train, X_train, epochs=model.EPOCHS,
             # batch_size is intentionally not passed here: the search log
             # lists `batch_size` among the tuned hyperparameters.
             validation_data=(X_test, X_test))

# Get the best hyperparameters and build the final model
# ([0] takes the first entry of the returned list, presumably the best trial)
best_hps = tuner.get_best_hyperparameters()[0]
final_model = model.build_autoencoder(best_hps)

Trial 1 Complete [00h 15m 10s]
val_loss: 0.060883885622024535

Best val_loss So Far: 0.060883885622024535
Total elapsed time: 00h 15m 10s

Search: Running Trial #2

Value             |Best Value So Far |Hyperparameter
0.001             |0.001             |learning_rate
32                |16                |batch_size
128               |112               |encoder_units_1
32                |48                |encoder_units_2
256               |240               |decoder_units_1
160               |96                |decoder_units_2

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch

In [None]:
# Train the final model
# The model is trained for ten times the per-trial epoch budget. The batch size
# is one of the tuned hyperparameters (see the search log), so it is passed
# explicitly here; otherwise fit() would fall back to its default batch size
# and the final model would not be trained with the configuration that won
# the search.
history = final_model.fit(X_train, X_train,
                          epochs=10*model.EPOCHS,
                          batch_size=best_hps.get('batch_size'),
                          verbose=1,
                          validation_data=(X_test, X_test))

Visualize the training and validation loss values over epochs.

In [None]:
def model_analysis(train_loss, val_loss):
    """Plot the per-epoch loss curves and print the final root of each loss.

    Args:
        train_loss: Sequence with one training-loss value per epoch
            (e.g. ``history.history['loss']``).
        val_loss: Sequence with one validation-loss value per epoch; it is
            plotted against the same epoch axis, so it must have the same
            length as ``train_loss``.

    The printed values are the square roots of the last entries of each
    sequence. Given that the plotted loss is the mean squared error (as the
    figure title states), these are root-mean-squared errors, and they are
    labelled as such.
    """
    epochs = range(1, len(train_loss) + 1)

    plt.figure(figsize=(8, 6))
    plt.title("Mean squared error")
    sns.lineplot(x=epochs, y=train_loss, label="Train", linewidth=3)
    sns.lineplot(x=epochs, y=val_loss, label="Validation", linewidth=3)
    plt.xlabel("Epochs")
    plt.ylabel("Loss")

    plt.legend()
    plt.show()

    # sqrt(MSE) is the RMSE, so label it accordingly.
    print(f"Training RMSE = {np.sqrt(train_loss[-1])}")
    print(f"Validation RMSE = {np.sqrt(val_loss[-1])}")

# Plot the loss curves recorded while training the final model
model_analysis(history.history['loss'], history.history['val_loss'])

In [None]:
import gcsfs

# Create a GCS filesystem object using your project ID
# NOTE(review): GCP project IDs normally cannot contain underscores; confirm
# that 'autoencoder_census' is the intended project identifier.
fs = gcsfs.GCSFileSystem(project='autoencoder_census')

# Save the model locally
model_filename = "best_parameters_autoencoder.h5"
final_model.save(model_filename)

# Define the path to the file on GCS (same object name as the local file)
gcs_model_path = f"gs://autoencoder_census_models/{model_filename}"

# Upload the local file to GCS. Let the filesystem object perform the copy
# instead of reading the whole file into memory and writing it back by hand.
fs.put(model_filename, gcs_model_path)


In [None]:


# Store the hyperparameters and evaluation metrics in a dictionary
hyperparameters_dict = {"learning_rate": best_hps.get('learning_rate'),
                        "batch_size": best_hps.get('batch_size'),
                        # Record the number of epochs the final fit actually
                        # ran (one loss entry per epoch) instead of a
                        # hard-coded value that does not match the
                        # 10 * model.EPOCHS used for training.
                        "num_epochs": len(history.history['loss']),
                        "loss": history.history['loss'][-1],
                        "val_loss": history.history['val_loss'][-1]}