<a href="https://colab.research.google.com/github/ipeirotis/autoencoders_census/blob/main/missing_value_imputation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Autoencoders and Data Quality for Tabular Data**

In [None]:
# Start from a clean checkout: delete any earlier copy of the repository,
# clone it again, and make it the working directory for the following cells.
!rm -rf autoencoders_census
!git clone https://github.com/ipeirotis/autoencoders_census.git
%cd autoencoders_census

Cloning into 'autoencoders_census'...
remote: Enumerating objects: 355, done.[K
remote: Counting objects: 100% (232/232), done.[K
remote: Compressing objects: 100% (160/160), done.[K
remote: Total 355 (delta 138), reused 123 (delta 72), pack-reused 123[K
Receiving objects: 100% (355/355), 37.09 MiB | 7.54 MiB/s, done.
Resolving deltas: 100% (204/204), done.
/content/autoencoders_census


In [None]:
from google.colab import auth

# Log in with the Google account that has access to the Google Cloud project,
# so that later cells can access that project's resources.
auth.authenticate_user()

In [None]:
# Install import_ipynb and keras-tuner, then import the project classes
# Table2Vector, DataLoader and AutoencoderModel from the notebooks
# pandas2vector.ipynb, data_loader.ipynb and autoencoder.ipynb.
!pip install -q import_ipynb keras-tuner
import import_ipynb
from pandas2vector import Table2Vector
from data_loader import DataLoader
from autoencoder import AutoencoderModel

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.1/176.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25himporting Jupyter notebook from pandas2vector.ipynb
importing Jupyter notebook from data_loader.ipynb
importing Jupyter notebook from autoencoder.ipynb


# Data Source

The data used here comes from the Youth Risk Behavior Surveillance System (YRBSS), run by the CDC (Centers for Disease Control and Prevention). It consists of a set of surveys that track behaviors that can lead to poor health among students in grades 9 through 12. The dataset is available [here](https://www.cdc.gov/healthyyouth/data/yrbs/data.htm).

Note: The dataset is updated every two years. We use the 2017 version of the national high school YRBS dataset.

## Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import confusion_matrix
from pandas.api.types import is_numeric_dtype
from matplotlib.offsetbox import TransformedBbox

In [None]:
from statsmodels.distributions.empirical_distribution import ECDF
from scipy.interpolate import interp1d
from numpy.random import uniform
from numpy.random import seed

In [None]:
from keras.layers import Input
from keras.layers import Dense
from keras.layers import BatchNormalization
from keras import backend as K
from keras import Model

In [None]:
import tensorflow as tf
from tensorflow.random import set_seed
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow import keras

# Load Original Dataset

In [None]:

# Data Source
# load_2017() returns two objects: the survey table (project_data) and
# variable_types, which is handed to Table2Vector further down.
# NOTE(review): presumably variable_types describes the type of each column —
# confirm against data_loader.ipynb.
data_loader = DataLoader()

project_data, variable_types  = data_loader.load_2017()



In [None]:
# (rows, columns) of the loaded table.
project_data.shape

(14765, 98)

In [None]:
# Shape after dropping every row that contains a NaN. If it equals the shape
# of the full table, no row of project_data contains a NaN.
project_data.dropna().shape

(14765, 98)

# Vectorizing the data set

In [None]:
# Data Transformation
# Build the converter between the tabular and the vectorized representation
# from the variable type information returned by the data loader.
vectorizer = Table2Vector(variable_types)


# Without indicator variables for missingness
# NOTE(review): the vectorized columns displayed later in this notebook
# include names such as `age__nan` and `stweight_cat__missing` — confirm what
# "without indicator variables" refers to here.
vectorized_df = vectorizer.vectorize_table(project_data)

In [None]:
# Round-trip sanity check: turning the vectors back into a table must give a
# frame with the same shape as the original table.
assert vectorizer.tabularize_vector(vectorized_df).shape == project_data.shape

# Autoencoder loading

In [None]:
import shutil

from tensorflow.keras.models import load_model
from tensorflow.keras.utils import custom_object_scope
import gcsfs


# Define the path to the file on GCS
gcs_model_path = "gs://autoencoder_census_models/best_parameters_autoencoder.h5"

# Define the local path where the model will be downloaded
local_model_path = "best_parameters_autoencoder.h5"

# Create a GCS filesystem object using your project ID
fs = gcsfs.GCSFileSystem(project='autoencoder-census')

# Download the model file from GCS. The copy is streamed in chunks so the
# whole model file never has to be held in memory at once.
with fs.open(gcs_model_path, 'rb') as gcs_file, open(local_model_path, 'wb') as local_file:
    shutil.copyfileobj(gcs_file, local_file)


# Initialize AutoencoderModel with attribute_cardinalities
# to use the custom loss function.
# The cardinalities are taken from the `unique` row of DataFrame.describe(),
# i.e. one distinct-value count per column of project_data.
attr_cardinalities = list(project_data.describe().T['unique'].values)
autoencoder_model = AutoencoderModel(attr_cardinalities)


# Load the model from the local file. The custom loss is registered under the
# name 'custom_categorical_crossentropy' so that Keras can resolve it while
# deserializing the saved model.
with custom_object_scope({'custom_categorical_crossentropy': autoencoder_model.custom_categorical_crossentropy}):
    model = load_model(local_model_path)




## Making Predictions and Filling in the Missing Values


In [None]:
import warnings

import numpy as np

# Stop iterating once the total absolute change between two consecutive
# passes is no larger than epsilon.
epsilon = 1e-5

# Upper bound on the number of passes, so the cell always terminates even if
# the autoencoder never settles on a fixed point within epsilon.
max_iterations = 100

# Positions that are missing in the vectorized data. They do not change while
# iterating, so the mask is computed once instead of on every pass.
mask = vectorized_df.isna()

# Fill the missing values with some initial guess. Here, the median of each column is used.
filled = vectorized_df.fillna(vectorized_df.median())
old_filled = None

for iteration in range(max_iterations):
    # Save the old filled DataFrame for the convergence check
    old_filled = filled.copy()

    # Run the data through the autoencoder, which will return a complete version of the data.
    predicted = model.predict(filled)

    # Overwrite only the originally missing positions with the autoencoder's
    # output; the observed values stay unchanged.
    filled[mask] = predicted

    # Written as `not (change > epsilon)` so that a NaN change also ends the
    # loop, exactly as the condition `change > epsilon` being False would.
    if not np.abs(filled - old_filled).sum().sum() > epsilon:
        break
else:
    warnings.warn(
        f"Imputation did not converge within {max_iterations} iterations; "
        "using the values from the last pass."
    )




In [None]:
# Display the vectorized table after imputation.
filled

Unnamed: 0,age__12 years old or younger,age__13 years old,age__14 years old,age__15 years old,age__16 years old,age__17 years old,age__18 years old or older,age__nan,sex__Female,sex__Male,...,stheight_cat__top-1%,stweight_cat__missing,stweight_cat__normal,stweight_cat__top-1%,bmi_cat__bottom-1%,bmi_cat__missing,bmi_cat__normal,bmi_cat__top-1%,bmipct_cat__missing,bmipct_cat__normal
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14760,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
14761,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
14762,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
14763,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


Now that we have filled in the missing values of the original dataset with their most likely values, we run the Autoencoder on the filled vectorized dataset.

In [None]:
# Reconstruct the filled data with the autoencoder. The raw prediction is a
# plain array, so the row index and column labels of `filled` are attached
# explicitly; this keeps `predicted` aligned with `filled` row by row and
# column by column, whatever index `filled` carries.
predicted = pd.DataFrame(
    model.predict(filled),
    index=filled.index,
    columns=filled.columns,
)
predicted



Unnamed: 0,age__12 years old or younger,age__13 years old,age__14 years old,age__15 years old,age__16 years old,age__17 years old,age__18 years old or older,age__nan,sex__Female,sex__Male,...,stheight_cat__top-1%,stweight_cat__missing,stweight_cat__normal,stweight_cat__top-1%,bmi_cat__bottom-1%,bmi_cat__missing,bmi_cat__normal,bmi_cat__top-1%,bmipct_cat__missing,bmipct_cat__normal
0,0.008854,0.000914,0.053996,0.170452,0.253586,0.304417,0.168781,0.039001,0.470622,0.462397,...,0.000246,0.959625,0.035834,0.004541,0.000376,0.936705,0.057410,0.005509,0.963258,0.036742
1,0.012965,0.002962,0.159671,0.259041,0.247787,0.181032,0.101714,0.034827,0.623683,0.315528,...,0.000220,0.955455,0.040486,0.004058,0.000456,0.926331,0.065801,0.007413,0.958311,0.041689
2,0.016545,0.001026,0.051786,0.151486,0.251950,0.295202,0.175920,0.056084,0.519820,0.391533,...,0.000083,0.992081,0.006649,0.001269,0.000300,0.983821,0.013776,0.002103,0.993168,0.006832
3,0.022097,0.000936,0.042158,0.132901,0.240201,0.303495,0.187389,0.070823,0.486585,0.405377,...,0.000047,0.996512,0.002858,0.000630,0.000250,0.992107,0.006538,0.001105,0.997059,0.002941
4,0.009249,0.006250,0.255473,0.321879,0.192143,0.124668,0.069279,0.021059,0.487928,0.479467,...,0.000526,0.872751,0.121860,0.005389,0.000498,0.830045,0.160965,0.008492,0.873188,0.126812
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14760,0.440979,0.002338,0.029040,0.086511,0.098537,0.184884,0.106180,0.051531,0.200075,0.566763,...,0.000236,0.994159,0.004860,0.000980,0.010492,0.971495,0.015340,0.002673,0.994528,0.005472
14761,0.419166,0.003395,0.034914,0.091447,0.105248,0.211215,0.088094,0.046521,0.194265,0.507922,...,0.000160,0.996363,0.002976,0.000661,0.017377,0.968620,0.011839,0.002164,0.996454,0.003547
14762,0.377622,0.002574,0.039877,0.108457,0.111794,0.207346,0.116109,0.036221,0.180140,0.658709,...,0.000993,0.972230,0.023858,0.003912,0.013899,0.917893,0.059903,0.008305,0.972714,0.027286
14763,0.005556,0.002863,0.163504,0.245987,0.257230,0.264514,0.058919,0.001427,0.560637,0.435741,...,0.000623,0.004186,0.981111,0.014703,0.000266,0.003915,0.986089,0.009731,0.003934,0.996066


In [None]:
# Convert the autoencoder's vectorized output back into tabular form.
tabular_from_predicted = vectorizer.tabularize_vector(predicted)