# Autoencoder

This file contains the code for training the autoencoder and for generating (and saving) the sentence embeddings. To generate the general sentence embeddings, the autoencoder is trained on all the data; to generate the specific sentence embeddings, it is trained only on the data of one chosen domain. In this work, the specific sentence embeddings are obtained by simply executing one additional cell, which is marked with the "CAUTION" comment. In addition, the batch_size was set to 32 for the general sentence embeddings and to 16 for the specific sentence embeddings, as noted in the corresponding cell.

Create the directories "weights/autoencoder/general/", "data/sentence_embeddings/general/unsorted/sentemb/" and "data/sentence_embeddings/general/unsorted/label_domain/" for saving the weights and sentence embeddings when training the autoencoder on all the data. For training the autoencoder on data of one domain exclusively create the directories "weights/autoencoder/specific/", "data/sentence_embeddings/specific/sentemb/" and "data/sentence_embeddings/specific/label_domain/".

## Importing libraries and setting configurations

In [None]:
from keras_self_attention import SeqSelfAttention
import os
import h5py
import numpy as np
import pandas as pd
import random as rn
import pickle as pkl
import tensorflow as tf
import sys

In [None]:
# Number of time steps per input sequence (first axis of the model input shape).
SEQUENCE_LEN = 50
# Size of each per-step input vector; also the width of the decoder's final Dense layer.
EMBED_SIZE = 300
# Number of LSTM units used in the encoder (per direction of the bidirectional wrapper).
LATENT_SIZE = 300
# NOTE(review): not referenced anywhere in the visible code — confirm whether it is still needed.
encoding_dim = 100

def load_data_from_file(filename):
    """Unpickle and return the object stored in the file at ``filename``."""
    with open(filename, mode='rb') as handle:
        payload = pkl.load(handle)
    return payload

def shuffle_data(data, labels, seed):
    """Reorder ``data`` and ``labels`` with one seeded random permutation.

    The same permutation is applied to the rows of ``data`` (axis 0) and to
    the columns of ``labels`` (axis 1), so the two stay aligned.
    """
    n_samples = data.shape[0]
    rng = np.random.RandomState(seed=seed)
    order = rng.permutation(n_samples)
    shuffled_data = data[order]
    shuffled_labels = labels[:, order]
    return shuffled_data, shuffled_labels

def filter_data_by_domain(X, label_domain, domain):
    """Select the samples whose domain entry equals ``domain``.

    The per-sample domain is read from row 1 of ``label_domain``. The matching
    positions pick rows of ``X`` and columns of ``label_domain``.
    """
    domain_row = label_domain[1]
    # Integer positions (not a boolean mask) so that indexing behaves exactly
    # like an explicit list of matching indices.
    matching_positions = np.flatnonzero(domain_row == domain)
    X_selected = X[matching_positions]
    labels_selected = label_domain[:, matching_positions]
    return X_selected, labels_selected

# Build autoencoder
def build_autoencoder(return_encoder=False):
    """Build the (uncompiled) sequence autoencoder.

    Architecture: bidirectional LSTM -> self-attention -> bidirectional LSTM
    (directions summed) produces one vector per sequence; that vector is
    repeated ``SEQUENCE_LEN`` times and decoded by a bidirectional LSTM
    followed by a Dense layer of width ``EMBED_SIZE``.

    Args:
        return_encoder: When True, also return the encoder sub-model (input ->
            per-sequence vector). It shares its layers with the autoencoder, so
            it can be used for prediction after the autoencoder is trained.
            Defaults to False, which keeps the original single return value.

    Returns:
        The autoencoder ``tf.keras.Model``, or the tuple
        ``(autoencoder, encoder_model)`` when ``return_encoder`` is True.
    """
    # encoder
    # The input shape is fixed by the Input layer, so the LSTMs need no
    # ``input_shape`` argument of their own.
    inp = tf.keras.Input(shape=(SEQUENCE_LEN, EMBED_SIZE))
    enc_seq = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(units=LATENT_SIZE, return_sequences=True)
    )(inp)
    # return_attention=True makes the layer return the attention weights as a
    # second output; only the attended sequence is used further.
    attended, _attn_weights = SeqSelfAttention(return_attention=True)(enc_seq)
    enc_out = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(units=LATENT_SIZE), merge_mode='sum'
    )(attended)

    # encoder model (to extract sentence embeddings later)
    encoder_model = tf.keras.Model(inputs=inp, outputs=enc_out)

    # Feed the same sentence vector to the decoder at every time step.
    rep_vec = tf.keras.layers.RepeatVector(SEQUENCE_LEN)(enc_out)

    # decoder
    dec_lstm_out = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(units=LATENT_SIZE, return_sequences=True),
        merge_mode='sum',
    )(rep_vec)
    dec_dense_out = tf.keras.layers.Dense(EMBED_SIZE)(dec_lstm_out)

    # entire autoencoder model
    autoencoder = tf.keras.Model(inp, dec_dense_out)
    if return_encoder:
        return autoencoder, encoder_model
    return autoencoder

def remove_unlabeled_entries(embeddings, label_domain_data):
    """
    Drop every sample whose label is 3 (unlabeled).

    ``label_domain_data`` holds one row each for label, domain and idx_domain
    and one column per sample. The row positions of the unlabeled samples are
    removed from both the label table and ``embeddings``.

    Returns the filtered embeddings and the filtered label/domain data, both
    transposed back so that samples run along the second axis.
    """
    labels_frame = pd.DataFrame(
        label_domain_data.transpose(),
        columns=['label', 'domain', 'idx_domain'],
    )
    unlabeled_rows = labels_frame.index[labels_frame['label'] == 3].tolist()

    def _drop_unlabeled(frame):
        # Filter by row position, renumber, and return as a transposed array.
        keep = ~frame.index.isin(unlabeled_rows)
        return frame[keep].reset_index(drop=True).to_numpy().transpose()

    label_domain_filtered = _drop_unlabeled(labels_frame)
    embeddings_filtered = _drop_unlabeled(pd.DataFrame(embeddings))

    return embeddings_filtered, label_domain_filtered

def save_embeddings(embedding_type, embeddings, label_domain_merged, label_domain_test):
    """
    Pickle the embeddings and the train/test label-domain data to disk.

    Args:
        embedding_type: 'GENERAL' writes below
            "data/sentence_embeddings/general/unsorted"; any other value is
            treated as SPECIFIC and writes below
            "data/sentence_embeddings/specific".
        embeddings: Object written to "sentemb/sentemb_unlabeled14.p".
        label_domain_merged: Object written to
            "label_domain/label_domain_train_sentemb_unlabeled14.p".
        label_domain_test: Object written to
            "label_domain/label_domain_test_sentemb_unlabeled14.p".
    """
    if embedding_type == 'GENERAL':
        path_prefix = "data/sentence_embeddings/general/unsorted"
    else:  # SPECIFIC
        path_prefix = "data/sentence_embeddings/specific"

    outputs = (
        (f"{path_prefix}/sentemb/sentemb_unlabeled14.p", embeddings),
        (f"{path_prefix}/label_domain/label_domain_train_sentemb_unlabeled14.p", label_domain_merged),
        (f"{path_prefix}/label_domain/label_domain_test_sentemb_unlabeled14.p", label_domain_test),
    )
    for path, payload in outputs:
        # Create the target directory on demand so that a missing folder does
        # not discard the results of a finished training run.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        # The context manager closes (and flushes) the file even if dump fails.
        with open(path, "wb") as handle:
            pkl.dump(payload, handle)

def process_and_save_embeddings(encoder_model, data, label_domain_merged, label_domain_test, embedding_type):
    """
    Predict sentence embeddings, drop the unlabeled samples and save the result.

    Args:
        encoder_model: Object with a ``predict`` method; called on ``data`` to
            produce one embedding row per sample.
        data: Input passed unchanged to ``encoder_model.predict``.
        label_domain_merged: Label/domain array with rows label, domain and
            idx_domain; samples whose label is 3 are removed.
        label_domain_test: Saved unchanged next to the filtered data.
        embedding_type: 'GENERAL' or 'SPECIFIC'; selects the output folder.

    Returns:
        ``(filtered_embeddings, filtered_labels)``, both with samples along the
        second axis.
    """
    # Use encoder to generate sentence embeddings
    sentence_embeddings = encoder_model.predict(data)

    # Filtering and saving are delegated to the dedicated helpers so that this
    # function cannot drift from them (it previously wrote SPECIFIC results to
    # a different folder than save_embeddings).
    filtered_embeddings, filtered_labels = remove_unlabeled_entries(
        sentence_embeddings, label_domain_merged
    )
    save_embeddings(embedding_type, filtered_embeddings, filtered_labels, label_domain_test)

    return filtered_embeddings, filtered_labels

def _load_sentence_inputs(env_var):
    """Load a pickled input array from the file path stored in ``env_var``."""
    path = os.environ.get(env_var)
    if not path:
        raise RuntimeError(
            f"No input data available: pass the array to main() or set the "
            f"{env_var} environment variable to the path of the pickled array."
        )
    return load_data_from_file(path)


def _encoder_from_autoencoder(autoencoder):
    """Build the encoder sub-model (input -> sentence vector) of an autoencoder."""
    # The sentence vector is whatever is fed into the RepeatVector layer.
    repeat_layer = next(
        (layer for layer in autoencoder.layers
         if isinstance(layer, tf.keras.layers.RepeatVector)),
        None,
    )
    if repeat_layer is None:
        raise ValueError("autoencoder has no RepeatVector layer to locate the encoder output")
    # Shares layers (and therefore trained weights) with the autoencoder.
    return tf.keras.Model(inputs=autoencoder.input, outputs=repeat_layer.input)


def main(X_merged=None, X_test=None):
    """Train the autoencoder and save the resulting sentence embeddings.

    Configuration is read from the environment:
        EMBEDDING_TYPE: 'GENERAL' (default) trains on all data, 'SPECIFIC'
            trains only on the samples of one domain.
        DOMAIN_NUMBER: Domain used for 'SPECIFIC' (default 5).

    Args:
        X_merged: Model inputs aligned with 'domain_and_label_merged14.p'.
        X_test: Model inputs aligned with 'domain_and_label_test14.p'.

    NOTE(review): the previous version used ``X_merged`` / ``X_test`` without
    ever defining them. When they are not passed in, they are now loaded from
    the files named by the X_MERGED_PATH / X_TEST_PATH environment variables.
    Both the variable names and the pickle format are assumptions — confirm
    against the preprocessing step that produces these arrays.

    Raises:
        RuntimeError: If an input array is neither passed nor configured.
        ValueError: If EMBEDDING_TYPE is unknown or no samples remain.
    """
    # Load data
    label_domain_test = load_data_from_file('domain_and_label_test14.p')
    label_domain_merged = load_data_from_file('domain_and_label_merged14.p')
    if X_merged is None:
        X_merged = _load_sentence_inputs('X_MERGED_PATH')
    if X_test is None:
        X_test = _load_sentence_inputs('X_TEST_PATH')

    # Shuffle data
    X_merged, label_domain_merged = shuffle_data(X_merged, label_domain_merged, seed=42)
    X_test, label_domain_test = shuffle_data(X_test, label_domain_test, seed=43)

    # Read the configuration once; reject unknown values so that a misspelled
    # type cannot train on all data while saving under the specific paths.
    embedding_type = os.environ.get('EMBEDDING_TYPE', 'GENERAL').strip().upper()
    if embedding_type not in ('GENERAL', 'SPECIFIC'):
        raise ValueError(
            f"EMBEDDING_TYPE must be 'GENERAL' or 'SPECIFIC', got {embedding_type!r}"
        )
    domain = int(os.environ.get('DOMAIN_NUMBER', 5))  # default is 5

    # Filter data based on domain if embedding type is SPECIFIC
    if embedding_type == 'SPECIFIC':
        X_merged, label_domain_merged = filter_data_by_domain(X_merged, label_domain_merged, domain)
        X_test, label_domain_test = filter_data_by_domain(X_test, label_domain_test, domain)

    # Concatenate train and test data
    data = np.concatenate([X_merged, X_test])
    if data.shape[0] == 0:
        raise ValueError(f"No samples left to train on (domain={domain}, type={embedding_type})")

    # Build and compile the autoencoder model
    autoencoder = build_autoencoder()
    autoencoder.summary()
    autoencoder.compile(optimizer='adam', loss='mse')
    encoder_model = _encoder_from_autoencoder(autoencoder)

    # Callbacks
    es = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

    # Set paths and batch sizes depending on embedding type
    if embedding_type == 'GENERAL':
        checkpoint_path = "weights/autoencoder/general/autoencoder_weights_with_unlabeled14.h5"
        batch_size = 32
    else:  # SPECIFIC
        # NOTE(review): this file name does not depend on DOMAIN_NUMBER, so
        # runs for different domains overwrite each other — confirm intent.
        checkpoint_path = "weights/autoencoder/specific/autoencoder_weights_with_unlabeled4_5.h5"
        batch_size = 16
    checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path)

    # Train the model (input reconstructs itself)
    autoencoder.fit(data, data, epochs=25, callbacks=[checkpoint, es], batch_size=batch_size)

    # Process and save based on the embedding type
    process_and_save_embeddings(encoder_model, data, label_domain_merged, label_domain_test, embedding_type)



if __name__ == "__main__":
    main()



In [8]:
# Use the encoder to generate the general sentence embeddings for everything in `data`.
# NOTE(review): `encoder_model` and `data` are only created inside functions in the code
# above, so they must already exist at notebook scope when this cell is run — confirm.
sentence_embeddings_gen = encoder_model.predict(data)

In [439]:
# Use the encoder to generate the specific sentence embeddings for everything in `data`.
# NOTE(review): presumably `data` holds only the chosen domain and `encoder_model` was
# trained on it at this point — neither is established by this cell; confirm.
sentence_embeddings_spec = encoder_model.predict(data)

In [332]:
# Shape of the general embeddings as returned by encoder_model.predict (not transposed).
sentence_embeddings_gen.shape

(62224, 300)

In [426]:
# NOTE(review): the recorded output below has 300 on the first axis, which matches the
# transposed array produced by the specific-embeddings filtering cell rather than the raw
# predict output — presumably this cell was run after that one; confirm.
sentence_embeddings_spec.shape

(300, 2000)

In [79]:
# Shape of the label/domain array: one row each for label, domain and idx_domain (the
# column names used when it is turned into a DataFrame), one column per sample.
label_domain_merged.shape

(3, 1599)

# Process for the general Embeddings

In [10]:

import pandas as pd

# Remove the unlabeled samples (label == 3) from the general embeddings and
# from the matching label/domain array.
# NOTE: this cell rebinds `label_domain_merged` to the filtered array, so it
# must not be run twice against the same `sentence_embeddings_gen`.

# Label/domain table: one row per sample with its label, domain and idx_domain.
df = pd.DataFrame(label_domain_merged.transpose(), columns = ['label','domain','idx_domain'])

df_merged = pd.DataFrame(sentence_embeddings_gen)

# Row positions of the unlabeled samples.
list_unlabel = df.index[df['label'] == 3].to_list()


#Delete the rows with label=3 (unlabeled) in the label_merged
df_label_merged = df[~df.index.isin(list_unlabel)].reset_index(drop=True)

array = df_label_merged.to_numpy()
label_domain_merged = array.transpose()

# Delete the same row positions from the embeddings.
df_merged2 = df_merged[~df_merged.index.isin(list_unlabel)].reset_index(drop=True)

# Transposed so that samples run along the second axis.
array2 = df_merged2.to_numpy()
sentence_embeddings = array2.transpose()

In [518]:
# Shape after dropping the unlabeled rows and transposing (samples along the second axis).
sentence_embeddings.shape

(300, 31780)

# Process for Specific Embeddings

In [440]:
import pandas as pd

# Remove the unlabeled samples (label == 3) from the specific embeddings and
# from the matching label/domain array. Both `label_domain_merged` and
# `sentence_embeddings_spec` are rebound to the filtered, transposed results.
df_labels_spec = pd.DataFrame(
    label_domain_merged.transpose(),
    columns = ['label','domain','idx_domain'],
)
df_merged_spec = pd.DataFrame(sentence_embeddings_spec)

# Row positions of the unlabeled samples.
is_unlabeled_spec = df_labels_spec['label'] == 3
list_unlabel_spec = df_labels_spec.index[is_unlabeled_spec].to_list()

# Labels: drop those rows, renumber, and turn back into a (rows x samples) array.
keep_labels_spec = ~df_labels_spec.index.isin(list_unlabel_spec)
df_labels_spec2 = df_labels_spec[keep_labels_spec].reset_index(drop=True)
array = df_labels_spec2.to_numpy()
label_domain_merged = array.transpose()

# Embeddings: drop the same row positions and transpose the same way.
keep_embeddings_spec = ~df_merged_spec.index.isin(list_unlabel_spec)
df_merged_spec2 = df_merged_spec[keep_embeddings_spec].reset_index(drop=True)
array2 = df_merged_spec2.to_numpy()
sentence_embeddings_spec = array2.transpose()

General Sentence Embeddings

In [11]:
# save the sentence embeddings as well as the shuffled labels
# here the general embeddings were saved in the general embeddings folder
# change directories to "data/sentence_embeddings/specific/sentemb/" and "data/sentence_embeddings/specific/label_domain/" for specific sentence embeddings
# Each file is opened in a `with` block so that it is flushed and closed right after the dump.
with open("data/sentence_embeddings/general/unsorted/sentemb/sentemb_unlabeled14.p", "wb") as f:
    pkl.dump(sentence_embeddings, f)
with open("data/sentence_embeddings/general/unsorted/label_domain/label_domain_train_sentemb_unlabeled14.p", "wb") as f:
    pkl.dump(label_domain_merged, f)
with open("data/sentence_embeddings/general/unsorted/label_domain/label_domain_test_sentemb_unlabeled14.p", "wb") as f:
    pkl.dump(label_domain_test, f)




Specific Sentence Embeddings

In [441]:
# save the specific sentence embeddings as well as the shuffled labels
# here the specific embeddings are saved in the specific embeddings folder
# Each file is opened in a `with` block so that it is flushed and closed right after the dump.
with open("data/sentence_embeddings/specific/sentemb/sentemb_unlabeled4_5.p", "wb") as f:
    pkl.dump(sentence_embeddings_spec, f)
with open("data/sentence_embeddings/specific/label_domain/label_domain_train_sentemb_unlabeled4_5.p", "wb") as f:
    pkl.dump(label_domain_merged, f)
with open("data/sentence_embeddings/specific/label_domain/label_domain_test_sentemb_unlabeled4_5.p", "wb") as f:
    pkl.dump(label_domain_test, f)

## Feeding preprocessed data into an already trained Autoencoder to predict sentence embeddings

In [335]:
# general sentence embeddings
# data needs to be the full data without the cell on top of this file being executed for selecting a domain
# Load previously trained weights into the autoencoder, then predict with the encoder.
# NOTE(review): the training code above writes "autoencoder_weights_with_unlabeled14.h5",
# while this cell loads "...unlabeled4.h5" — confirm which file is intended.
autoencoder.load_weights('weights/autoencoder/general/autoencoder_weights_with_unlabeled4.h5')
sentence_embeddings_general = encoder_model.predict(data)

In [348]:
# specific sentence embeddings
# data needs to be of only one domain, the cell for selecting the domain on top of the file needs to be executed
# here domain 0 was chosen (MR) as an example
# NOTE(review): the comment above says domain 0, but the weight file ends in "_1" — confirm
# how the file suffix maps to the domain number.
autoencoder.load_weights('weights/autoencoder/specific/autoencoder_weights_with_unlabeled4_1.h5')
# The predict call is commented out, so this cell only loads the weights.
#sentence_embeddings_specific_1 = encoder_model.predict(data)

In [406]:
# Shape of the embeddings predicted with the reloaded general weights (raw predict output).
sentence_embeddings_general.shape

(62169, 300)

In [405]:
# NOTE(review): `sentence_embeddings_spec` is not assigned by the weight-loading cells above
# (the specific predict call there is commented out), so this shows whatever an earlier
# cell left in it — confirm which run the recorded output belongs to.
sentence_embeddings_spec.shape

(300, 1970)