### Train Hierarchical Attention Network

- Joel Stremmel
- 04-19-23

##### About

Train a Hierarchical Attention Network (HAN) on the formatted data using K-Fold Cross-Validation and save the scores.

##### Imports

In [1]:
import os
import re
import glob
import pickle
import torch
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)

import tensorflow as tf
from transformers import TFBertModel
from tensorflow.keras.layers import Input, Dense, Concatenate, GlobalAveragePooling1D, Dropout, LSTM, Bidirectional, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

##### Set Parameters

In [2]:
max_seq_len = 1024
max_num_utterances = 32
batch_size = 32
accumulation_steps = 1
lr = 0.00002
weight_decay = 0.01
adam_beta1 = 0.9
adam_beta2 = 0.999
adam_epsilon = 0.00000001
warmup_steps = 2
logging_steps = 1
num_workers = 8
seed = 44
epochs = 5
fp16 = True
output_dir = "lf_output"
lm_path = "kiddothe2b/longformer-mini-1024"

##### Disable Tokenizer Parallelism
This is mostly to avoid warnings.

In [3]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

##### Load Formatted Data

In [4]:
with open('data/X_folds.pkl', 'rb') as f:
    X_folds = pickle.load(f)

with open('data/y_folds.pkl', 'rb') as f:
    y_folds = pickle.load(f)

##### Check Data Shape

In [5]:
assert len(X_folds) == len(y_folds), "Expected the same number of folds in X and y."
X = list(X_folds.values())
y = list(y_folds.values())

##### Check Target Prevalence

In [6]:
print(f"Target prevalance: {np.mean(np.concatenate(y))}.")

Target prevalance: 0.5277777777777778.


##### Check that GPU is Available

In [7]:
assert torch.cuda.is_available(), "Run this script on a GPU."
print(torch.__version__)

1.8.1+cu101


##### Tokenize Text and Fit Model to Each Fold

In [8]:
y_probs, y_trues = [], []
for i in range(len(X)):
    
    print(f"Fitting model using fold {i} as out of fold data.")
    
    # Identify train folds and shuffle samples
    X_train, y_train = np.concatenate(X[0:i] + X[i+1:], axis=0), np.concatenate(y[0:i] + y[i+1:], axis=0)
    indices = np.arange(len(y_train))
    np.random.shuffle(indices)
    X_train, y_train = X_train[indices], y_train[indices]
    
    # Identify test folds
    X_test, y_test = X[i], y[i]
    
    # Format text and label data as HuggingFace dataset
    train_dataset = Dataset.from_dict({"text": X_train, "label": y_train})
    test_dataset = Dataset.from_dict({"text": X_test, "label": y_test})

    # Define the input layers
    full_text_input = Input(shape=(max_seq_len,), name='full_text_input')
    utterance_input = Input(shape=(max_num_utterances, max_seq_len), name='utterance_input')

    # Define the pre-trained transformer model (e.g., BERT, RoBERTa)
    pretrained_model = TFBertModel.from_pretrained('bert-base-uncased')
    for layer in pretrained_model.layers:
        layer.trainable = False

    # Define the hierarchical attention network architecture
    full_text_encoding = pretrained_model(full_text_input)[0]
    full_text_encoding = GlobalAveragePooling1D()(full_text_encoding)
    full_text_encoding = Dropout(0.2)(full_text_encoding)

    utterance_encodings = []
    for i in range(max_num_utterances):
        utterance_encoding = pretrained_model(utterance_input[:, i, :])[0]
        utterance_encoding = Bidirectional(LSTM(64, return_sequences=True))(utterance_encoding)
        attention_weights = Dense(1, activation='tanh')(utterance_encoding)
        attention_weights = tf.squeeze(attention_weights, axis=-1)
        attention_weights = tf.nn.softmax(attention_weights, axis=-1)
        utterance_encoding = tf.matmul(tf.transpose(attention_weights, [0, 2, 1]), utterance_encoding)
        utterance_encoding = LayerNormalization()(utterance_encoding)
        utterance_encoding = GlobalAveragePooling1D()(utterance_encoding)
        utterance_encoding = Dropout(0.2)(utterance_encoding)
        utterance_encodings.append(utterance_encoding)

    participant_output = Concatenate()(utterance_encodings)
    participant_output = Dense(128, activation='relu')(participant_output)
    participant_output = Dropout(0.2)(participant_output)
    participant_output = Dense(1, activation='sigmoid', name='participant_output')(participant_output)

    model = Model(inputs=[full_text_input, utterance_input], outputs=[participant_output])

    # Compile the model with a categorical cross-entropy loss function
    optimizer = Adam(lr=2e-5)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    model.fit(
        {'full_text_input': X_train_full, 'utterance_input': X_train_utterances},
        {'participant_output': y_train},
        validation_data=None,
        batch_size=32,
        epochs=10,
        callbacks=[]
    )

    # Predict on test dataset
    output = trainer.predict(test_dataset)
    labels = output.label_ids
    y_prob = model.predict({'full_text_input': X_test_full, 'utterance_input': X_test_utterances})

    # Save scores and labels
    y_probs.append(y_prob)
    y_trues.append(labels)

Fitting model using fold 0 as out of fold data.


Some weights of the model checkpoint at kiddothe2b/longformer-mini-1024 were not used when initializing LongformerForSequenceClassification: ['lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'longformer.embeddings.position_ids', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at kiddothe2b/longformer-mini-1024 and are newly

Map:   0%|          | 0/55 [00:00<?, ? examples/s]

Map:   0%|          | 0/17 [00:00<?, ? examples/s]

{'loss': 0.6833, 'learning_rate': 1e-05, 'epoch': 0.5}
{'loss': 0.6954, 'learning_rate': 2e-05, 'epoch': 1.0}
{'loss': 0.6782, 'learning_rate': 1.7500000000000002e-05, 'epoch': 1.5}
{'loss': 0.702, 'learning_rate': 1.5000000000000002e-05, 'epoch': 2.0}
{'loss': 0.681, 'learning_rate': 1.25e-05, 'epoch': 2.5}
{'loss': 0.6998, 'learning_rate': 1e-05, 'epoch': 3.0}
{'loss': 0.6804, 'learning_rate': 7.500000000000001e-06, 'epoch': 3.5}
{'loss': 0.6678, 'learning_rate': 5e-06, 'epoch': 4.0}
{'loss': 0.6787, 'learning_rate': 2.5e-06, 'epoch': 4.5}
{'loss': 0.6697, 'learning_rate': 0.0, 'epoch': 5.0}
{'train_runtime': 45.8534, 'train_samples_per_second': 5.997, 'train_steps_per_second': 0.218, 'train_loss': 0.6836280763149262, 'epoch': 5.0}
Fitting model using fold 1 as out of fold data.


Some weights of the model checkpoint at kiddothe2b/longformer-mini-1024 were not used when initializing LongformerForSequenceClassification: ['lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'longformer.embeddings.position_ids', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at kiddothe2b/longformer-mini-1024 and are newly

Map:   0%|          | 0/57 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

{'loss': 0.688, 'learning_rate': 1e-05, 'epoch': 0.5}
{'loss': 0.697, 'learning_rate': 2e-05, 'epoch': 1.0}
{'loss': 0.6941, 'learning_rate': 1.7500000000000002e-05, 'epoch': 1.5}
{'loss': 0.6761, 'learning_rate': 1.5000000000000002e-05, 'epoch': 2.0}
{'loss': 0.6871, 'learning_rate': 1.25e-05, 'epoch': 2.5}
{'loss': 0.6819, 'learning_rate': 1e-05, 'epoch': 3.0}
{'loss': 0.6816, 'learning_rate': 7.500000000000001e-06, 'epoch': 3.5}
{'loss': 0.6787, 'learning_rate': 5e-06, 'epoch': 4.0}
{'loss': 0.6838, 'learning_rate': 2.5e-06, 'epoch': 4.5}
{'loss': 0.6652, 'learning_rate': 0.0, 'epoch': 5.0}
{'train_runtime': 46.8505, 'train_samples_per_second': 6.083, 'train_steps_per_second': 0.213, 'train_loss': 0.6833590388298034, 'epoch': 5.0}
Fitting model using fold 2 as out of fold data.


Some weights of the model checkpoint at kiddothe2b/longformer-mini-1024 were not used when initializing LongformerForSequenceClassification: ['lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'longformer.embeddings.position_ids', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at kiddothe2b/longformer-mini-1024 and are newly

Map:   0%|          | 0/57 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

{'loss': 0.6886, 'learning_rate': 1e-05, 'epoch': 0.5}
{'loss': 0.6991, 'learning_rate': 2e-05, 'epoch': 1.0}
{'loss': 0.7014, 'learning_rate': 1.7500000000000002e-05, 'epoch': 1.5}
{'loss': 0.6754, 'learning_rate': 1.5000000000000002e-05, 'epoch': 2.0}
{'loss': 0.6921, 'learning_rate': 1.25e-05, 'epoch': 2.5}
{'loss': 0.6911, 'learning_rate': 1e-05, 'epoch': 3.0}
{'loss': 0.6815, 'learning_rate': 7.500000000000001e-06, 'epoch': 3.5}
{'loss': 0.6825, 'learning_rate': 5e-06, 'epoch': 4.0}
{'loss': 0.6862, 'learning_rate': 2.5e-06, 'epoch': 4.5}
{'loss': 0.6774, 'learning_rate': 0.0, 'epoch': 5.0}
{'train_runtime': 46.9541, 'train_samples_per_second': 6.07, 'train_steps_per_second': 0.213, 'train_loss': 0.6875467717647552, 'epoch': 5.0}
Fitting model using fold 3 as out of fold data.


Some weights of the model checkpoint at kiddothe2b/longformer-mini-1024 were not used when initializing LongformerForSequenceClassification: ['lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'longformer.embeddings.position_ids', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at kiddothe2b/longformer-mini-1024 and are newly

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

{'loss': 0.698, 'learning_rate': 1e-05, 'epoch': 0.5}
{'loss': 0.6824, 'learning_rate': 2e-05, 'epoch': 1.0}
{'loss': 0.6982, 'learning_rate': 1.7500000000000002e-05, 'epoch': 1.5}
{'loss': 0.6812, 'learning_rate': 1.5000000000000002e-05, 'epoch': 2.0}
{'loss': 0.6981, 'learning_rate': 1.25e-05, 'epoch': 2.5}
{'loss': 0.6845, 'learning_rate': 1e-05, 'epoch': 3.0}
{'loss': 0.686, 'learning_rate': 7.500000000000001e-06, 'epoch': 3.5}
{'loss': 0.6888, 'learning_rate': 5e-06, 'epoch': 4.0}
{'loss': 0.6715, 'learning_rate': 2.5e-06, 'epoch': 4.5}
{'loss': 0.6844, 'learning_rate': 0.0, 'epoch': 5.0}
{'train_runtime': 49.5067, 'train_samples_per_second': 6.06, 'train_steps_per_second': 0.202, 'train_loss': 0.6873256683349609, 'epoch': 5.0}
Fitting model using fold 4 as out of fold data.


Some weights of the model checkpoint at kiddothe2b/longformer-mini-1024 were not used when initializing LongformerForSequenceClassification: ['lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'longformer.embeddings.position_ids', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at kiddothe2b/longformer-mini-1024 and are newly

Map:   0%|          | 0/59 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

{'loss': 0.6848, 'learning_rate': 1e-05, 'epoch': 0.5}
{'loss': 0.6852, 'learning_rate': 2e-05, 'epoch': 1.0}
{'loss': 0.6851, 'learning_rate': 1.7500000000000002e-05, 'epoch': 1.5}
{'loss': 0.6873, 'learning_rate': 1.5000000000000002e-05, 'epoch': 2.0}
{'loss': 0.6586, 'learning_rate': 1.25e-05, 'epoch': 2.5}
{'loss': 0.6888, 'learning_rate': 1e-05, 'epoch': 3.0}
{'loss': 0.6771, 'learning_rate': 7.500000000000001e-06, 'epoch': 3.5}
{'loss': 0.6671, 'learning_rate': 5e-06, 'epoch': 4.0}
{'loss': 0.6821, 'learning_rate': 2.5e-06, 'epoch': 4.5}
{'loss': 0.679, 'learning_rate': 0.0, 'epoch': 5.0}
{'train_runtime': 48.7569, 'train_samples_per_second': 6.05, 'train_steps_per_second': 0.205, 'train_loss': 0.6795224547386169, 'epoch': 5.0}


##### Save Model Probabilities on Test Folds and True Labels

In [9]:
with open('results/lfm_y_trues.pkl', 'wb') as f:
    pickle.dump(y_trues, f)

with open('results/lfm_y_probs.pkl', 'wb') as f:
    pickle.dump(y_probs, f)