### Train Hierarchical Model

- Joel Stremmel
- 04-19-23

##### About

Train a Hierarchical Model on the formatted data using K-Fold Cross-Validation and save the scores.

##### Imports

In [1]:
import os
import re
import glob
import pickle
import torch
import numpy as np
import pandas as pd
import tensorflow as tf
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from transformers import RobertaTokenizer, TFRobertaModel

2023-04-19 14:38:55.764242: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-04-19 14:38:55.829210: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-04-19 14:38:55.829874: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


##### Set Parameters

In [2]:
max_utterance_length = 64
max_num_utterances = 32
batch_size = 32
lr = 0.00002
epochs = 5
output_dir = "lf_output"
lm_path = "roberta-base"
model_key = "lstmh"
input_dir = "./data"
results_dir = "./results"

##### Disable Tokenizer Parallelism
This is mostly to avoid warnings.

In [3]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

##### Load Formatted Data

In [4]:
with open(os.path.join(input_dir, "X_folds.pkl"), "rb") as f:
    X_folds = pickle.load(f)

with open(os.path.join(input_dir, "y_folds.pkl"), "rb") as f:
    y_folds = pickle.load(f)

##### Check Data Shape

In [5]:
assert len(X_folds) == len(y_folds), "Expected the same number of folds in X and y."
X = list(X_folds.values())
y = list(y_folds.values())

##### Check Target Prevalence

In [6]:
print(f"Target prevalance: {np.mean(np.concatenate(y))}.")

Target prevalance: 0.5277777777777778.


##### Check that GPU is Available

In [7]:
assert torch.cuda.is_available(), "Run this script on a GPU."
print(torch.__version__)

1.8.1+cu101


##### Load Pretrained Encoder and Tokenizer

In [8]:
tokenizer = RobertaTokenizer.from_pretrained(lm_path)
encoder = TFRobertaModel.from_pretrained(lm_path)

2023-04-19 14:38:59.056775: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-04-19 14:38:59.059463: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. init

##### Tokenize Text and Fit Model to Each Fold

In [9]:
y_probs, y_trues = [], []
for i in range(len(X)):
    
    print(f"Fitting model using fold {i} as out of fold data.")
    
    # Identify train folds and shuffle samples
    X_train, y_train = np.concatenate(X[0:i] + X[i+1:], axis=0), np.concatenate(y[0:i] + y[i+1:], axis=0)
    indices = np.arange(len(y_train))
    np.random.shuffle(indices)
    X_train, y_train = X_train[indices], y_train[indices]
    
    # Identify test folds
    X_test, y_test = X[i], y[i]

    
    
    
    
    
    
    
    
    
    
    
    
    import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import pytorch_lightning as pl
from transformers import AutoTokenizer, AutoModel

class DocumentDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        document_segments, label = self.data[idx]
        input_ids = []
        attention_mask = []
        for segment in document_segments:
            encoding = self.tokenizer.encode_plus(segment, add_special_tokens=True, max_length=self.max_length, padding='max_length', truncation=True)
            input_ids.append(encoding['input_ids'])
            attention_mask.append(encoding['attention_mask'])
        input_ids = torch.tensor(input_ids)
        attention_mask = torch.tensor(attention_mask)
        label = torch.tensor(label)
        return input_ids, attention_mask, label

class HierarchicalLSTM(pl.LightningModule):
    def __init__(self, encoder, hidden_size, num_layers, dropout):
        super().__init__()
        self.encoder = encoder
        self.lstm = nn.LSTM(input_size=self.encoder.config.hidden_size, hidden_size=hidden_size, num_layers=num_layers, dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size, 1)
        self.loss_fn = nn.BCEWithLogitsLoss()
        
    def forward(self, input_ids, attention_mask):
        # Get encoder embeddings for each document segment
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = outputs.last_hidden_state
        
        # Pass embeddings through LSTM
        _, (hidden, _) = self.lstm(embeddings)
        hidden = self.dropout(hidden[-1]) # take the last layer's hidden state
        out = self.fc(hidden)
        return out.squeeze(-1)
    
    def training_step(self, batch, batch_idx):
        input_ids, attention_mask, label = batch
        logits = self(input_ids, attention_mask)
        loss = self.loss_fn(logits, label.float())
        self.log('train_loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        input_ids, attention_mask, label = batch
        logits = self(input_ids, attention_mask)
        loss = self.loss_fn(logits, label.float())
        self.log('val_loss', loss, prog_bar=True)
        return loss
    
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=2e-5)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)
        return {'optimizer': optimizer, 'lr_scheduler': scheduler, 'monitor': 'val_loss'}

class HierarchicalLSTMTrainer(pl.LightningModule):
    def __init__(self, encoder_name, hidden_size, num_layers, dropout, max_length, train_data, val_data, batch_size=16, num_workers=4):
        super().__init__()
        self.encoder_name = encoder_name
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout
        self.max_length = max_length
        self.train_data = train_data
        self.val_data = val_data
        self.batch_size = batch_size
        self.num_workers = num_workers
        
    def setup(self, stage=None):
        self.tokenizer = AutoTokenizer.from_pretrained(self.encoder_name)
        self.encoder = AutoModel.from_pretrained(self.encoder_name)
        self.train_dataset = DocumentDataset(self.train_data, self.tokenizer, self.max_length)
        self.val_dataset = DocumentDataset(self.val_data, self.tokenizer, self.max_length)
    
    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=True)
    
    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=self.num_workers)
    
    def configure_optimizers(self):
        return HierarchicalLSTM(self.encoder, self.hidden_size, self.num_layers, self.dropout)
    
    def validation_epoch_end(self, outputs):
        avg_val_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        self.log('avg_val_loss', avg_val_loss, prog_bar=True)
        return {'avg_val_loss': avg_val_loss}
    
    
    data = [
    (["This is the first segment.", "This is the second segment."], 1),
    (["This is the third segment.", "This is the fourth segment."], 0),
    (["This is the fifth segment.", "This is the sixth segment."], 1)
]

train_data = DocumentDataset(data)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
    
    early_stop_callback = EarlyStopping(
        monitor='avg_val_loss',
        min_delta=0.00,
        patience=3,
        verbose=True,
        mode='min'
    )

    trainer = pl.Trainer(
        gpus=1,
        callbacks=[early_stop_callback],
        max_epochs=100,
    )

    model = HierarchicalLSTMTrainer(
        encoder_name='roberta-large',
        hidden_size=256,
        num_layers=2,
        dropout=0.1,
        max_length=512,
        train_data=train_data,
        val_data=val_data,
        batch_size=16,
        num_workers=4
    )

    trainer.fit(model)
    
    # Predict on test dataset
    y_prob = model.predict(X_test)

    # Save scores and labels
    y_probs.append(y_prob)
    y_trues.append(y_test)

Fitting model using fold 0 as out of fold data.


2023-04-19 14:39:01.454116: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-04-19 14:39:01.455577: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-04-19 14:39:01.456916: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

InvalidArgumentError: OpKernel 'ConcatV2' has constraint on attr 'T' not in NodeDef '[N=0, Tidx=DT_INT32]', KernelDef: 'op: "ConcatV2" device_type: "CPU" constraint { name: "T" allowed_values { list { type: DT_QINT32 } } } host_memory_arg: "axis"' [Op:ConcatV2] name: concat

##### Save Model Probabilities on Test Folds and True Labels

In [None]:
with open(os.path.join(results_dir, f"{model_key}_y_trues.pkl"), "wb") as f:
    pickle.dump(y_trues, f)

with open(os.path.join(results_dir, f"{model_key}_y_probs.pkl"), "wb") as f:
    pickle.dump(y_probs, f)