### Train Hierarchical Model

- Joel Stremmel
- 04-24-23

##### About

Train a Hierarchical Model on the formatted data using K-Fold Cross-Validation and save the scores.

##### Imports

In [1]:
import os
import re
import glob
import pickle
import torch
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModel,
    Trainer,
    TrainingArguments,
)

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import pytorch_lightning as pl

ModuleNotFoundError: No module named 'pytorch_lightning'

##### Set Parameters

In [None]:
# Sequence-size limits (presumably tokens per utterance and utterances per document; confirm against usage)
max_utterance_length = 64
max_num_utterances = 32

# Optimization settings
batch_size = 32
lr = 2e-5
epochs = 5
early_stopping_patience = 5

# Pretrained language model identifier and the key used to name the saved result files
lm_path = "roberta-base"
model_key = "lstmh"

# Directories the fold pickles are read from and the scores are written to
input_dir = "./data"
results_dir = "./results"

##### Disable Tokenizer Parallelism
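Disabling tokenizer parallelism is mostly done to avoid the warnings the tokenizers library emits when its parallelism is combined with forked worker processes.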

In [None]:
# Set the flag through the process environment so it is in place before any tokenizer is used
os.environ.update(TOKENIZERS_PARALLELISM="false")

##### Load Formatted Data

In [None]:
# Paths of the pickled fold dictionaries inside the input directory
features_path = os.path.join(input_dir, "X_folds.pkl")
labels_path = os.path.join(input_dir, "y_folds.pkl")

# Read the features first, then the labels
with open(features_path, "rb") as features_file:
    X_folds = pickle.load(features_file)

with open(labels_path, "rb") as labels_file:
    y_folds = pickle.load(labels_file)

##### Check Data Shape

In [None]:
# The feature and label dictionaries must describe the same number of folds
num_feature_folds, num_label_folds = len(X_folds), len(y_folds)
assert num_feature_folds == num_label_folds, "Expected the same number of folds in X and y."

# Keep only the per-fold values, in each dictionary's iteration order
X = [*X_folds.values()]
y = [*y_folds.values()]

##### Check Target Prevalence

In [None]:
# Mean of the labels pooled across every fold
pooled_labels = np.concatenate(y)
target_prevalence = np.mean(pooled_labels)
print(f"Target prevalance: {target_prevalence}.")

##### Check that GPU is Available

In [None]:
# Stop early when no CUDA device is visible, then report the installed PyTorch version
gpu_is_available = torch.cuda.is_available()
assert gpu_is_available, "Run this script on a GPU."
print(torch.__version__)

##### Load Pretrained Encoder and Tokenizer

In [None]:
# Load the tokenizer and then the encoder from the same pretrained checkpoint
tokenizer, encoder = (
    AutoTokenizer.from_pretrained(lm_path),
    AutoModel.from_pretrained(lm_path),
)

##### Tokenize Text and Fit Model to Each Fold

In [None]:
class DocumentDataset(Dataset):
    """Map-style dataset that tokenizes every segment of a document on access.

    Args:
        data: Indexable collection of ``(document_segments, label)`` pairs, where
            ``document_segments`` is an iterable of text segments.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that returns a
            mapping with ``input_ids`` and ``attention_mask`` for one segment.
        max_length: Number of tokens each segment is padded or truncated to.
        max_segments: Optional fixed number of segments per document. When given,
            longer documents are truncated and shorter ones are padded with
            fully-masked segments, so every item has the same shape and can be
            stacked by the default DataLoader collate. When ``None`` (default),
            each item keeps its own number of segments.

    Raises:
        ValueError: If ``max_segments`` is given but smaller than 1.
    """

    def __init__(self, data, tokenizer, max_length, max_segments=None):
        if max_segments is not None and max_segments < 1:
            raise ValueError("max_segments must be at least 1 when provided.")
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.max_segments = max_segments

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` tensors for document ``idx``.

        ``input_ids`` and ``attention_mask`` have shape ``[num_segments, max_length]``.
        """
        document_segments, label = self.data[idx]
        segments = list(document_segments)
        if self.max_segments is not None:
            segments = segments[: self.max_segments]

        input_ids = []
        attention_mask = []
        for segment in segments:
            # Calling the tokenizer directly is the recommended replacement for encode_plus
            encoding = self.tokenizer(
                segment,
                add_special_tokens=True,
                max_length=self.max_length,
                padding="max_length",
                truncation=True,
            )
            input_ids.append(encoding["input_ids"])
            attention_mask.append(encoding["attention_mask"])

        if self.max_segments is not None:
            # Fill up to the fixed segment count with segments whose attention mask is all zeros
            pad_id = getattr(self.tokenizer, "pad_token_id", None)
            pad_id = 0 if pad_id is None else pad_id
            for _ in range(self.max_segments - len(segments)):
                input_ids.append([pad_id] * self.max_length)
                attention_mask.append([0] * self.max_length)

        if input_ids:
            input_ids = torch.tensor(input_ids, dtype=torch.long)
            attention_mask = torch.tensor(attention_mask, dtype=torch.long)
        else:
            # An empty document still yields integer tensors of rank 2
            input_ids = torch.zeros((0, self.max_length), dtype=torch.long)
            attention_mask = torch.zeros((0, self.max_length), dtype=torch.long)

        label = torch.tensor(label)
        return input_ids, attention_mask, label

In [None]:
class HierarchicalLSTM(pl.LightningModule):
    """Binary classifier that runs an LSTM over transformer encoder outputs.

    Args:
        encoder: Pretrained transformer whose ``config.hidden_size`` sets the LSTM
            input size and whose output exposes ``last_hidden_state``.
        hidden_size: Hidden size of the LSTM.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability used between LSTM layers and before the
            final linear layer.
        lr: Learning rate for the Adam optimizer.
    """

    def __init__(self, encoder, hidden_size, num_layers, dropout, lr=2e-5):
        super().__init__()
        self.encoder = encoder
        self.lstm = nn.LSTM(
            input_size=self.encoder.config.hidden_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
            batch_first=True,
        )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size, 1)
        self.loss_fn = nn.BCEWithLogitsLoss()
        self.lr = lr

    def forward(self, input_ids, attention_mask):
        """Return one logit per example.

        Two input layouts are accepted:

        * ``[batch, segments, tokens]``: every segment is encoded separately, the
          first-token embedding represents the segment, and the LSTM runs over
          the sequence of segments. Segments whose attention mask is all zeros
          are treated as padding and are assumed to come after the real segments.
        * ``[batch, tokens]``: the LSTM runs over the encoder's token states.
        """
        if input_ids.dim() == 3:
            batch_size, num_segments, seq_len = input_ids.shape

            # The encoder only accepts 2-D input, so fold the segment axis into the batch axis
            outputs = self.encoder(
                input_ids=input_ids.reshape(batch_size * num_segments, seq_len),
                attention_mask=attention_mask.reshape(batch_size * num_segments, seq_len),
            )

            # Represent each segment by its first-token embedding and restore the segment axis
            segment_embeddings = outputs.last_hidden_state[:, 0, :].reshape(
                batch_size, num_segments, -1
            )

            # Count the segments with at least one attended token; packing needs lengths >= 1
            segment_lengths = (attention_mask.sum(dim=-1) > 0).sum(dim=-1).clamp(min=1)

            # Packing makes the final hidden state come from the last real segment of each document
            packed = nn.utils.rnn.pack_padded_sequence(
                segment_embeddings,
                segment_lengths.cpu(),
                batch_first=True,
                enforce_sorted=False,
            )
            _, (hidden, _) = self.lstm(packed)
        else:
            # Get encoder embeddings for each token and pass them through the LSTM
            outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
            _, (hidden, _) = self.lstm(outputs.last_hidden_state)

        hidden = self.dropout(hidden[-1])  # take the last layer's hidden state
        out = self.fc(hidden)
        return out.squeeze(-1)

    def training_step(self, batch, batch_idx):
        """Compute and log the binary cross-entropy loss for one training batch."""
        input_ids, attention_mask, label = batch
        logits = self(input_ids, attention_mask)
        loss = self.loss_fn(logits, label.float())
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the binary cross-entropy loss for one validation batch."""
        input_ids, attention_mask, label = batch
        logits = self(input_ids, attention_mask)
        loss = self.loss_fn(logits, label.float())
        self.log('val_loss', loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Return Adam plus a plateau scheduler that halves the rate when ``val_loss`` stalls."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        # The `verbose` flag is deprecated in recent PyTorch releases, so it is not passed
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='min', factor=0.5, patience=2
        )
        return {'optimizer': optimizer, 'lr_scheduler': scheduler, 'monitor': 'val_loss'}

In [None]:
class HierarchicalLSTMTrainer(pl.LightningModule):
    """Lightning module that builds the data pipeline and trains a HierarchicalLSTM.

    The tokenizer, the datasets, and the wrapped ``HierarchicalLSTM`` (stored as
    ``self.model``, with the pretrained encoder at ``self.model.encoder``) are
    created in ``setup``.

    Args:
        encoder_name: Pretrained checkpoint name or path for the tokenizer and encoder.
        hidden_size: Hidden size of the LSTM.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability passed to the HierarchicalLSTM.
        max_length: Number of tokens each segment is padded or truncated to.
        train_data: Training examples in the format DocumentDataset expects.
        val_data: Validation examples in the format DocumentDataset expects.
        batch_size: Number of documents per batch.
        num_workers: Number of DataLoader worker processes.
    """

    def __init__(self, encoder_name, hidden_size, num_layers, dropout, max_length, train_data, val_data, batch_size=16, num_workers=4):
        super().__init__()
        self.encoder_name = encoder_name
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout
        self.max_length = max_length
        self.train_data = train_data
        self.val_data = val_data
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.model = None  # built in setup()

    def setup(self, stage=None):
        """Load the tokenizer and encoder once, then build the datasets."""
        if self.model is None:
            # setup() can run more than once; do not reload or reset the weights
            self.tokenizer = AutoTokenizer.from_pretrained(self.encoder_name)
            encoder = AutoModel.from_pretrained(self.encoder_name)
            self.model = HierarchicalLSTM(encoder, self.hidden_size, self.num_layers, self.dropout)
        self.train_dataset = DocumentDataset(self.train_data, self.tokenizer, self.max_length)
        self.val_dataset = DocumentDataset(self.val_data, self.tokenizer, self.max_length)

    @staticmethod
    def _collate_documents(batch, pad_token_id=0, max_length=1):
        """Stack documents with different segment counts by padding with fully-masked segments."""
        num_segments = max(1, max(ids.shape[0] for ids, _, _ in batch))
        shape = (len(batch), num_segments, max_length)
        input_ids = torch.full(shape, pad_token_id, dtype=torch.long)
        attention_mask = torch.zeros(shape, dtype=torch.long)
        labels = []
        for row, (ids, mask, label) in enumerate(batch):
            count = ids.shape[0]
            if count:
                input_ids[row, :count] = ids
                attention_mask[row, :count] = mask
            labels.append(label)
        return input_ids, attention_mask, torch.stack(labels)

    def _make_loader(self, dataset, shuffle):
        """Build a DataLoader that pads ragged documents in each batch."""
        import functools

        pad_token_id = self.tokenizer.pad_token_id
        # A partial over a static method stays picklable for worker processes
        collate_fn = functools.partial(
            HierarchicalLSTMTrainer._collate_documents,
            pad_token_id=0 if pad_token_id is None else pad_token_id,
            max_length=self.max_length,
        )
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            shuffle=shuffle,
            collate_fn=collate_fn,
        )

    def train_dataloader(self):
        return self._make_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        return self._make_loader(self.val_dataset, shuffle=False)

    def forward(self, input_ids, attention_mask):
        """Return the wrapped model's logits for ``[batch, segments, tokens]`` inputs."""
        if self.model is None:
            raise RuntimeError("Call setup() (or fit the module) before running forward.")
        return self.model(input_ids, attention_mask)

    def _compute_loss(self, batch):
        """Run the wrapped model on one batch and return its loss."""
        input_ids, attention_mask, label = batch
        logits = self(input_ids, attention_mask)
        return self.model.loss_fn(logits, label.float())

    def training_step(self, batch, batch_idx):
        loss = self._compute_loss(batch)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        loss = self._compute_loss(batch)
        # 'val_loss' is the metric the learning-rate scheduler monitors
        self.log('val_loss', loss, prog_bar=True)
        # Epoch-level mean under the name previously produced by validation_epoch_end,
        # a hook that Lightning 2.x no longer supports
        self.log('avg_val_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Reuse the optimizer and scheduler configuration of the wrapped model."""
        return self.model.configure_optimizers()

In [None]:
y_probs, y_trues = [], []

# Every run needs one test fold, one validation fold, and at least one training fold
num_folds = len(X)
if num_folds < 3:
    raise ValueError("Expected at least 3 folds to form train, validation, and test splits.")

for i in range(num_folds):

    print(f"Fitting model using fold {i} as out of fold data.")

    # Hold out fold i as the test fold
    X_test, y_test = X[i], y[i]

    # Select a validation fold at random from the remaining folds
    remaining_folds = [k for k in range(num_folds) if k != i]
    val_fold = int(np.random.choice(remaining_folds))
    X_val, y_val = X[val_fold], y[val_fold]

    # Concatenate all examples in the other folds to form the full training set
    # (the training DataLoader is expected to shuffle, so no manual shuffle is done here)
    train_folds = [k for k in remaining_folds if k != val_fold]
    X_train = [doc for k in train_folds for doc in X[k]]
    y_train = [label for k in train_folds for label in y[k]]

    # Print data shapes
    print(f"Train data sizes: {len(X_train), len(y_train)}.")
    print(f"Val data sizes: {len(X_val), len(y_val)}.")
    print(f"Test data sizes: {len(X_test), len(y_test)}.")

    # Pair each document with its label and cap the number of utterances per document
    # NOTE(review): assumes each document is a non-empty sequence of utterance strings - confirm
    train_data = [(list(doc)[:max_num_utterances], label) for doc, label in zip(X_train, y_train)]
    val_data = [(list(doc)[:max_num_utterances], label) for doc, label in zip(X_val, y_val)]
    test_data = [(list(doc)[:max_num_utterances], label) for doc, label in zip(X_test, y_test)]

    # Score one test document at a time, in order: documents may differ in their number of
    # utterances, and the scores must stay aligned with y_test
    test_dataset = DocumentDataset(test_data, tokenizer, max_utterance_length)
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False)

    early_stop_callback = pl.callbacks.EarlyStopping(
        monitor='avg_val_loss',
        min_delta=0.00,
        patience=early_stopping_patience,
        verbose=True,
        mode='min'
    )

    # accelerator/devices replace the `gpus` argument that newer Lightning releases removed
    trainer = pl.Trainer(
        accelerator="gpu",
        devices=1,
        callbacks=[early_stop_callback],
        max_epochs=epochs,
    )

    model = HierarchicalLSTMTrainer(
        encoder_name=lm_path,
        hidden_size=256,
        num_layers=2,
        dropout=0.1,
        max_length=max_utterance_length,
        train_data=train_data,
        val_data=val_data,
        batch_size=batch_size,
        num_workers=4
    )

    trainer.fit(model)

    # Predict on test dataset; assumes the module's forward returns one logit per document
    device = torch.device("cuda")
    model.to(device)
    model.eval()
    fold_probs = []
    with torch.no_grad():
        for input_ids, attention_mask, _ in test_dataloader:
            logits = model(input_ids.to(device), attention_mask.to(device))
            fold_probs.append(torch.sigmoid(logits).reshape(-1).cpu())
    y_prob = torch.cat(fold_probs).numpy()

    # Save scores and labels
    y_probs.append(y_prob)
    y_trues.append(y_test)

    # Release this fold's model before the next fold allocates a new one
    del model, trainer
    torch.cuda.empty_cache()

##### Save Model Probabilities on Test Folds and True Labels

In [None]:
# Create the output directory if needed so the results are not lost to a missing folder
os.makedirs(results_dir, exist_ok=True)

# True labels of every test fold
with open(os.path.join(results_dir, f"{model_key}_y_trues.pkl"), "wb") as f:
    pickle.dump(y_trues, f)

# Predicted probabilities of every test fold
with open(os.path.join(results_dir, f"{model_key}_y_probs.pkl"), "wb") as f:
    pickle.dump(y_probs, f)