# Ensemble

Feed the final hidden states from Longformer and the final hidden states from Specter into a fully-connected feedforward network for token-level classification

## Imports

In [1]:
import tensorflow as tf
from transformers import LongformerTokenizerFast, AutoTokenizer, TFLongformerModel, TFAutoModel

import numpy as np
import json

from sklearn import metrics
from sklearn.metrics import average_precision_score, precision_recall_curve
from sklearn.metrics import auc, plot_precision_recall_curve, roc_curve

In [2]:
# Pull in data

DEV_MASKS_FILE =    "../data/processed/jg_dev_masks.json"
TRAIN_MASKS_FILE =  "../data/processed/jg_train_masks.json"
TEST_MASKS_FILE =   "../data/processed/jg_test_masks.json"

with open("../data/raw/text-anonymization-benchmark/echr_dev.json") as file:
    dev_file = json.load(file)

with open(DEV_MASKS_FILE) as file:
    dev_masks = json.load(file)
    
with open("../data/raw/text-anonymization-benchmark/echr_train.json") as file:
    train_file = json.load(file)
    
with open(TRAIN_MASKS_FILE) as file:
    train_masks = json.load(file)
       
with open("../data/raw/text-anonymization-benchmark/echr_test.json") as file:
    test_file = json.load(file)

with open(TEST_MASKS_FILE) as file:
    test_masks = json.load(file)

## Helper Functions

In [2]:
# Function used to label data

def label_tokens(toks, offs, spans_to_mask):
    """Args: 
            toks - list of token id's
            offs - list of char offsets for each token
       Returns:
            label_list - 0 for non_mask, 1 for mask"""
    
    label_list = []
    mapping_list = []
    
    # Map token_ids back to string
    
    for token, pos in zip(toks, offs):
        mapping_list.append([token, pos[0], pos[1]])
    
    # Determine if each token should be masked
    spans_to_mask.sort(key=lambda tup: tup[0]) #order spans, ascending
    
    j=0
    
    for i in range(len(mapping_list)):
        
        temp_list = []
        stop=False
        
        while not stop and j < len(spans_to_mask):
            
            if (mapping_list[i][1] >= spans_to_mask[j][0]) and (mapping_list[i][2] <= spans_to_mask[j][1]):
                temp_list.append(1)
            else:
                temp_list.append(0)           

            # Since spans and mapping_list are ordered, break to allow it to catch up
            if(spans_to_mask[j][1] > mapping_list[i][2]):
                stop=True
            else:
                j = j+1
            
        if sum(temp_list) >= 1:
            label_list.append(1)
        else:
            label_list.append(0)
        
    
    return label_list  

## Longformer
### Create Labels and Tokenize Input

In [3]:
# Pull in tokenizer

tokenizer = LongformerTokenizerFast.from_pretrained("allenai/longformer-base-4096")

In [5]:
# Create labels

dev_text = []
dev_labels = []

for i in range(len(dev_file)):
    doc_id = dev_file[i]["doc_id"]
    spans_to_mask = dev_masks[doc_id]
    spans_to_mask = list({tuple(x) for x in spans_to_mask}) # Make spans unique
    doc_text = dev_file[i]["text"]
    tok_tensor = tokenizer(doc_text, return_tensors="tf", truncation=True, padding=True, return_offsets_mapping=True)
    tokens = tok_tensor["input_ids"].numpy()[0]
    offsets = tok_tensor["offset_mapping"].numpy()[0]
    dev_text.append(doc_text)
    dev_labels.append(label_tokens(tokens, offsets, spans_to_mask))
    
train_text = []
train_labels = []

for i in range(len(train_file)):
    doc_id = train_file[i]["doc_id"]
    spans_to_mask = train_masks[doc_id]
    spans_to_mask = list({tuple(x) for x in spans_to_mask}) # Make spans unique
    doc_text = train_file[i]["text"]
    tok_tensor = tokenizer(doc_text, return_tensors="tf", truncation=True, padding=True, return_offsets_mapping=True)
    tokens = tok_tensor["input_ids"].numpy()[0]
    offsets = tok_tensor["offset_mapping"].numpy()[0]
    
    train_text.append(doc_text)
    train_labels.append(label_tokens(tokens, offsets, spans_to_mask))

test_text = []
test_labels = []

for i in range(len(test_file)):
    doc_id = test_file[i]["doc_id"]
    spans_to_mask = test_masks[doc_id]
    spans_to_mask = list({tuple(x) for x in spans_to_mask}) # Make spans unique
    doc_text = test_file[i]["text"]
    tok_tensor = tokenizer(doc_text, return_tensors="tf", truncation=True, padding=True, return_offsets_mapping=True)
    tokens = tok_tensor["input_ids"].numpy()[0]
    offsets = tok_tensor["offset_mapping"].numpy()[0]
    
    test_text.append(doc_text)
    test_labels.append(label_tokens(tokens, offsets, spans_to_mask))

2022-11-25 21:23:33.391880: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
# Pad labels to max length

MAX_LEN = 4096

for i in range(len(dev_labels)):
    curr_len = len(dev_labels[i])
    
    if curr_len < MAX_LEN:
        to_add = [0] * (MAX_LEN - curr_len)
        dev_labels[i].extend(to_add)
        
for i in range(len(train_labels)):
    curr_len = len(train_labels[i])
    
    if curr_len < MAX_LEN:
        to_add = [0] * (MAX_LEN - curr_len)
        train_labels[i].extend(to_add)
        
for i in range(len(test_labels)):
    curr_len = len(test_labels[i])
    
    if curr_len < MAX_LEN:
        to_add = [0] * (MAX_LEN - curr_len)
        test_labels[i].extend(to_add)
        
dev_labels = np.asarray(dev_labels)
train_labels = np.asarray(train_labels)
test_labels = np.asarray(test_labels)

In [7]:
# Tokenize input

dev_text_tokenized = tokenizer(dev_text, truncation=True, padding="max_length", return_tensors="tf")
train_text_tokenized = tokenizer(train_text, truncation=True, padding="max_length", return_tensors="tf")
test_text_tokenized = tokenizer(test_text, truncation=True, padding="max_length", return_tensors="tf")

## Specter
### Tokenize Input

In [4]:
# Pull in tokenizer

specter_tokenizer = AutoTokenizer.from_pretrained('allenai/specter')

In [9]:
# Tokenize text

dev_text_tokenized_specter = specter_tokenizer(dev_text, truncation=True, max_length=512, padding="max_length", return_tensors="tf")
train_text_tokenized_specter = specter_tokenizer(train_text, truncation=True, max_length=512, padding="max_length", return_tensors="tf")
test_text_tokenized_specter = specter_tokenizer(test_text, truncation=True, max_length=512, padding="max_length", return_tensors="tf")

## Build Model

In [10]:
def create_ensemble_model(dropout=0.3,
                            learning_rate=0.00002):
    
    """Build FFNN on top of concatenated hidden states"""
    
    longformer_model = TFLongformerModel.from_pretrained("allenai/longformer-base-4096")
    specter_model = TFAutoModel.from_pretrained('allenai/specter')
    
    longformer_model.trainable = True
    specter_model.trainable = True
    
    input_ids_longformer = tf.keras.layers.Input(shape=(4096,), dtype=tf.int32, name="longformer_input") # max_len for Longformer
    input_ids_specter = tf.keras.layers.Input(shape=(512,), dtype=tf.int32, name="specter_input") # max_len for Specter
    
    longformer_out = longformer_model(input_ids_longformer).last_hidden_state
    specter_out = specter_model(input_ids_specter).last_hidden_state
    
    identity = tf.eye(4096, 512)
    specter_padded = tf.matmul(identity, specter_out) # Reshape to (None, 4096, 768) for concatenation
    concatenated_input = tf.keras.layers.Concatenate(axis=2, name="final_concat")([longformer_out, specter_padded])
    
    hidden_1 = tf.keras.layers.Dense(512, activation="relu", name="hidden_1")(concatenated_input)
    hidden_1 = tf.keras.layers.Dropout(dropout)(hidden_1)
    hidden_2 = tf.keras.layers.Dense(256, activation="relu", name="hidden_2")(hidden_1)
    hidden_2 = tf.keras.layers.Dropout(dropout)(hidden_2)
    
    classification = tf.keras.layers.Dense(2, activation="sigmoid", name="classification_layer")(hidden_2)
    
    classification_model = tf.keras.Model(inputs=[input_ids_longformer, input_ids_specter], outputs=[classification])
    
    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                                 loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False), 
                                 metrics="accuracy")
    
    return classification_model

In [11]:
model = create_ensemble_model()

Some layers from the model checkpoint at allenai/longformer-base-4096 were not used when initializing TFLongformerModel: ['lm_head']
- This IS expected if you are initializing TFLongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFLongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFLongformerModel were initialized from the model checkpoint at allenai/longformer-base-4096.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFLongformerModel for predictions without further training.
All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the 

In [12]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 specter_input (InputLayer)     [(None, 512)]        0           []                               
                                                                                                  
 longformer_input (InputLayer)  [(None, 4096)]       0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  109938432   ['specter_input[0][0]']          
                                thPoolingAndCrossAt                                               
                                tentions(last_hidde                                               
                                n_state=(None, 512,                                           

In [13]:
history = model.fit([train_text_tokenized["input_ids"], train_text_tokenized_specter["input_ids"]],
                    train_labels,
                    batch_size=1,
                    epochs=2)

Epoch 1/2


2022-11-25 21:25:55.888039: W tensorflow/core/grappler/optimizers/loop_optimizer.cc:907] Skipping loop optimization for Merge node with control input: model/tf_longformer_model/longformer/encoder/layer_._2/attention/self/cond_2/branch_executed/_277


Epoch 2/2


In [14]:
# Save model

model.save("../models/ensemble.h5", save_format="tf")

In [15]:
print(history.history)

{'loss': [0.050023335963487625, 0.03573835641145706], 'accuracy': [0.9810535907745361, 0.9860784411430359]}


# Assess Performance

In [16]:
preds = model([test_text_tokenized["input_ids"], test_text_tokenized_specter["input_ids"]])

In [17]:
predicted_token_class_ids = tf.math.argmax(preds, axis=-1)

In [18]:
predicted_token_class_ids

<tf.Tensor: shape=(127, 4096), dtype=int64, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])>

In [19]:
# Save predicted_token_class_ids to avoid running inference again

np.savetxt("../predictions/ensemble_tab_test_set.txt", predicted_token_class_ids)

## Wikipedia
### Prepare Data

In [5]:
# Load wiki data

with open("../data/raw/wiki-summaries/annotated_wikipedia.json") as file:
    wiki_file = json.load(file)

with open("../data/processed/wiki_masks.json") as file:
    wiki_masks = json.load(file)

In [6]:
# Create labels

wiki_text = []
wiki_labels = []

for i in range(len(wiki_file)):
    doc_id = wiki_file[i]["doc_id"]
    spans_to_mask = wiki_masks[doc_id]
    spans_to_mask = list({tuple(x) for x in spans_to_mask}) # Make spans unique
    doc_text = wiki_file[i]["text"]
    tok_tensor = tokenizer(doc_text, return_tensors="tf", truncation=True, padding=True, return_offsets_mapping=True)
    tokens = tok_tensor["input_ids"].numpy()[0]
    offsets = tok_tensor["offset_mapping"].numpy()[0]
    wiki_text.append(doc_text)
    wiki_labels.append(label_tokens(tokens, offsets, spans_to_mask))

2022-11-26 17:02:54.240638: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
# Pad labels to max length

MAX_LEN = 4096

for i in range(len(wiki_labels)):
    curr_len = len(wiki_labels[i])
    
    if curr_len < MAX_LEN:
        to_add = [0] * (MAX_LEN - curr_len)
        wiki_labels[i].extend(to_add)
        
wiki_labels = np.asarray(wiki_labels)

In [8]:
# Tokenize input

wiki_text_tokenized = tokenizer(wiki_text, truncation=True, padding="max_length", return_tensors="tf")

In [9]:
# Tokenize input for Specter

wiki_text_tokenized_specter = specter_tokenizer(wiki_text, truncation=True, max_length=512, padding="max_length", return_tensors="tf")

### Generate Predictions

In [10]:
# Load these because they're custom objects

longformer_model = TFLongformerModel.from_pretrained("allenai/longformer-base-4096")
specter_model = TFAutoModel.from_pretrained('allenai/specter')

Some layers from the model checkpoint at allenai/longformer-base-4096 were not used when initializing TFLongformerModel: ['lm_head']
- This IS expected if you are initializing TFLongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFLongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFLongformerModel were initialized from the model checkpoint at allenai/longformer-base-4096.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFLongformerModel for predictions without further training.
All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the 

In [11]:
model = tf.keras.models.load_model("../models/ensemble.h5", custom_objects={"TFBertModel": specter_model, "TFLongformerModel": longformer_model})



In [12]:
preds = model([wiki_text_tokenized["input_ids"], wiki_text_tokenized_specter["input_ids"]])

2022-11-26 17:22:44.052083: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 104375255040 exceeds 10% of free system memory.
2022-11-26 17:23:42.512935: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 104579112960 exceeds 10% of free system memory.
2022-11-26 17:23:59.657352: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 104579112960 exceeds 10% of free system memory.
2022-11-26 17:25:41.711498: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 83717652480 exceeds 10% of free system memory.
2022-11-26 17:25:47.004886: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 83608928256 exceeds 10% of free system memory.


In [13]:
predicted_token_class_ids = tf.math.argmax(preds, axis=-1)

In [14]:
predicted_token_class_ids

<tf.Tensor: shape=(553, 4096), dtype=int64, numpy=
array([[0, 1, 1, ..., 0, 0, 0],
       [0, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])>

In [15]:
# Save predicted_token_class_ids to avoid running inference again

np.savetxt("../predictions/ensemble_wiki_preds.txt", predicted_token_class_ids)

# Calculate Precision, Recall, and AUC

In [16]:
def calc_precision(pred_list, label_list):
    """Calculates precision of batch of predictions"""
    
    tp = 0
    fp = 0
    
    for i in range(len(pred_list)):
        for j in range(len(pred_list[i])):
        
            if pred_list[i][j] == 1:
                if label_list[i][j] == 1:
                    tp += 1
                else:
                    fp += 1
            else:
                continue
                
    return tp / (tp + fp)

In [17]:
def calc_recall(pred_list, label_list):
    """Calculates recall of batch of predictions"""
    
    tp = 0 
    fn = 0
    
    for i in range(len(pred_list)):
        for j in range(len(pred_list[i])):
            
            if pred_list[i][j] == 1:
                if label_list[i][j] == 1:
                    tp += 1
                else:
                    tp += 0
                
            else:
                if label_list[i][j] == 1:
                    fn += 1
                else:
                    fn += 0
    
    return tp / (tp + fn)

## Test Set - Metrics

In [22]:
precision = calc_precision(predicted_token_class_ids, test_labels)
print (f' Token level precision: {precision}')

 Token level precision: 0.955564228367529


In [23]:
recall = calc_recall(predicted_token_class_ids, test_labels)
print (f' Token level recall: {recall}')

 Token level recall: 0.7242953133584077


In [24]:
# Multilabel indicators are not supported in sklearn for AUC
# Loop through preds and take avg of AUC

auc = []

for i in range(len(predicted_token_class_ids)):
    
    fpr, tpr, thresholds = metrics.roc_curve(test_labels[i], predicted_token_class_ids[i], pos_label=1)
    auc.append(metrics.auc(fpr, tpr))

auc = sum(auc)/len(auc)
print (f' Average AUC: {auc}')

 Average AUC: 0.8771953319725475


## Wikipedia - Metrics

In [18]:
precision = calc_precision(predicted_token_class_ids, wiki_labels)
print (f' Token level precision: {precision}')

 Token level precision: 0.8947179464165508


In [19]:
recall = calc_recall(predicted_token_class_ids, wiki_labels)
print (f' Token level recall: {recall}')

 Token level recall: 0.8001364530240398


In [20]:
# Multilabel indicators are not supported in sklearn for AUC
# Loop through preds and take avg of AUC

auc = []

for i in range(len(predicted_token_class_ids)):
    
    fpr, tpr, thresholds = metrics.roc_curve(wiki_labels[i], predicted_token_class_ids[i], pos_label=1)
    auc.append(metrics.auc(fpr, tpr))

auc = sum(auc)/len(auc)
print (f' Average AUC: {auc}')

 Average AUC: 0.9140386819371921
