# Longformer with FFNN

Instead of feeding hidden states from Longformer to a linear inference layer, feed them to a multilayer feedforward neural network

## Imports

In [136]:
import tensorflow as tf
from transformers import LongformerTokenizerFast, TFLongformerModel

import numpy as np
import json

from sklearn import metrics
from sklearn.metrics import average_precision_score, precision_recall_curve
from sklearn.metrics import auc, plot_precision_recall_curve, roc_curve

In [51]:
# Pull in tokenizer and model

tokenizer = LongformerTokenizerFast.from_pretrained("allenai/longformer-base-4096")
model = TFLongformerModel.from_pretrained("allenai/longformer-base-4096")

Some layers from the model checkpoint at allenai/longformer-base-4096 were not used when initializing TFLongformerModel: ['lm_head']
- This IS expected if you are initializing TFLongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFLongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFLongformerModel were initialized from the model checkpoint at allenai/longformer-base-4096.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFLongformerModel for predictions without further training.


In [85]:
# Pull in data

DEV_MASKS_FILE =    "../data/processed/jg_dev_masks.json"
TRAIN_MASKS_FILE =  "../data/processed/jg_train_masks.json"
TEST_MASKS_FILE =   "../data/processed/jg_test_masks.json"

with open("../data/raw/text-anonymization-benchmark/echr_dev.json") as file:
    dev_file = json.load(file)

with open(DEV_MASKS_FILE) as file:
    dev_masks = json.load(file)
    
with open("../data/raw/text-anonymization-benchmark/echr_train.json") as file:
    train_file = json.load(file)
    
with open(TRAIN_MASKS_FILE) as file:
    train_masks = json.load(file)
       
with open("../data/raw/text-anonymization-benchmark/echr_test.json") as file:
    test_file = json.load(file)

with open(TEST_MASKS_FILE) as file:
    test_masks = json.load(file)

## Helper Functions

In [86]:
# Function used to label data

def label_tokens(toks, offs, spans_to_mask):
    """Args: 
            toks - list of token id's
            offs - list of char offsets for each token
       Returns:
            label_list - 0 for non_mask, 1 for mask"""
    
    label_list = []
    mapping_list = []
    
    # Map token_ids back to string
    
    for token, pos in zip(toks, offs):
        mapping_list.append([token, pos[0], pos[1]])
    
    # Determine if each token should be masked
    spans_to_mask.sort(key=lambda tup: tup[0]) #order spans, ascending
    
    j=0
    
    for i in range(len(mapping_list)):
        
        temp_list = []
        stop=False
        
        while not stop and j < len(spans_to_mask):
            
            if (mapping_list[i][1] >= spans_to_mask[j][0]) and (mapping_list[i][2] <= spans_to_mask[j][1]):
                temp_list.append(1)
            else:
                temp_list.append(0)           

            # Since spans and mapping_list are ordered, break to allow it to catch up
            if(spans_to_mask[j][1] > mapping_list[i][2]):
                stop=True
            else:
                j = j+1
            
        if sum(temp_list) >= 1:
            label_list.append(1)
        else:
            label_list.append(0)
        
    
    return label_list  

## Create Labels and Tokenize Input

In [87]:
# Create labels

dev_text = []
dev_labels = []

for i in range(len(dev_file)):
    doc_id = dev_file[i]["doc_id"]
    spans_to_mask = dev_masks[doc_id]
    spans_to_mask = list({tuple(x) for x in spans_to_mask}) # Make spans unique
    doc_text = dev_file[i]["text"]
    tok_tensor = tokenizer(doc_text, return_tensors="tf", truncation=True, padding=True, return_offsets_mapping=True)
    tokens = tok_tensor["input_ids"].numpy()[0]
    offsets = tok_tensor["offset_mapping"].numpy()[0]
    dev_text.append(doc_text)
    dev_labels.append(label_tokens(tokens, offsets, spans_to_mask))
    
train_text = []
train_labels = []

for i in range(len(train_file)):
    doc_id = train_file[i]["doc_id"]
    spans_to_mask = train_masks[doc_id]
    spans_to_mask = list({tuple(x) for x in spans_to_mask}) # Make spans unique
    doc_text = train_file[i]["text"]
    tok_tensor = tokenizer(doc_text, return_tensors="tf", truncation=True, padding=True, return_offsets_mapping=True)
    tokens = tok_tensor["input_ids"].numpy()[0]
    offsets = tok_tensor["offset_mapping"].numpy()[0]
    
    train_text.append(doc_text)
    train_labels.append(label_tokens(tokens, offsets, spans_to_mask))

test_text = []
test_labels = []

for i in range(len(test_file)):
    doc_id = test_file[i]["doc_id"]
    spans_to_mask = test_masks[doc_id]
    spans_to_mask = list({tuple(x) for x in spans_to_mask}) # Make spans unique
    doc_text = test_file[i]["text"]
    tok_tensor = tokenizer(doc_text, return_tensors="tf", truncation=True, padding=True, return_offsets_mapping=True)
    tokens = tok_tensor["input_ids"].numpy()[0]
    offsets = tok_tensor["offset_mapping"].numpy()[0]
    
    test_text.append(doc_text)
    test_labels.append(label_tokens(tokens, offsets, spans_to_mask))

In [88]:
# Pad labels to max length

MAX_LEN = 4096

for i in range(len(dev_labels)):
    curr_len = len(dev_labels[i])
    
    if curr_len < MAX_LEN:
        to_add = [0] * (MAX_LEN - curr_len)
        dev_labels[i].extend(to_add)
        
for i in range(len(train_labels)):
    curr_len = len(train_labels[i])
    
    if curr_len < MAX_LEN:
        to_add = [0] * (MAX_LEN - curr_len)
        train_labels[i].extend(to_add)
        
for i in range(len(test_labels)):
    curr_len = len(test_labels[i])
    
    if curr_len < MAX_LEN:
        to_add = [0] * (MAX_LEN - curr_len)
        test_labels[i].extend(to_add)
        
dev_labels = np.asarray(dev_labels)
train_labels = np.asarray(train_labels)
test_labels = np.asarray(test_labels)

In [89]:
# Tokenize input

dev_text_tokenized = tokenizer(dev_text, truncation=True, padding="max_length", return_tensors="tf")
train_text_tokenized = tokenizer(train_text, truncation=True, padding="max_length", return_tensors="tf")
test_text_tokenized = tokenizer(test_text, truncation=True, padding="max_length", return_tensors="tf")

## Build Model

In [105]:
def create_longformer_model(dropout=0.3,
                            learning_rate=0.00002):
    
    """Build FFNN on top of Longformer hidden states"""
    
    longformer_model = TFLongformerModel.from_pretrained("allenai/longformer-base-4096")
    longformer_model.trainable = True
    
    input_ids = tf.keras.layers.Input(shape=(4096,), dtype=tf.int32, name="input_ids_layer") # max_len for Longformer
    longformer_out = longformer_model(input_ids).last_hidden_state
    
    hidden_1 = tf.keras.layers.Dense(512, activation="relu", name="hidden_1")(longformer_out)
    hidden_1 = tf.keras.layers.Dropout(dropout)(hidden_1)
    hidden_2 = tf.keras.layers.Dense(256, activation="relu", name="hidden_2")(hidden_1)
    hidden_2 = tf.keras.layers.Dropout(dropout)(hidden_2)
    
    classification = tf.keras.layers.Dense(2, activation="sigmoid", name="classification_layer")(hidden_2)
    
    classification_model = tf.keras.Model(inputs=[input_ids], outputs=[classification])
    
    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                                 loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False), 
                                 metrics="accuracy")
    
    return classification_model

In [106]:
model = create_longformer_model()

Some layers from the model checkpoint at allenai/longformer-base-4096 were not used when initializing TFLongformerModel: ['lm_head']
- This IS expected if you are initializing TFLongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFLongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFLongformerModel were initialized from the model checkpoint at allenai/longformer-base-4096.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFLongformerModel for predictions without further training.


In [115]:
history = model.fit(train_text_tokenized["input_ids"],
                    train_labels,
                    batch_size=1,
                    epochs=2)

Epoch 1/2


2022-11-12 22:59:42.116628: W tensorflow/core/grappler/optimizers/loop_optimizer.cc:907] Skipping loop optimization for Merge node with control input: model_6/tf_longformer_model_9/longformer/encoder/layer_._9/attention/self/cond_2/branch_executed/_1009


Epoch 2/2


In [151]:
# Save model

model.save("../models/longformer_ffnn.h5", save_format="tf")

In [150]:
print(history.history)

{'loss': [0.05220803618431091, 0.03706428408622742], 'accuracy': [0.9804208278656006, 0.9854947924613953]}


In [149]:
model.summary()

Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_ids_layer (InputLayer  [(None, 4096)]           0         
 )                                                               
                                                                 
 tf_longformer_model_9 (TFLo  TFLongformerBaseModelOut  148659456
 ngformerModel)              putWithPooling(last_hidd            
                             en_state=(None, 4096, 76            
                             8),                                 
                              pooler_output=(None, 76            
                             8),                                 
                              hidden_states=None, att            
                             entions=None, global_att            
                             entions=None)                       
                                                           

# Assess Performance

In [123]:
preds = model(test_text_tokenized["input_ids"])

In [124]:
predicted_token_class_ids = tf.math.argmax(preds, axis=-1)

In [128]:
predicted_token_class_ids

<tf.Tensor: shape=(127, 4096), dtype=int64, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])>

In [130]:
# Save predicted_token_class_ids to avoid running inference again

np.savetxt("../predictions/longformer_ffnn_preds_tab_test_set.txt", predicted_token_class_ids)

## Wikipedia
### Prepare Data

In [138]:
# Load wiki data

with open("../data/raw/wiki-summaries/annotated_wikipedia.json") as file:
    wiki_file = json.load(file)

with open("../data/processed/wiki_masks.json") as file:
    wiki_masks = json.load(file)

In [139]:
# Create labels

wiki_text = []
wiki_labels = []

for i in range(len(wiki_file)):
    doc_id = wiki_file[i]["doc_id"]
    spans_to_mask = wiki_masks[doc_id]
    spans_to_mask = list({tuple(x) for x in spans_to_mask}) # Make spans unique
    doc_text = wiki_file[i]["text"]
    tok_tensor = tokenizer(doc_text, return_tensors="tf", truncation=True, padding=True, return_offsets_mapping=True)
    tokens = tok_tensor["input_ids"].numpy()[0]
    offsets = tok_tensor["offset_mapping"].numpy()[0]
    wiki_text.append(doc_text)
    wiki_labels.append(label_tokens(tokens, offsets, spans_to_mask))

In [140]:
# Pad labels to max length

MAX_LEN = 4096

for i in range(len(wiki_labels)):
    curr_len = len(wiki_labels[i])
    
    if curr_len < MAX_LEN:
        to_add = [0] * (MAX_LEN - curr_len)
        wiki_labels[i].extend(to_add)
        
wiki_labels = np.asarray(wiki_labels)

In [141]:
# Tokenize input

wiki_text_tokenized = tokenizer(wiki_text, truncation=True, padding="max_length", return_tensors="tf")

### Generate Predictions

In [142]:
preds = model(wiki_text_tokenized["input_ids"])

2022-11-13 08:52:40.950143: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 104375255040 exceeds 10% of free system memory.
2022-11-13 08:53:37.473129: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 104579112960 exceeds 10% of free system memory.
2022-11-13 08:53:53.126663: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 104579112960 exceeds 10% of free system memory.
2022-11-13 08:55:34.232036: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 83717652480 exceeds 10% of free system memory.
2022-11-13 08:55:39.582864: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 83608928256 exceeds 10% of free system memory.


In [143]:
predicted_token_class_ids = tf.math.argmax(preds, axis=-1)

In [144]:
predicted_token_class_ids

<tf.Tensor: shape=(553, 4096), dtype=int64, numpy=
array([[0, 1, 1, ..., 0, 0, 0],
       [0, 1, 1, ..., 0, 0, 0],
       [0, 1, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 1, ..., 0, 0, 0],
       [0, 1, 1, ..., 0, 0, 0]])>

In [145]:
# Save predicted_token_class_ids to avoid running inference again

np.savetxt("../predictions/longformer_ffnn_preds.txt", predicted_token_class_ids)

# Calculate Precision, Recall, and AUC

In [131]:
def calc_precision(pred_list, label_list):
    """Calculates precision of batch of predictions"""
    
    tp = 0
    fp = 0
    
    for i in range(len(pred_list)):
        for j in range(len(pred_list[i])):
        
            if pred_list[i][j] == 1:
                if label_list[i][j] == 1:
                    tp += 1
                else:
                    fp += 1
            else:
                continue
                
    return tp / (tp + fp)

In [132]:
def calc_recall(pred_list, label_list):
    """Calculates recall of batch of predictions"""
    
    tp = 0 
    fn = 0
    
    for i in range(len(pred_list)):
        for j in range(len(pred_list[i])):
            
            if pred_list[i][j] == 1:
                if label_list[i][j] == 1:
                    tp += 1
                else:
                    tp += 0
                
            else:
                if label_list[i][j] == 1:
                    fn += 1
                else:
                    fn += 0
    
    return tp / (tp + fn)

## Test Set - Metrics

In [133]:
precision = calc_precision(predicted_token_class_ids, test_labels)
print (f' Token level precision: {precision}')

 Token level precision: 0.9460919353967482


In [134]:
recall = calc_recall(predicted_token_class_ids, test_labels)
print (f' Token level recall: {recall}')

 Token level recall: 0.7401850990998605


In [137]:
# Multilabel indicators are not supported in sklearn for AUC
# Loop through preds and take avg of AUC

auc = []

for i in range(len(predicted_token_class_ids)):
    
    fpr, tpr, thresholds = metrics.roc_curve(test_labels[i], predicted_token_class_ids[i], pos_label=1)
    auc.append(metrics.auc(fpr, tpr))

auc = sum(auc)/len(auc)
print (f' Average AUC: {auc}')

 Average AUC: 0.8846555526884508


## Wikipedia - Metrics

In [152]:
precision = calc_precision(predicted_token_class_ids, wiki_labels)
print (f' Token level precision: {precision}')

 Token level precision: 0.9033591485722431


In [153]:
recall = calc_recall(predicted_token_class_ids, wiki_labels)
print (f' Token level recall: {recall}')

 Token level recall: 0.7630533370791026


In [154]:
# Multilabel indicators are not supported in sklearn for AUC
# Loop through preds and take avg of AUC

auc = []

for i in range(len(predicted_token_class_ids)):
    
    fpr, tpr, thresholds = metrics.roc_curve(wiki_labels[i], predicted_token_class_ids[i], pos_label=1)
    auc.append(metrics.auc(fpr, tpr))

auc = sum(auc)/len(auc)
print (f' Average AUC: {auc}')

 Average AUC: 0.8875962720281091
