In [38]:
import tensorflow as tf
from transformers import LongformerTokenizerFast, TFLongformerModel, TFBertModel, LongformerConfig, BertConfig, TFBertTokenizer

import numpy as np
import json

In [4]:
# Pull in data

DEV_MASKS_FILE =    "../data/processed/jg_dev_masks.json"
TRAIN_MASKS_FILE =  "../data/processed/jg_train_masks.json"
TEST_MASKS_FILE =   "../data/processed/jg_test_masks.json"

with open("../data/raw/text-anonymization-benchmark/echr_dev.json") as file:
    dev_file = json.load(file)

with open(DEV_MASKS_FILE) as file:
    dev_masks = json.load(file)
    
with open("../data/raw/text-anonymization-benchmark/echr_train.json") as file:
    train_file = json.load(file)
    
with open(TRAIN_MASKS_FILE) as file:
    train_masks = json.load(file)
       
with open("../data/raw/text-anonymization-benchmark/echr_test.json") as file:
    test_file = json.load(file)

with open(TEST_MASKS_FILE) as file:
    test_masks = json.load(file)

In [11]:
# Function used to label data

def label_tokens(toks, offs, spans_to_mask):
    """Args: 
            toks - list of token id's
            offs - list of char offsets for each token
       Returns:
            label_list - 0 for non_mask, 1 for mask"""
    
    label_list = []
    mapping_list = []
    
    # Map token_ids back to string
    
    for token, pos in zip(toks, offs):
        mapping_list.append([token, pos[0], pos[1]])
    
    # Determine if each token should be masked
    spans_to_mask.sort(key=lambda tup: tup[0]) #order spans, ascending
    
    j=0
    
    for i in range(len(mapping_list)):
        
        temp_list = []
        stop=False
        
        while not stop and j < len(spans_to_mask):
            
            if (mapping_list[i][1] >= spans_to_mask[j][0]) and (mapping_list[i][2] <= spans_to_mask[j][1]):
                temp_list.append(1)
            else:
                temp_list.append(0)           

            # Since spans and mapping_list are ordered, break to allow it to catch up
            if(spans_to_mask[j][1] > mapping_list[i][2]):
                stop=True
            else:
                j = j+1
            
        if sum(temp_list) >= 1:
            label_list.append(1)
        else:
            label_list.append(0)
        
    
    return label_list  

# Prepare example

In [40]:
long_tokenizer = LongformerTokenizerFast.from_pretrained("allenai/longformer-base-4096")
bert_tokenizer = TFBertTokenizer.from_pretrained("bert-base-cased")

ImportError: 
TFBertTokenizer requires the tensorflow_text library but it was not found in your environment. You can install it with pip as
explained here: https://www.tensorflow.org/text/guide/tf_text_intro.


In [13]:
# Create labels

dev_text = []
dev_labels = []

for i in range(len(dev_file)):
    doc_id = dev_file[i]["doc_id"]
    spans_to_mask = dev_masks[doc_id]
    spans_to_mask = list({tuple(x) for x in spans_to_mask}) # Make spans unique
    doc_text = dev_file[i]["text"]
    tok_tensor = long_tokenizer(doc_text, return_tensors="tf", truncation=True, padding=True, return_offsets_mapping=True)
    tokens = tok_tensor["input_ids"].numpy()[0]
    offsets = tok_tensor["offset_mapping"].numpy()[0]
    dev_text.append(doc_text)
    dev_labels.append(label_tokens(tokens, offsets, spans_to_mask))

In [14]:
# Pad labels to max length

MAX_LEN = 4096

for i in range(len(dev_labels)):
    curr_len = len(dev_labels[i])
    
    if curr_len < MAX_LEN:
        to_add = [0] * (MAX_LEN - curr_len)
        dev_labels[i].extend(to_add)

In [15]:
dev_labels = np.asarray(dev_labels)

In [17]:
dev_text_tokenized = long_tokenizer(dev_text, truncation=True, padding="max_length", return_tensors="tf")

In [27]:
long_input = dev_text_tokenized["input_ids"][0:2]

In [30]:
configuration = LongformerConfig(hidden_size=768)
model = TFLongformerModel(configuration)

In [31]:
long_out = model(long_input)

In [35]:
# This will be concatenated with bert_out

long_out.last_hidden_state

<tf.Tensor: shape=(2, 4096, 768), dtype=float32, numpy=
array([[[ 1.2120625 , -0.05847552, -1.6469035 , ..., -1.4357332 ,
         -0.09620433,  0.04101687],
        [ 1.210464  , -0.0719308 , -3.7255435 , ..., -1.2456849 ,
          0.5807308 ,  0.02675109],
        [ 0.49896082, -0.6654972 , -1.4830106 , ..., -2.4679806 ,
         -0.53557163,  0.7778685 ],
        ...,
        [ 1.1300122 , -1.8451174 , -0.6084656 , ..., -1.35957   ,
          0.71025735,  0.59018344],
        [ 1.1300173 , -1.8451171 , -0.6084657 , ..., -1.3595695 ,
          0.7102577 ,  0.5901856 ],
        [ 1.1300141 , -1.8451166 , -0.60846573, ..., -1.3595701 ,
          0.7102593 ,  0.5901875 ]],

       [[ 1.1743386 , -0.09056882, -1.6945956 , ..., -1.4370364 ,
         -0.10498749,  0.04563296],
        [ 1.1866105 , -0.09132639, -3.768911  , ..., -1.2406118 ,
          0.5697771 ,  0.04183936],
        [ 0.46857306, -0.687666  , -1.527316  , ..., -2.4570878 ,
         -0.5353244 ,  0.79489774],
        ...

In [48]:
run_sum = 0
for i in range(len(dev_text_tokenized["attention_mask"])):
    run_sum += dev_text_tokenized["attention_mask"][i].numpy().sum()
    
print ("Average number of tokens in TAB dev set: ", run_sum/127)



Average number of tokens in TAB dev set:  1146.1574803149606


In [None]:
test_inputs_long = model(input_ids=None, inputs_embeds=embeds)

In [18]:
embeds = np.random.rand(2, 4096, 780)

In [19]:
model(input_ids=None, inputs_embeds=embeds)

TFLongformerBaseModelOutputWithPooling(last_hidden_state=<tf.Tensor: shape=(2, 4096, 780), dtype=float32, numpy=
array([[[-0.49232626, -0.559248  , -1.2002027 , ...,  0.2702738 ,
          0.14512879,  2.6727633 ],
        [-0.5254935 ,  0.82954466,  1.1130804 , ...,  0.9501388 ,
         -0.35745513,  2.556387  ],
        [-0.7506366 ,  0.82961226, -1.3330721 , ...,  0.56662714,
          2.1735842 ,  1.2026347 ],
        ...,
        [-0.20581056, -0.08502284, -0.6277318 , ..., -0.35515654,
          1.5830257 ,  0.5219814 ],
        [ 0.8646531 , -0.90375865, -2.0135632 , ..., -0.3482434 ,
         -0.67312104,  1.8517795 ],
        [ 1.2273629 ,  1.0311047 ,  0.33809602, ...,  0.9056321 ,
         -1.1252242 ,  1.626058  ]],

       [[ 1.2560356 , -1.0362701 , -1.2846804 , ...,  0.25536045,
          1.4691763 ,  2.2017922 ],
        [ 1.778084  ,  0.07749991, -0.58557487, ..., -0.41259348,
         -0.08013434,  1.694983  ],
        [ 1.2428657 ,  0.98604006, -0.7024545 , ...,  1.

# To get Longformer to accept and use different dimensional embeddings,

The dimension of the hidden_size needs to be a multiple of 12 and
You need to update the config class as above