# Baseline: Longformer

## Imports

In [47]:
import tensorflow as tf
from transformers import LongformerTokenizerFast, TFLongformerForTokenClassification

import numpy as np
import json

from sklearn import metrics
from sklearn.metrics import average_precision_score, precision_recall_curve
from sklearn.metrics import auc, plot_precision_recall_curve, roc_curve

In [2]:
# Pull in tokenizer and model

tokenizer = LongformerTokenizerFast.from_pretrained("allenai/longformer-base-4096")
model = TFLongformerForTokenClassification.from_pretrained("allenai/longformer-base-4096")

2022-11-12 17:35:18.108094: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some layers from the model checkpoint at allenai/longformer-base-4096 were not used when initializing TFLongformerForTokenClassification: ['lm_head']
- This IS expected if you are initializing TFLongformerForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFLongformerForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model

In [24]:
# Pull in data

DEV_MASKS_FILE =    "../data/processed/jg_dev_masks.json"
TRAIN_MASKS_FILE =  "../data/processed/jg_train_masks.json"
TEST_MASKS_FILE =   "../data/processed/jg_test_masks.json"

with open("../data/raw/text-anonymization-benchmark/echr_dev.json") as file:
    dev_file = json.load(file)

with open(DEV_MASKS_FILE) as file:
    dev_masks = json.load(file)
    
with open("../data/raw/text-anonymization-benchmark/echr_train.json") as file:
    train_file = json.load(file)
    
with open(TRAIN_MASKS_FILE) as file:
    train_masks = json.load(file)
       
with open("../data/raw/text-anonymization-benchmark/echr_test.json") as file:
    test_file = json.load(file)

with open(TEST_MASKS_FILE) as file:
    test_masks = json.load(file)

## Helper Functions

In [25]:
# Function used to label data

def label_tokens(toks, offs, spans_to_mask):
    """Args: 
            toks - list of token id's
            offs - list of char offsets for each token
       Returns:
            label_list - 0 for non_mask, 1 for mask"""
    
    label_list = []
    mapping_list = []
    
    # Map token_ids back to string
    
    for token, pos in zip(toks, offs):
        mapping_list.append([token, pos[0], pos[1]])
    
    # Determine if each token should be masked
    spans_to_mask.sort(key=lambda tup: tup[0]) #order spans, ascending
    
    j=0
    
    for i in range(len(mapping_list)):
        
        temp_list = []
        stop=False
        
        while not stop and j < len(spans_to_mask):
            
            if (mapping_list[i][1] >= spans_to_mask[j][0]) and (mapping_list[i][2] <= spans_to_mask[j][1]):
                temp_list.append(1)
            else:
                temp_list.append(0)           

            # Since spans and mapping_list are ordered, break to allow it to catch up
            if(spans_to_mask[j][1] > mapping_list[i][2]):
                stop=True
            else:
                j = j+1
            
        if sum(temp_list) >= 1:
            label_list.append(1)
        else:
            label_list.append(0)
        
    
    return label_list  

## Validate Data Labeling Function

In [5]:
import unittest

class TestNotebook(unittest.TestCase):

    def test_spans_to_mask(self):
        with open(DEV_MASKS_FILE) as file:
            dev_masks = json.load(file)
        spans_to_mask = dev_masks["001-83927"]
        spans_to_mask = list({tuple(x) for x in spans_to_mask}) # Make spans unique
        expected_spans_to_mask = [(2920, 2935), (343, 354), (3010, 3027), (2218, 2236), (254, 269), (1561, 1569), (516, 531), (2422, 2432), (803, 807), (2073, 2085), (841, 861), (1100, 1112), (292, 307), (1212, 1228), (1100, 1206), (3215, 3229), (1468, 1480), (1561, 1586), (3076, 3089), (379, 387), (867, 882), (1118, 1205), (2202, 2213), (3094, 3112), (2179, 2196), (54, 62)]

        #does equality, does not care about order
        self.assertCountEqual(spans_to_mask, expected_spans_to_mask)

    def test_tokenizer(self):
        doc_text = "PROCEDURE\n\nThe case originated in an application (no. 40593/04) against the Republic of Turkey lodged with the Court under Article 34 of the Convention for the Protection of Human Rights and Fundamental Freedoms (“the Convention”) by a Turkish national, Mr Cengiz Polat (“the applicant”), on 15 October 2004.\n\nThe applicant was represented by Mr E. Kanar, a lawyer practising in Istanbul. The Turkish Government (“the Government”) did not designate an Agent for the purposes of the proceedings before the Court.\n\nOn 6 November 2006 the Court decided to give notice of the application to the Government. Applying Article 29 § 3 of the Convention, it decided to rule on the admissibility and merits of the application at the same time.\n\nTHE FACTS\n\nTHE CIRCUMSTANCES OF THE CASE\n\nThe applicant was born in 1965. He is currently detained in the Edirne F-type Prison.\n\nOn 6 February 1993 the applicant was arrested and placed in police custody by officers from the Anti-terror branch of the Istanbul Security Directorate on suspicion of involvement in the activities of an illegal armed organisation, the TKP/ML-TIKKO (the Turkish Communist Party/Marxist Leninist -Turkish Workers and Peasants' Liberation Army).\n\nOn 15 February 1993 the applicant was brought before the public prosecutor and then the investigating judge at the Istanbul State Security Court. On the same day the investigating judge remanded the applicant in custody pending trial.\n\nBy an indictment dated 5 April 1993, the public prosecutor initiated criminal proceedings against the applicant and nineteen other defendants before the Istanbul State Security Court, accusing them, inter alia, of membership of an illegal armed organisation and of involvement in activities which undermined the constitutional order of the State. The prosecution sought the death penalty under Article 146 § 1 of the Criminal Code.\n\nIn the course of the proceedings, the State Security Court rejected the applicant's requests for release, taking into account the nature of the alleged offence and the state of the evidence.\n\nOn 12 June 2000 the applicant was convicted as charged by the Istanbul State Security Court and sentenced to life imprisonment.\n\nOn 15 May 2001 the Court of Cassation quashed the applicant's conviction for procedural reasons. The case was remitted to the Istanbul State Security Court for further examination and the applicant remained in custody.\n\nOn 7 May 2004 State Security Courts were abolished following a constitutional amendment and the applicant's case was transmitted to the Istanbul Assize Court. In the course of the proceedings, the domestic courts rejected the applicant's requests for release, taking into account the nature of the alleged offence and the documents in the case file. The applicant challenged these decisions under Article 298 of the Criminal Procedure Code; however, the domestic courts rejected all his requests.\n\nOn 31 January 2005 the Istanbul Assize Court found the applicant guilty and sentenced him to life imprisonment under Article 146 § 1 of the Criminal Code.\n\nOn 20 March 2006 the Court of Cassation quashed the decision of the Assize Court and the case file was remitted to Istanbul Assize Court.\n\nOn 4 October 2006 the applicant was released pending trial. According to the information in the case file, as submitted by the parties, the case is still pending before the Istanbul Assize Court."
        tok_tensor = tokenizer(doc_text, return_tensors="tf", padding=True, return_offsets_mapping=True)
        tokens = tok_tensor["input_ids"].numpy()[0]
        offsets = tok_tensor["offset_mapping"].numpy()[0]
        tokens_expected = np.array([    0,  4454,  4571,  1691, 12435, 50118, 50118,   133,   403, 19575,    11,    41,  2502,    36,  2362,     4,   843, 39785,    73,  3387,    43,   136,     5,  3497,     9,  2769, 15434,    19,     5,   837,   223,  6776,  2631,     9,     5,  9127,    13,     5,  5922,     9,  3861,  3941,     8, 37898, 28000,  6806,    36,    17,    48,   627,  9127,    17,    46,    43,    30,    10,  4423,   632,     6,   427,   230,  3314,  1210,  6189,   415,    36,    17,    48,   627, 20321,    17,    46,   238,    15,   379,   779,  4482,     4, 50118, 50118,   133, 20321,    21,  4625,    30,   427,   381,     4,  7542,   271,     6,    10,  2470, 21886,  3009,    11, 12275,     4,    20,  4423,  1621,    36,    17,    48,   627,  1621,    17,    46,    43,   222,    45, 31815,    41, 18497,    13,     5,  6216,     9,     5,  7069,   137,     5,   837,     4, 50118, 50118,  4148,   231,   759,  3503,     5,   837,  1276,     7,   492,  3120,     9,     5,  2502,     7,     5,  1621,     4,  3166, 13010,  6776,  1132, 39207,   155,     9,     5,  9127,     6,    24,  1276,     7,  2178,    15,     5,  2329, 17745, 12203,     8, 20273,     9,     5,  2502,    23,     5,   276,    86,     4, 50118, 50118, 13354, 26207,  2685, 50118, 50118, 13354,   230, 44160,  5725,  4014,  9363,  1723,  3243,  1941, 39743, 50118, 50118,   133, 20321,    21,  2421,    11, 18202,     4,    91,    16,   855,  5624,    11,     5,  2344,   853,   858,   274,    12, 12528, 15591,     4, 50118, 50118,  4148,   231,   902,  9095,     5, 20321,    21,  1128,     8,  2325,    11,   249,  3469,    30,  1024,    31,     5,  9511,    12, 26213,  6084,     9,     5, 12275,  2010, 23691,    15,  8551,     9,  5292,    11,     5,  1713,     9,    41,  2439,  3234,  6010,     6,     5,   255,   530,   510,    73, 10537,    12, 13216, 26228,   673,    36,   627,  4423, 12416,  1643,    73, 44849,   661, 37632,   661,   111, 37300, 10586,     8,  4119,   281,  3277,   108, 22499,  2938,   322, 50118, 50118,  4148,   379,   902,  9095,     5, 20321,    21,  1146,   137,     5,   285,  5644,     8,   172,     5,  3219,  1679,    23,     5, 12275,   331,  2010,   837,     4,   374,     5,   276,   183,     5,  3219,  1679,  6398, 13833,     5, 20321,    11,  3469,  5319,  1500,     4, 50118, 50118,  2765,    41,  9645,  7000,   195,   587,  9095,     6,     5,   285,  5644,  9608,  1837,  7069,   136,     5, 20321,     8, 40126,    97,  9483,   137,     5, 12275,   331,  2010,   837,     6,  8601,   106,     6,  3222,  1076,   493,     6,     9,  6332,     9,    41,  2439,  3234,  6010,     8,     9,  5292,    11,  1713,    61, 21167,     5,  6100,   645,     9,     5,   331,     4,    20,  6914,  2952,     5,   744,  2861,   223,  6776, 24543, 39207,   112,     9,     5, 10203,  8302,     4, 50118, 50118,  1121,     5,   768,     9,     5,  7069,     6,     5,   331,  2010,   837,  3946,     5, 20321,    18,  5034,    13,   800,     6,   602,    88,  1316,     5,  2574,     9,     5,  1697,  8637,     8,     5,   194,     9,     5,  1283,     4, 50118, 50118,  4148,   316,   502,  3788,     5, 20321,    21,  3828,    25,  1340,    30,     5, 12275,   331,  2010,   837,     8,  4018,     7,   301, 14804,     4, 50118, 50118,  4148,   379,   392,  5155,     5,   837,     9, 11710,  1258,  2677,  9512,     5, 20321,    18,  6866,    13, 24126,  2188,     4,    20,   403,    21,  6398, 16430,     7,     5, 12275,   331,  2010,   837,    13,   617,  9027,     8,     5, 20321,  2442,    11,  3469,     4, 50118, 50118,  4148,   262,   392,  4482,   331,  2010, 25627,    58, 33110,   511,    10,  6100,  8322,     8,     5, 20321,    18,   403,    21, 20579,     7,     5, 12275,  6331,  2072,   837,     4,    96,     5,   768,     9,     5,  7069,     6,     5,  1897,  4354,  3946,     5, 20321,    18,  5034,    13,   800,     6,   602,    88,  1316,     5,  2574,     9,     5,  1697,  8637,     8,     5,  2339,    11,     5,   403,  2870,     4,    20, 20321,  6835,   209,  2390,   223,  6776, 37353,     9,     5, 10203, 40209,  8302,   131,   959,     6,     5,  1897,  4354,  3946,    70,    39,  5034,     4, 50118, 50118,  4148,  1105,   644,  4013,     5, 12275,  6331,  2072,   837,   303,     5, 20321,  2181,     8,  4018,   123,     7,   301, 14804,   223,  6776, 24543, 39207,   112,     9,     5, 10203,  8302,     4, 50118, 50118,  4148,   291,   494,  3503,     5,   837,     9, 11710,  1258,  2677,  9512,     5,   568,     9,     5,  6331,  2072,   837,     8,     5,   403,  2870,    21,  6398, 16430,     7, 12275,  6331,  2072,   837,     4, 50118, 50118,  4148,   204,   779,  3503,     5, 20321,    21,   703,  5319,  1500,     4,   767,     7,     5,   335,    11,     5,   403,  2870,     6,    25,  4813,    30,     5,  1799,     6,     5,   403,    16,   202,  5319,   137,     5, 12275,  6331,  2072,   837,     4,     2])
    
        #self.assertEqual(tokens, tokens_expected, "Tokenizer is producing different results than expected.")
        self.assertTrue((tokens==tokens_expected).all())
    
    def test_label_tokens(self):
        doc_text = "PROCEDURE\n\nThe case originated in an application (no. 40593/04) against the Republic of Turkey lodged with the Court under Article 34 of the Convention for the Protection of Human Rights and Fundamental Freedoms (“the Convention”) by a Turkish national, Mr Cengiz Polat (“the applicant”), on 15 October 2004.\n\nThe applicant was represented by Mr E. Kanar, a lawyer practising in Istanbul. The Turkish Government (“the Government”) did not designate an Agent for the purposes of the proceedings before the Court.\n\nOn 6 November 2006 the Court decided to give notice of the application to the Government. Applying Article 29 § 3 of the Convention, it decided to rule on the admissibility and merits of the application at the same time.\n\nTHE FACTS\n\nTHE CIRCUMSTANCES OF THE CASE\n\nThe applicant was born in 1965. He is currently detained in the Edirne F-type Prison.\n\nOn 6 February 1993 the applicant was arrested and placed in police custody by officers from the Anti-terror branch of the Istanbul Security Directorate on suspicion of involvement in the activities of an illegal armed organisation, the TKP/ML-TIKKO (the Turkish Communist Party/Marxist Leninist -Turkish Workers and Peasants' Liberation Army).\n\nOn 15 February 1993 the applicant was brought before the public prosecutor and then the investigating judge at the Istanbul State Security Court. On the same day the investigating judge remanded the applicant in custody pending trial.\n\nBy an indictment dated 5 April 1993, the public prosecutor initiated criminal proceedings against the applicant and nineteen other defendants before the Istanbul State Security Court, accusing them, inter alia, of membership of an illegal armed organisation and of involvement in activities which undermined the constitutional order of the State. The prosecution sought the death penalty under Article 146 § 1 of the Criminal Code.\n\nIn the course of the proceedings, the State Security Court rejected the applicant's requests for release, taking into account the nature of the alleged offence and the state of the evidence.\n\nOn 12 June 2000 the applicant was convicted as charged by the Istanbul State Security Court and sentenced to life imprisonment.\n\nOn 15 May 2001 the Court of Cassation quashed the applicant's conviction for procedural reasons. The case was remitted to the Istanbul State Security Court for further examination and the applicant remained in custody.\n\nOn 7 May 2004 State Security Courts were abolished following a constitutional amendment and the applicant's case was transmitted to the Istanbul Assize Court. In the course of the proceedings, the domestic courts rejected the applicant's requests for release, taking into account the nature of the alleged offence and the documents in the case file. The applicant challenged these decisions under Article 298 of the Criminal Procedure Code; however, the domestic courts rejected all his requests.\n\nOn 31 January 2005 the Istanbul Assize Court found the applicant guilty and sentenced him to life imprisonment under Article 146 § 1 of the Criminal Code.\n\nOn 20 March 2006 the Court of Cassation quashed the decision of the Assize Court and the case file was remitted to Istanbul Assize Court.\n\nOn 4 October 2006 the applicant was released pending trial. According to the information in the case file, as submitted by the parties, the case is still pending before the Istanbul Assize Court."
        tok_tensor = tokenizer(doc_text, return_tensors="tf", padding=True, return_offsets_mapping=True)
        tokens = tok_tensor["input_ids"].numpy()[0]
        offsets = tok_tensor["offset_mapping"].numpy()[0]
        spans_to_mask = [(2920, 2935), (343, 354), (3010, 3027), (2218, 2236), (254, 269), (1561, 1569), (516, 531), (2422, 2432), (803, 807), (2073, 2085), (841, 861), (1100, 1112), (292, 307), (1212, 1228), (1100, 1206), (3215, 3229), (1468, 1480), (1561, 1586), (3076, 3089), (379, 387), (867, 882), (1118, 1205), (2202, 2213), (3094, 3112), (2179, 2196), (54, 62)]
    
        labels = label_tokens(tokens, offsets, spans_to_mask)
        expected_labels = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
        self.assertTrue((labels==expected_labels).all())

  
unittest.main(argv=[''], verbosity=2, exit=False)

test_label_tokens (__main__.TestNotebook) ... ok
test_spans_to_mask (__main__.TestNotebook) ... ok
test_tokenizer (__main__.TestNotebook) ... ok

----------------------------------------------------------------------
Ran 3 tests in 0.026s

OK


<unittest.main.TestProgram at 0x7fb1886bb850>

In [6]:
def test_masked_tokens():
    doc_text = "PROCEDURE\n\nThe case originated in an application (no. 40593/04) against the Republic of Turkey lodged with the Court under Article 34 of the Convention for the Protection of Human Rights and Fundamental Freedoms (“the Convention”) by a Turkish national, Mr Cengiz Polat (“the applicant”), on 15 October 2004.\n\nThe applicant was represented by Mr E. Kanar, a lawyer practising in Istanbul. The Turkish Government (“the Government”) did not designate an Agent for the purposes of the proceedings before the Court.\n\nOn 6 November 2006 the Court decided to give notice of the application to the Government. Applying Article 29 § 3 of the Convention, it decided to rule on the admissibility and merits of the application at the same time.\n\nTHE FACTS\n\nTHE CIRCUMSTANCES OF THE CASE\n\nThe applicant was born in 1965. He is currently detained in the Edirne F-type Prison.\n\nOn 6 February 1993 the applicant was arrested and placed in police custody by officers from the Anti-terror branch of the Istanbul Security Directorate on suspicion of involvement in the activities of an illegal armed organisation, the TKP/ML-TIKKO (the Turkish Communist Party/Marxist Leninist -Turkish Workers and Peasants' Liberation Army).\n\nOn 15 February 1993 the applicant was brought before the public prosecutor and then the investigating judge at the Istanbul State Security Court. On the same day the investigating judge remanded the applicant in custody pending trial.\n\nBy an indictment dated 5 April 1993, the public prosecutor initiated criminal proceedings against the applicant and nineteen other defendants before the Istanbul State Security Court, accusing them, inter alia, of membership of an illegal armed organisation and of involvement in activities which undermined the constitutional order of the State. The prosecution sought the death penalty under Article 146 § 1 of the Criminal Code.\n\nIn the course of the proceedings, the State Security Court rejected the applicant's requests for release, taking into account the nature of the alleged offence and the state of the evidence.\n\nOn 12 June 2000 the applicant was convicted as charged by the Istanbul State Security Court and sentenced to life imprisonment.\n\nOn 15 May 2001 the Court of Cassation quashed the applicant's conviction for procedural reasons. The case was remitted to the Istanbul State Security Court for further examination and the applicant remained in custody.\n\nOn 7 May 2004 State Security Courts were abolished following a constitutional amendment and the applicant's case was transmitted to the Istanbul Assize Court. In the course of the proceedings, the domestic courts rejected the applicant's requests for release, taking into account the nature of the alleged offence and the documents in the case file. The applicant challenged these decisions under Article 298 of the Criminal Procedure Code; however, the domestic courts rejected all his requests.\n\nOn 31 January 2005 the Istanbul Assize Court found the applicant guilty and sentenced him to life imprisonment under Article 146 § 1 of the Criminal Code.\n\nOn 20 March 2006 the Court of Cassation quashed the decision of the Assize Court and the case file was remitted to Istanbul Assize Court.\n\nOn 4 October 2006 the applicant was released pending trial. According to the information in the case file, as submitted by the parties, the case is still pending before the Istanbul Assize Court."
    tok_tensor = tokenizer(doc_text, return_tensors="tf", padding=True, return_offsets_mapping=True)
    tokens = tok_tensor["input_ids"].numpy()[0]
    offsets = tok_tensor["offset_mapping"].numpy()[0]
    spans_to_mask = [(2920, 2935), (343, 354), (3010, 3027), (2218, 2236), (254, 269), (1561, 1569), (516, 531), (2422, 2432), (803, 807), (2073, 2085), (841, 861), (1100, 1112), (292, 307), (1212, 1228), (1100, 1206), (3215, 3229), (1468, 1480), (1561, 1586), (3076, 3089), (379, 387), (867, 882), (1118, 1205), (2202, 2213), (3094, 3112), (2179, 2196), (54, 62)]
    
    labels = label_tokens(tokens, offsets, spans_to_mask)
    
    masked_doc_text = []
    for token, offset, label in zip(tokens, offsets, labels):
        if label == 1:
            #masked_doc_text.append("[MASK]")
            str="*" + doc_text[offset[0]:offset[1]] +"*"        
            masked_doc_text.append(str)
        else:
            masked_doc_text.append(doc_text[offset[0]:offset[1]])
    print(masked_doc_text)
    masked_doc_text_joined = ''.join(x for x in masked_doc_text)
    print(masked_doc_text_joined)

test_masked_tokens()

['', 'PR', 'OC', 'ED', 'URE', '\n', '\n', 'The', 'case', 'originated', 'in', 'an', 'application', '(', 'no', '.', '*40*', '*593*', '*/*', '*04*', ')', 'against', 'the', 'Republic', 'of', 'Turkey', 'lodged', 'with', 'the', 'Court', 'under', 'Article', '34', 'of', 'the', 'Convention', 'for', 'the', 'Protection', 'of', 'Human', 'Rights', 'and', 'Fundamental', 'Freed', 'oms', '(', '“', '“', 'the', 'Convention', '”', '”', ')', 'by', 'a', 'Turkish', 'national', ',', '*Mr*', '*C*', '*eng*', '*iz*', '*Pol*', '*at*', '(', '“', '“', 'the', 'applicant', '”', '”', '),', 'on', '*15*', '*October*', '*2004*', '.', '\n', '\n', 'The', 'applicant', 'was', 'represented', 'by', '*Mr*', '*E*', '*.*', '*Kan*', '*ar*', ',', 'a', 'lawyer', 'pract', 'ising', 'in', '*Istanbul*', '.', 'The', 'Turkish', 'Government', '(', '“', '“', 'the', 'Government', '”', '”', ')', 'did', 'not', 'designate', 'an', 'Agent', 'for', 'the', 'purposes', 'of', 'the', 'proceedings', 'before', 'the', 'Court', '.', '\n', '\n', 'On', '*6

## Create Labels and Tokenize Input

In [26]:
# Create labels

dev_text = []
dev_labels = []

for i in range(len(dev_file)):
    doc_id = dev_file[i]["doc_id"]
    spans_to_mask = dev_masks[doc_id]
    spans_to_mask = list({tuple(x) for x in spans_to_mask}) # Make spans unique
    doc_text = dev_file[i]["text"]
    tok_tensor = tokenizer(doc_text, return_tensors="tf", truncation=True, padding=True, return_offsets_mapping=True)
    tokens = tok_tensor["input_ids"].numpy()[0]
    offsets = tok_tensor["offset_mapping"].numpy()[0]
    dev_text.append(doc_text)
    dev_labels.append(label_tokens(tokens, offsets, spans_to_mask))
    
train_text = []
train_labels = []

for i in range(len(train_file)):
    doc_id = train_file[i]["doc_id"]
    spans_to_mask = train_masks[doc_id]
    spans_to_mask = list({tuple(x) for x in spans_to_mask}) # Make spans unique
    doc_text = train_file[i]["text"]
    tok_tensor = tokenizer(doc_text, return_tensors="tf", truncation=True, padding=True, return_offsets_mapping=True)
    tokens = tok_tensor["input_ids"].numpy()[0]
    offsets = tok_tensor["offset_mapping"].numpy()[0]
    
    train_text.append(doc_text)
    train_labels.append(label_tokens(tokens, offsets, spans_to_mask))

test_text = []
test_labels = []

for i in range(len(test_file)):
    doc_id = test_file[i]["doc_id"]
    spans_to_mask = test_masks[doc_id]
    spans_to_mask = list({tuple(x) for x in spans_to_mask}) # Make spans unique
    doc_text = test_file[i]["text"]
    tok_tensor = tokenizer(doc_text, return_tensors="tf", truncation=True, padding=True, return_offsets_mapping=True)
    tokens = tok_tensor["input_ids"].numpy()[0]
    offsets = tok_tensor["offset_mapping"].numpy()[0]
    
    test_text.append(doc_text)
    test_labels.append(label_tokens(tokens, offsets, spans_to_mask))

In [27]:
# Pad labels to max length

MAX_LEN = 4096

for i in range(len(dev_labels)):
    curr_len = len(dev_labels[i])
    
    if curr_len < MAX_LEN:
        to_add = [0] * (MAX_LEN - curr_len)
        dev_labels[i].extend(to_add)
        
for i in range(len(train_labels)):
    curr_len = len(train_labels[i])
    
    if curr_len < MAX_LEN:
        to_add = [0] * (MAX_LEN - curr_len)
        train_labels[i].extend(to_add)
        
for i in range(len(test_labels)):
    curr_len = len(test_labels[i])
    
    if curr_len < 2594:  # Max sequence length in test set
        to_add = [0] * (2594 - curr_len)
        test_labels[i].extend(to_add)
        
dev_labels = np.asarray(dev_labels)
train_labels = np.asarray(train_labels)
test_labels = np.asarray(test_labels)

In [28]:
# Tokenize input

dev_text_tokenized = tokenizer(dev_text, truncation=True, padding=True, return_tensors="tf")
train_text_tokenized = tokenizer(train_text, truncation=True, padding=True, return_tensors="tf")
test_text_tokenized = tokenizer(test_text, truncation=True, padding=True, return_tensors="tf")

## Training

In [9]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00002),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=["accuracy"])

In [10]:
history = model.fit(train_text_tokenized["input_ids"],
                    train_labels,
                    batch_size=1,
                    epochs=2)

Epoch 1/2


2022-11-03 00:21:28.570963: W tensorflow/core/grappler/optimizers/loop_optimizer.cc:907] Skipping loop optimization for Merge node with control input: tf_longformer_for_token_classification/longformer/encoder/layer_._9/attention/self/cond_2/branch_executed/_1009


Epoch 2/2


In [17]:
# Save model

model.save_pretrained("../models/longformer_baseline.h5", save_format="tf")

In [19]:
print(history.history)

{'loss': [0.04859026148915291, 0.03456227853894234], 'accuracy': [0.9817197918891907, 0.9862558841705322]}


# Assess Performance

In [8]:
# Load the fine-tuned model

model = TFLongformerForTokenClassification.from_pretrained("../models/longformer_baseline.h5")

Some layers from the model checkpoint at ../models/longformer_baseline.h5 were not used when initializing TFLongformerForTokenClassification: ['dropout_49']
- This IS expected if you are initializing TFLongformerForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFLongformerForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFLongformerForTokenClassification were initialized from the model checkpoint at ../models/longformer_baseline.h5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFLongformerForTokenClassification for predictions without further training.


In [5]:
model.summary()

Model: "tf_longformer_for_token_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 longformer (TFLongformerMai  multiple                 148068864 
 nLayer)                                                         
                                                                 
 dropout_99 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 148,070,402
Trainable params: 148,070,402
Non-trainable params: 0
_________________________________________________________________


## Test Set

In [31]:
preds = model(test_text_tokenized["input_ids"], labels=test_labels)

In [32]:
print (preds.loss)

tf.Tensor([0.07335881], shape=(1,), dtype=float32)


In [33]:
logits = preds.logits
predicted_token_class_ids = tf.math.argmax(logits, axis=-1)

In [34]:
predicted_token_class_ids

<tf.Tensor: shape=(127, 2594), dtype=int64, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])>

In [35]:
# Save predicted_token_class_ids to avoid running inference again

np.savetxt("../predictions/longformer_preds_tab_test_set.txt", predicted_token_class_ids)

## Wikipedia
### Prepare Data

In [6]:
# Load wiki data

with open("../data/raw/wiki-summaries/annotated_wikipedia.json") as file:
    wiki_file = json.load(file)

with open("../data/processed/wiki_masks.json") as file:
    wiki_masks = json.load(file)

In [7]:
# Create labels

wiki_text = []
wiki_labels = []

for i in range(len(wiki_file)):
    doc_id = wiki_file[i]["doc_id"]
    spans_to_mask = wiki_masks[doc_id]
    spans_to_mask = list({tuple(x) for x in spans_to_mask}) # Make spans unique
    doc_text = wiki_file[i]["text"]
    tok_tensor = tokenizer(doc_text, return_tensors="tf", truncation=True, padding=True, return_offsets_mapping=True)
    tokens = tok_tensor["input_ids"].numpy()[0]
    offsets = tok_tensor["offset_mapping"].numpy()[0]
    wiki_text.append(doc_text)
    wiki_labels.append(label_tokens(tokens, offsets, spans_to_mask))

In [8]:
# Pad labels to max length

MAX_LEN = 730 # Longest sequence in wiki dataset

for i in range(len(wiki_labels)):
    curr_len = len(wiki_labels[i])
    
    if curr_len < MAX_LEN:
        to_add = [0] * (MAX_LEN - curr_len)
        wiki_labels[i].extend(to_add)
        
wiki_labels = np.asarray(wiki_labels)

In [9]:
# Tokenize input

wiki_text_tokenized = tokenizer(wiki_text, truncation=True, padding=True, return_tensors="tf")

### Generate Predictions

In [11]:
preds = model(wiki_text_tokenized["input_ids"], labels=wiki_labels)

In [13]:
print (preds.loss)

tf.Tensor([0.05058768], shape=(1,), dtype=float32)


In [14]:
logits = preds.logits
predicted_token_class_ids = tf.math.argmax(logits, axis=-1)

In [15]:
predicted_token_class_ids

<tf.Tensor: shape=(553, 730), dtype=int64, numpy=
array([[0, 1, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 1, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])>

In [20]:
# Save predicted_token_class_ids to avoid running inference again

np.savetxt("../predictions/longformer_preds.txt", predicted_token_class_ids)

## Calculate Precision, Recall, and AUC

In [36]:
def calc_precision(pred_list, label_list):
    """Calculates precision of batch of predictions"""
    
    tp = 0
    fp = 0
    
    for i in range(len(pred_list)):
        for j in range(len(pred_list[i])):
        
            if pred_list[i][j] == 1:
                if label_list[i][j] == 1:
                    tp += 1
                else:
                    fp += 1
            else:
                continue
                
    return tp / (tp + fp)

In [37]:
def calc_recall(pred_list, label_list):
    """Calculates recall of batch of predictions"""
    
    tp = 0 
    fn = 0
    
    for i in range(len(pred_list)):
        for j in range(len(pred_list[i])):
            
            if pred_list[i][j] == 1:
                if label_list[i][j] == 1:
                    tp += 1
                else:
                    tp += 0
                
            else:
                if label_list[i][j] == 1:
                    fn += 1
                else:
                    fn += 0
    
    return tp / (tp + fn)

## Test Set - Metrics

In [38]:
precision = calc_precision(predicted_token_class_ids, test_labels)
print (f' Token level precision: {precision}')

 Token level precision: 0.9657204374185506


In [39]:
recall = calc_recall(predicted_token_class_ids, test_labels)
print (f' Token level recall: {recall}')

 Token level recall: 0.7202806068545831


In [48]:
# Multilabel indicators are not supported in sklearn for AUC
# Loop through preds and take avg of AUC

auc = []

for i in range(len(predicted_token_class_ids)):
    
    fpr, tpr, thresholds = metrics.roc_curve(test_labels[i], predicted_token_class_ids[i], pos_label=1)
    auc.append(metrics.auc(fpr, tpr))

auc = sum(auc)/len(auc)
print (f' Average AUC: {auc}')

 Average AUC: 0.8745893287470833


## Wikipedia - Metrics

In [53]:
precision = calc_precision(predicted_token_class_ids, wiki_labels)
print (f' Token level precision: {precision}')

 Token level precision: 0.9302261898241962


In [57]:
recall = calc_recall(predicted_token_class_ids, wiki_labels)
print (f' Token level recall: {recall}')

 Token level recall: 0.6816631215635911


In [52]:
# Multilabel indicators are not supported in sklearn for AUC
# Loop through preds and take avg of AUC

auc = []

for i in range(len(predicted_token_class_ids)):
    
    fpr, tpr, thresholds = metrics.roc_curve(wiki_labels[i], predicted_token_class_ids[i], pos_label=1)
    auc.append(metrics.auc(fpr, tpr))

auc = sum(auc)/len(auc)
print (f' Average AUC: {auc}')

 Average AUC: 0.8524968985862243
