In [1]:
!pip install gdown
!mkdir -p ./data/raw
!mkdir -p ./data/processed
!gdown 1pv7dFLniLEMJXXk5-YL3_kWxpOs7ueJt -O ./data.zip
!unzip ./data.zip -d ./

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Downloading...
From: https://drive.google.com/uc?id=1pv7dFLniLEMJXXk5-YL3_kWxpOs7ueJt
To: /content/data.zip
100% 9.35M/9.35M [00:00<00:00, 214MB/s]
Archive:  ./data.zip
  inflating: ./data/processed/wiki_masks.json  
  inflating: ./data/processed/jg_dev_masks.json  
  inflating: ./data/processed/echr_train_masks.json  
  inflating: ./data/processed/echr_dev_masks.json  
  inflating: ./data/processed/echr_test_masks.json  
  inflating: ./data/processed/jg_test_masks.json  
  inflating: ./data/processed/jg_train_masks.json  
   creating: ./data/raw/text-anonymization-benchmark/
  inflating: ./data/raw/text-anonymization-benchmark/README.md  
  inflating: ./data/raw/text-anonymization-benchmark/LICENSE.txt  
  inflating: ./data/raw/text-anonymization-benchmark/echr_dev.json  
  inflating: ./data/raw/text-anonymization-benchmark/echr_test.json  
  inflating: ./data/raw/text-anonymization-benc

In [2]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Mon Nov 21 06:50:13 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:00:04.0 Off |                    0 |
| N/A   30C    P0    45W / 400W |      0MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
!pip install tokenizers
!pip install transformers
!pip install transformers[sentencepiece]
!pip install torch
!pip install json
!pip install numpy
!pip install sklearn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tokenizers
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 6.1 MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.13.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 1.6 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 84.7 MB/s 
Installing collected packages: huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.0 transformers-4.24.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple

In [4]:
import tokenizers
import transformers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import DebertaV2TokenizerFast

from tqdm.auto import tqdm

import sentencepiece
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
import json
import numpy as np
from tqdm import tqdm
from sklearn import model_selection
from sklearn import metrics
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

VERSION = 'v2.BCE'
#max token input length
MAX_LEN = 768
TRAIN_BATCH_SIZE = 6#32
VALID_BATCH_SIZE = 4#8
EPOCHS =6 #10
BASE_MODEL = 'microsoft/deberta-v3-large'
#BASE_MODEL = 'microsoft/deberta-v3-small'

MODEL_PATH = "model_" + BASE_MODEL.replace('/','_') + "_" + VERSION + ".bin"
#TOKENIZER = AutoTokenizer.from_pretrained(BASE_MODEL)
TOKENIZER = DebertaV2TokenizerFast.from_pretrained(BASE_MODEL)

DEV_FILE = "./data/raw/text-anonymization-benchmark/echr_dev.json"
TRAINING_FILE = "./data/raw/text-anonymization-benchmark/echr_train.json"
TEST_FILE = "./data/raw/text-anonymization-benchmark/echr_test.json"

DEV_MASKS_FILE =    "./data/processed/jg_dev_masks.json"
TRAIN_MASKS_FILE =  "./data/processed/jg_train_masks.json"
TEST_MASKS_FILE =   "./data/processed/jg_test_masks.json"

class EntityDataset:
    def __init__(self, texts, ids, labels, offsets, masks):
        # ids: [[    0,  4454,  4571,  1691, 12435, 50118, ..., 4, 2], [0, 50118,   133,   403, 19575,    11,    41, ..., 2]]
        # texts - original texts
        # offsets - mapping of tokens to text 
        # labels: is token an identifier: [[0,0,0,0,1,1,0,0, ...,], [0,0,1,0,0,...]]
        self.texts = texts
        self.ids = ids
        self.offsets = offsets
        self.labels = labels
        self.masks = masks
    
    def __len__(self):
        return len(self.ids)
    
    def __getitem__(self, item):
        ids = self.ids[item]
        masks = self.masks[item]
        
        target_labels =self.labels[item]
        #pad if we need to
        if len(target_labels) < MAX_LEN:
            target_labels = np.pad(target_labels, (0,MAX_LEN-target_labels.size),'constant', constant_values=(0))
        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "masks": torch.tensor(masks, dtype=torch.long),
            "labels": torch.tensor(target_labels, dtype=torch.float32),
        }
    #for debugging
    def printItem(i):
        masked_doc_text=''
        for token, offset, label in zip(tokens, offsets, labels):
            if label == 1:
                #masked_doc_text.append("[MASK]")
                str="*" + texts[offset[0]:offset[1]] +"*"        
                masked_doc_text.append(str)
            else:
                masked_doc_text.append(texts[offset[0]:offset[1]])
        print(masked_doc_text)

def train_fn(data_loader, model, optimizer, device, scheduler):
    model.train()
    final_loss = 0
    for data in tqdm(data_loader, total=len(data_loader)):
        for k, v in data.items():
            data[k] = v.to(device)
        optimizer.zero_grad()
        _, loss = model(**data)
        loss.backward()
        optimizer.step()
        scheduler.step()
        final_loss += loss.item()
    return final_loss / len(data_loader)

def eval_fn(data_loader, model, device):
    model.eval()
    final_loss = 0
    for data in tqdm(data_loader, total=len(data_loader)):
        for k, v in data.items():
            data[k] = v.to(device)
        _, loss = model(**data)
        final_loss += loss.item()
    return final_loss / len(data_loader)

def loss_fn(out_logits, target, mask):
    lfn = nn.BCELoss()    
    #for BCE
    m = nn.Sigmoid()    
    loss = lfn(m(out_logits), target)
    return loss

class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, labels, attention_mask):
        #last_hidden_state.shape = [|b|,768,1024].  For deberta-small: [|b|,768,768]
        #attention_mask = [[1,1,1,1... 1,0,0,0]].  attention_mask.shape = [1,768].  For deberta-small = [768, 32]
        #desired output shape: [|b|, 768]
        #unsqueeze to add 1 dim, then duplicate in dimension to match the second hidden state dim (768)
        #tt_mask = attention_mask.unsqueeze(-1)
        #input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.shape[1]).float()  
        #sum the hidden state tensor along the 2 dimension (1024)
        
        #for small:
        sum_embeddings = torch.sum(last_hidden_state, 2)        
        sum_embeddings = torch.mul(sum_embeddings, attention_mask)        
        mean_embeddings = sum_embeddings / last_hidden_state.shape[2]
        return mean_embeddings

class EntityModel(nn.Module):
    def __init__(self):
        super(EntityModel, self).__init__()
        #full config
        #https://huggingface.co/docs/transformers/model_doc/deberta-v2
        self.config = AutoConfig.from_pretrained(BASE_MODEL, return_dict=True)
        self.m = AutoModel.from_pretrained(BASE_MODEL, config=self.config) 
        self.mpool = MeanPooling()
    
    def forward(self, ids, masks, labels):
        output = self.m(ids, attention_mask=masks)
        mpool = self.mpool(output.last_hidden_state, labels, masks)
        loss_labels = loss_fn(mpool, labels, masks)        
        return labels, loss_labels

# Function used to label data
def label_tokens(toks, offs, spans_to_mask):
    """Args: 
            toks - list of token id's
            offs - list of char offsets for each token
       Returns:
            label_list - 0 for non_mask, 1 for mask"""
    
    label_list = []
    mapping_list = []    
    # Map token_ids back to string    
    for token, pos in zip(toks, offs):
        mapping_list.append([token, pos[0], pos[1]])
    
    # Determine if each token should be masked
    spans_to_mask.sort(key=lambda tup: tup[0]) #order spans, ascending
    
    j=0
    for i in range(len(mapping_list)):
        temp_list = []
        stop=False        
        while not stop and j < len(spans_to_mask):            
            if (mapping_list[i][1] >= spans_to_mask[j][0]) and (mapping_list[i][2] <= spans_to_mask[j][1]):
                temp_list.append(1)
            else:
                temp_list.append(0)           

            # Since spans and mapping_list are ordered, break to allow it to catch up
            if(spans_to_mask[j][1] > mapping_list[i][2]):
                stop=True
            else:
                j = j+1
            
        if sum(temp_list) >= 1:
            label_list.append(1)
        else:
            label_list.append(0)
    return label_list  

def process_data(data_path, masks_path):
    with open(data_path) as file:
        file = json.load(file)

    with open(masks_path) as masks_file:
        train_masks = json.load(masks_file)

    text = []
    tokens = []
    offsets = []
    labels = []
    masks = []

    for i in range(len(file)):
        doc_id = file[i]["doc_id"]
        spans_to_mask = train_masks[doc_id]
        spans_to_mask = list({tuple(x) for x in spans_to_mask}) # Make spans unique
        doc_text = file[i]["text"]
        tok_tensor = TOKENIZER.encode_plus(
            doc_text,
            add_special_tokens=True,            
            max_length=MAX_LEN,
            truncation=True,                #Truncate at MAX_LEN for now.  Can try setting MAX_LEN to the longest text.
            padding='max_length',
            return_tensors='pt',            #pytorch tensors
            return_offsets_mapping = True
        )
        
        #TOKENIZER(doc_text, return_tensors="tf", truncation=True, padding=True, return_offsets_mapping=True)
        doc_tokens = tok_tensor["input_ids"].numpy()[0]
        doc_offsets = tok_tensor["offset_mapping"].numpy()[0]
        masks_ = tok_tensor["attention_mask"].numpy()[0]
       
        labels.append(label_tokens(doc_tokens, doc_offsets, spans_to_mask))
        masks.append(masks_)
        tokens.append(doc_tokens)
        offsets.append(doc_offsets)
        text.append(doc_text)
  
    return text, tokens, labels, offsets, masks


if __name__ == "__main__":
    
    testPredictRun=True

    if testPredictRun:
        single_doc = """
        The case originated in an application (no. 40593/04) against the Republic of Turkey lodged with the Court under Article 34 of the Convention for the Protection of Human Rights and Fundamental Freedoms (“the Convention”) by a Turkish national, Mr Cengiz Polat (“the applicant”), on 15 October 2004.
        """
        tok_tensor = TOKENIZER.encode_plus(
            single_doc,
            add_special_tokens=True,            
            max_length=MAX_LEN,
            truncation=True,                #Truncate at MAX_LEN for now.  Can try setting MAX_LEN to the longest text.
            padding='max_length',
            return_tensors='pt',
            return_offsets_mapping = True
        )
        ids = tok_tensor["input_ids"].numpy()[0]
        #ids = tok_tensor.input_ids.flatten()
        #mask = tok_tensor.attention_mask

        test_dataset = EntityDataset(
            texts=[single_doc], 
            ids=[ids],        
            labels = [np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])],
            offsets = [tok_tensor["offset_mapping"].numpy()[0]],
            masks = [tok_tensor["attention_mask"].numpy()[0]]
        )

        device =  torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = EntityModel()
        #model.load_state_dict(torch.load(MODEL_PATH))
        #model.load_state_dict(torch.load(model))
        model.to(device)

        with torch.no_grad():
            data = test_dataset[0]
            for k,v in data.items():
                data[k] = v.to(device).unsqueeze(0)
            #ids_test = data.get('ids')
            #ids_size = ids_test.size()
            
            _, loss = model(**data)
            #TODO: work out the loss function and evaluation here
            #labels, loss_labels = model(**data)

    texts, tokens, labels, offsets, masks = process_data(TRAINING_FILE, TRAIN_MASKS_FILE)

    #Split train into train and test.  0.9/0.1 split
    (
        train_texts,
        test_texts,
        train_tokens,
        test_tokens,
        train_labels,
        test_labels,
        train_offsets,
        test_offsets,
        train_masks,
        test_masks
    ) = model_selection.train_test_split(texts, tokens, labels, offsets, masks, random_state=42, test_size=0.1)

    train_dataset = EntityDataset(
        texts=train_texts, ids=train_tokens, labels=train_labels, offsets=train_offsets, masks=train_masks
    )
    test_dataset = EntityDataset(
        texts=test_texts, ids=test_tokens, labels=test_labels, offsets=test_offsets, masks=test_masks
    )

    texts, tokens, labels, offsets, masks = process_data(DEV_FILE, DEV_MASKS_FILE)
    dev_dataset = EntityDataset(
        texts=texts, ids=tokens, labels=labels, offsets=offsets, masks=masks
    )
    dev_data_loader = torch.utils.data.DataLoader(
        dev_dataset, batch_size=TRAIN_BATCH_SIZE, num_workers=4
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=TRAIN_BATCH_SIZE, num_workers=4
    )

    texts, tokens, labels, offsets, masks = process_data(TEST_FILE, TEST_MASKS_FILE)
    valid_dataset = EntityDataset(
        texts=texts, ids=tokens, labels=labels, offsets=offsets, masks=masks
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=VALID_BATCH_SIZE, num_workers=1
    )

    device =  torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    model = EntityModel()
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    num_train_steps = int(len(train_texts) / TRAIN_BATCH_SIZE * EPOCHS)
    #num_train_steps = int(len(dev_dataset.texts)) / TRAIN_BATCH_SIZE * EPOCHS
    optimizer = AdamW(optimizer_parameters, lr=3e-4)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
    )

    best_loss = np.inf
    for epoch in range(EPOCHS):
        train_loss = train_fn(train_data_loader, model, optimizer, device, scheduler)
        test_loss = eval_fn(valid_data_loader, model, device)
        print(f"Train Loss = {train_loss} Valid Loss = {test_loss}")
        if test_loss < best_loss:
            torch.save(model.state_dict(), MODEL_PATH)
            best_loss = test_loss

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/580 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading:   0%|          | 0.00/874M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.classifier.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the 

Train Loss = 0.6605746185309008 Valid Loss = 0.6514885742217302


100%|██████████| 152/152 [02:46<00:00,  1.10s/it]
100%|██████████| 32/32 [00:08<00:00,  3.63it/s]


Train Loss = 0.6371891839723838 Valid Loss = 0.6327233463525772


100%|██████████| 152/152 [02:46<00:00,  1.10s/it]
100%|██████████| 32/32 [00:08<00:00,  3.64it/s]


Train Loss = 0.6188475485695036 Valid Loss = 0.6184502393007278


100%|██████████| 152/152 [02:46<00:00,  1.10s/it]
100%|██████████| 32/32 [00:08<00:00,  3.57it/s]


Train Loss = 0.6055392732745722 Valid Loss = 0.6086983643472195


100%|██████████| 152/152 [02:46<00:00,  1.10s/it]
100%|██████████| 32/32 [00:08<00:00,  3.66it/s]


Train Loss = 0.5969837649088157 Valid Loss = 0.6030509918928146


100%|██████████| 152/152 [02:46<00:00,  1.10s/it]
100%|██████████| 32/32 [00:08<00:00,  3.64it/s]


Train Loss = 0.5928035903918115 Valid Loss = 0.6011965349316597


In [5]:
import gc

gc.collect()
torch.cuda.empty_cache()
torch.cuda.memory_summary(device=None, abbreviated=False)



In [6]:
EVAL_DATA_NAME = "WIKI_SUMMARIES"

In [7]:
MAX_LEN_W = 768 # 730 is Longest sequence in wiki dataset
# Load wiki data

with open("./data/raw/wiki-summaries/annotated_wikipedia.json") as file:
    wiki_file = json.load(file)

with open("./data/processed/wiki_masks.json") as file:
    wiki_masks = json.load(file)

In [8]:
# Create labels

wiki_text = []
wiki_labels = []

for i in range(len(wiki_file)):
    doc_id = wiki_file[i]["doc_id"]
    spans_to_mask = wiki_masks[doc_id]
    spans_to_mask = list({tuple(x) for x in spans_to_mask}) # Make spans unique
    doc_text = wiki_file[i]["text"]
    #tok_tensor = tokenizer(doc_text, return_tensors="tf", truncation=True, padding=True, return_offsets_mapping=True)
    tok_tensor = TOKENIZER.encode_plus(
            doc_text,
            add_special_tokens=True,            
            max_length=MAX_LEN_W,
            truncation=True,                #Truncate at MAX_LEN for now.  Can try setting MAX_LEN to the longest text.
            padding='max_length',
            return_tensors='pt',
            return_offsets_mapping = True
        )
    tokens = tok_tensor["input_ids"].numpy()[0]
    #tokens = tok_tensor["input_ids"].numpy()[0]
    offsets = tok_tensor["offset_mapping"].numpy()[0]
    wiki_text.append(doc_text)
    wiki_labels.append(label_tokens(tokens, offsets, spans_to_mask))

In [9]:
# Pad labels to max length
def pad(arr):
  for i in range(len(arr)):
      curr_len = len(arr[i])
      
      if curr_len < MAX_LEN_W:
          to_add = [0] * (MAX_LEN_W - curr_len)
          arr[i].extend(to_add)
          
  arr = np.asarray(arr)
  return arr

In [10]:
# Tokenize input
wiki_text_tokenized = TOKENIZER(wiki_text, truncation=True, max_length=768, padding=True, return_tensors="pt")

In [11]:
device =  torch.device("cuda" if torch.cuda.is_available() else "cpu")
loaded_model = EntityModel()
loaded_model = loaded_model.to(device)
loaded_model.load_state_dict(torch.load(MODEL_PATH))
loaded_model.eval()

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.classifier.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


EntityModel(
  (m): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 1024, padding_idx=0)
      (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0): DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (key_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (value_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_affine=True)
              (drop

In [12]:
wiki_labels = pad(wiki_labels)
ids_a= wiki_text_tokenized["input_ids"]
ids_a = F.pad(ids_a, (0,MAX_LEN - ids_a.shape[-1]))
labels_a = torch.tensor(wiki_labels, dtype=torch.float32)
zero_masks = torch.zeros_like(ids_a)

In [13]:


zero_masks = zero_masks.to(device)
ids_a = ids_a.to(device)
labels_a = labels_a.to(device)
#wiki_text_tokenized["input_ids"].to(device)
#wiki_labels.to(device)
predictions=[] #np.zeros(ids_a.shape)

#m = nn.Sigmoid()
with torch.no_grad():
  batch_size=20
  for i in range(0, ids_a.shape[0], batch_size):
      num_in_batch = min(i+batch_size, ids_a.shape[0])
      indices = range(i, num_in_batch)
      ids_batch = ids_a[indices]
      pred_logits, _ = loaded_model(ids= ids_batch, labels=labels_a[indices], masks=zero_masks[indices])
      pred_logits = pred_logits.to("cpu", dtype=torch.float)
      #p_batch = (m(pred_logits))
      #res = p_batch.tolist()
      predictions.append(pred_logits.tolist()[0])
      #loss = loss_fn(m(out_logits), labels_a[indices])
      #print (loss)



In [14]:
np.savetxt("./" + MODEL_PATH + "_" + EVAL_DATA_NAME + "_preds.txt", predictions)

In [15]:
def calc_precision(pred_list, label_list):
    """Calculates precision of batch of predictions"""
    
    tp = 0
    fp = 0
    
    for i in range(len(pred_list)):
        for j in range(len(pred_list[i])):
        
            if pred_list[i][j] == 1:
                if label_list[i][j] == 1:
                    tp += 1
                else:
                    fp += 1
            else:
                continue
                
    return tp / (tp + fp)

In [16]:
def calc_recall(pred_list, label_list):
    """Calculates recall of batch of predictions"""
    
    tp = 0 
    fn = 0
    
    for i in range(len(pred_list)):
        for j in range(len(pred_list[i])):
            
            if pred_list[i][j] == 1:
                if label_list[i][j] == 1:
                    tp += 1
                else:
                    tp += 0
                
            else:
                if label_list[i][j] == 1:
                    fn += 1
                else:
                    fn += 0
    
    return tp / (tp + fn)

In [17]:
precision = calc_precision(predictions, wiki_labels)
print (f' Token level precision: {precision}')
recall = calc_recall(predictions, wiki_labels)
print (f' Token level recall: {recall}')

 Token level precision: 0.3328358208955224
 Token level recall: 0.24505494505494504


In [18]:
# Multilabel indicators are not supported in sklearn for AUC
# Loop through preds and take avg of AUC

auc = []

for i in range(len(predictions)):
    
    fpr, tpr, thresholds = metrics.roc_curve(wiki_labels[i], predictions[i], pos_label=1)
    auc.append(metrics.auc(fpr, tpr))

auc = sum(auc)/len(auc)
print (f' Average AUC: {auc}')

 Average AUC: 0.6510879379084192
