In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import random
import pickle
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

from eval.evaluate import predict

from nets import deepEM
from loader.prepData import prepdata
from loader.prepNN import prep4nn
from utils import utils
from torch.profiler import profile, record_function, ProfilerActivity


2022-11-21 09:01:45.478 | INFO     | bert.modeling:<module>:231 - Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


In [3]:

def read_test_data(test_data, params):
    test = prep4nn.data2network(test_data, 'predict', params)

    if len(test) == 0:
        raise ValueError("Test set empty.")

    test_data = prep4nn.torch_data_2_network(
        cdata2network=test, params=params, do_get_nn_data=True)
    te_data_size = len(test_data['nn_data']['ids'])

    test_data_ids = TensorDataset(torch.arange(te_data_size))
    test_sampler = SequentialSampler(test_data_ids)
    test_dataloader = DataLoader(
        test_data_ids, sampler=test_sampler, batch_size=params['batchsize'])
    return test_data, test_dataloader


In [4]:
# inp_args = utils._parsing_jupyter()
config_path = '/home/julio/repos/event_finder/DeepEventMine_fork/experiments/pubmed100/configs/predict-pubmed-100.yaml'

# set config path manually
# config_path = 'configs/debug.yaml'

with open(config_path, 'r') as stream:
    pred_params = utils._ordered_load(stream)

# Fix seed for reproducibility
os.environ["PYTHONHASHSEED"] = str(pred_params['seed'])
random.seed(pred_params['seed'])
np.random.seed(pred_params['seed'])
torch.manual_seed(pred_params['seed'])

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Load pre-trained parameters
with open(pred_params['saved_params'], "rb") as f:
    parameters = pickle.load(f)

parameters['predict'] = True

# Set predict settings value for params
parameters['gpu'] = pred_params['gpu']
parameters['batchsize'] = pred_params['batchsize']
print('GPU available:', torch.cuda.is_available())
if parameters['gpu'] >= 0:
    device = torch.device(
        "cuda:" + str(parameters['gpu']) if torch.cuda.is_available() else "cpu")
    # torch.cuda.set_device(parameters['gpu'])
else:
    device = torch.device("cpu")
parameters['device'] = device

# Set evaluation settings
parameters['test_data'] = pred_params['test_data']

parameters['bert_model'] = pred_params['bert_model']

result_dir = pred_params['result_dir']
if not os.path.exists(result_dir):
    os.makedirs(result_dir)

parameters['result_dir'] = pred_params['result_dir']

# raw text
parameters['raw_text'] = pred_params['raw_text']
parameters['ner_predict_all'] = pred_params['raw_text']
parameters['a2_entities'] = pred_params['a2_entities']
parameters['json_file'] = pred_params['json_file']

# process data
test_data = prepdata.prep_input_data(
    pred_params['test_data'], parameters, json_file=parameters['json_file'])
nntest_data, test_dataloader = read_test_data(test_data, parameters)

# model
deepee_model = deepEM.DeepEM(parameters)

model_path = pred_params['model_path']

# Load all models
utils.handle_checkpoints(model=deepee_model,
                            checkpoint_dir=model_path,
                            params={
                                'device': device
                            },
                            resume=True)

deepee_model.to(device)

# with profile(activities=[
#         ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True,with_stack=True) as prof:
#     with record_function("model_inference"):


2022-11-21 09:02:38.297 | INFO     | bert.tokenization:from_pretrained:171 - loading vocabulary file data/bert/scibert_scivocab_cased/vocab.txt


GPU available: True
	Words found in train: 10876
	Words found in pre-trained only: 0
	Words not found anywhere: 2083


2022-11-21 09:02:38.986 | INFO     | bert.tokenization:from_pretrained:171 - loading vocabulary file data/bert/scibert_scivocab_cased/vocab.txt
2022-11-21 09:02:49.155 | INFO     | bert.modeling:from_pretrained:577 - loading archive file data/bert/scibert_scivocab_cased
2022-11-21 09:02:49.158 | INFO     | bert.modeling:from_pretrained:595 - Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 31116
}

2022-11-21 09:02:51.003 | INFO     | bert.modeling:from_pretrained:645 - Weights of NestedNERModel not initialized from pretrained model: ['label_ids', 'entity_classifier.weight', 'entity_classifier.bias', 'trigger_classifier.weight', 'trigger_classifier.bias']
2022-11-21 09:02:51.004 | INFO     | bert.modeling:from_pretrained:

data/models/cg/model/20190911030703702499_deepee_base_92_59.49.pt
Loading model from checkpoint data/models/cg/model/


DeepEM(
  (NER_layer): NestedNERModel(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(31116, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): BertLayerNorm()
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (LayerNorm): BertLayerNorm()
        

In [5]:
%load_ext line_profiler


In [6]:
%lprun -f predict predict(model=deepee_model,result_dir=result_dir, eval_dataloader=test_dataloader,eval_data=nntest_data,g_entity_ids_=test_data['g_entity_ids_'],params=parameters)


  flattened_embedding_indices = torch.arange(
  flattened_sentence_indices = sentence_indices.flatten().masked_select(


NER LOOP: --- 0.37952613830566406 seconds ---
NER LAYER: --- 0.5120458602905273 seconds ---
REL LAYER: --- 0.022736072540283203 seconds ---
EV LAYER: --- 0.05231738090515137 seconds ---
ALL FOWARD LAYER: --- 0.7793147563934326 seconds ---


Iteration:  20%|██        | 1/5 [00:01<00:06,  1.73s/it]

PREDICT LOOP: --- 1.7276966571807861 seconds ---
NER LOOP: --- 0.8099792003631592 seconds ---
NER LAYER: --- 0.9124910831451416 seconds ---
REL LAYER: --- 0.022560834884643555 seconds ---
EV LAYER: --- 0.13768768310546875 seconds ---
ALL FOWARD LAYER: --- 1.3659417629241943 seconds ---


Iteration:  40%|████      | 2/5 [00:04<00:06,  2.31s/it]

PREDICT LOOP: --- 4.445741415023804 seconds ---
NER LOOP: --- 0.6077940464019775 seconds ---
NER LAYER: --- 0.6996157169342041 seconds ---
REL LAYER: --- 0.019855737686157227 seconds ---
EV LAYER: --- 0.11309576034545898 seconds ---
ALL FOWARD LAYER: --- 1.1152832508087158 seconds ---


Iteration:  60%|██████    | 3/5 [00:06<00:04,  2.32s/it]

PREDICT LOOP: --- 6.772505521774292 seconds ---
NER LOOP: --- 0.5288283824920654 seconds ---
NER LAYER: --- 0.6218419075012207 seconds ---
REL LAYER: --- 0.0190579891204834 seconds ---
EV LAYER: --- 0.09977316856384277 seconds ---
ALL FOWARD LAYER: --- 0.9954302310943604 seconds ---


                                                        

PREDICT LOOP: --- 8.90893292427063 seconds ---
NER LOOP: --- 0.021910429000854492 seconds ---
NER LAYER: --- 0.0431976318359375 seconds ---
REL LAYER: --- 0.002622365951538086 seconds ---
EV LAYER: --- 0.006018161773681641 seconds ---
ALL FOWARD LAYER: --- 0.06672978401184082 seconds ---
PREDICT LOOP: --- 9.01675033569336 seconds ---




(FILE writing): --- 7.4954822063446045 seconds ---


Timer unit: 1e-09 s

Total time: 16.1899 s
File: /home/julio/repos/event_finder/DeepEventMine_fork/eval/evaluate.py
Function: predict at line 9

Line #      Hits         Time  Per Hit   % Time  Line Contents
     9                                           def predict(model, result_dir, eval_dataloader, eval_data, g_entity_ids_, params):
    10         1       4240.0   4240.0      0.0      mapping_id_tag = params['mappings']['nn_mapping']['id_tag_mapping']
    11                                           
    12                                               # store predicted entities
    13         1        240.0    240.0      0.0      ent_preds = []
    14                                           
    15                                               # store predicted events
    16         1        160.0    160.0      0.0      ev_preds = []
    17                                           
    18         1        550.0    550.0      0.0      fidss, wordss, offsetss, sub_to_wordss, spa

In [25]:
%lprun -f predict predict(model=deepee_model, result_dir=result_dir, eval_dataloader=test_dataloader, eval_data=nntest_data, g_entity_ids_=test_data['g_entity_ids_'], params=parameters)


Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

NER LOOP: --- 4.088160753250122 seconds ---
NER LAYER: --- 4.168576955795288 seconds ---
REL LAYER: --- 0.01534891128540039 seconds ---
EV LAYER: --- 0.04376649856567383 seconds ---
ALL FOWARD LAYER: --- 4.417277812957764 seconds ---


Iteration:  20%|██        | 1/5 [00:07<00:28,  7.06s/it]

PREDICT LOOP: --- 7.059613227844238 seconds ---
NER LOOP: --- 6.67397665977478 seconds ---
NER LAYER: --- 6.782450199127197 seconds ---
REL LAYER: --- 0.020621299743652344 seconds ---
EV LAYER: --- 0.1400163173675537 seconds ---
ALL FOWARD LAYER: --- 7.2468602657318115 seconds ---


Iteration:  40%|████      | 2/5 [00:18<00:28,  9.61s/it]

PREDICT LOOP: --- 18.449894428253174 seconds ---
NER LOOP: --- 6.6860737800598145 seconds ---
NER LAYER: --- 6.782872915267944 seconds ---
REL LAYER: --- 0.018086910247802734 seconds ---
EV LAYER: --- 0.11701488494873047 seconds ---
ALL FOWARD LAYER: --- 7.209285259246826 seconds ---


Iteration:  60%|██████    | 3/5 [00:29<00:20, 10.33s/it]

PREDICT LOOP: --- 29.643948793411255 seconds ---
NER LOOP: --- 5.759660959243774 seconds ---
NER LAYER: --- 5.8500213623046875 seconds ---
REL LAYER: --- 0.017380952835083008 seconds ---
EV LAYER: --- 0.09950733184814453 seconds ---
ALL FOWARD LAYER: --- 6.2324864864349365 seconds ---


Iteration:  80%|████████  | 4/5 [00:39<00:10, 10.12s/it]

PREDICT LOOP: --- 39.43154978752136 seconds ---


                                                        

NER LOOP: --- 0.2331676483154297 seconds ---
NER LAYER: --- 0.24660420417785645 seconds ---
REL LAYER: --- 0.01309823989868164 seconds ---
EV LAYER: --- 0.009708166122436523 seconds ---
ALL FOWARD LAYER: --- 0.2849557399749756 seconds ---
PREDICT LOOP: --- 39.852705001831055 seconds ---




(FILE writing): --- 7.950889587402344 seconds ---


Timer unit: 1e-09 s

Total time: 47.3856 s
File: /home/julio/repos/event_finder/DeepEventMine_fork/eval/evaluate.py
Function: predict at line 9

Line #      Hits         Time  Per Hit   % Time  Line Contents
     9                                           def predict(model, result_dir, eval_dataloader, eval_data, g_entity_ids_, params):
    10         1       2630.0   2630.0      0.0      mapping_id_tag = params['mappings']['nn_mapping']['id_tag_mapping']
    11                                           
    12                                               # store predicted entities
    13         1        220.0    220.0      0.0      ent_preds = []
    14                                           
    15                                               # store predicted events
    16         1        170.0    170.0      0.0      ev_preds = []
    17                                           
    18         1        430.0    430.0      0.0      fidss, wordss, offsetss, sub_to_wordss, spa

In [7]:
import pickle

In [11]:
with open('all_preds.pickle','rb') as handle:
    all_preds = pickle.load(handle)

In [18]:
all_preds[365:370]

array([[ 0,  0],
       [ 0,  0],
       [12,  0],
       [ 0,  0],
       [ 0,  0]], dtype=uint8)

In [13]:
with open('trigger_indices.pickle', 'rb') as handle:
    trigger_indices = pickle.load(handle)


In [17]:
trigger_indices[:10]

[367, 660, 881, 1500, 1573, 3960, 7851, 7852, 7865, 8768]

In [107]:
len(trigger_indices)

61

In [19]:
import torch

In [20]:
preds = torch.from_numpy(all_preds)

In [56]:
preds.shape

torch.Size([28845, 2])

In [25]:
(preds == 1).nonzero(as_tuple = False)


tensor([[20942,     0],
        [21641,     0],
        [22111,     0],
        [22441,     0]])

In [98]:
indx = torch.tensor(range(1,41))

In [99]:
indx

tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
        37, 38, 39, 40])

In [100]:
# index2= torch.tensor([[0],[0]])
index0= torch.zeros([preds.shape[0],1],dtype=int)
index1= torch.ones([preds.shape[0],1],dtype=int)

In [95]:
preds.gather(1,index0)

tensor([[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]], dtype=torch.uint8)

In [101]:
trigers0 = (preds.gather(1, index0) == indx).nonzero(as_tuple=False)
trigers1 = (preds.gather(1, index1) == indx).nonzero(as_tuple=False)


In [108]:
trigers1

tensor([[27172,     8]])

In [109]:
trigers0

tensor([[  367,    11],
        [  660,    22],
        [  881,    22],
        [ 1500,    11],
        [ 1573,    11],
        [ 3960,    11],
        [ 7851,     9],
        [ 7852,     8],
        [ 7865,     8],
        [ 8768,     9],
        [ 9454,     2],
        [11612,    11],
        [13615,     2],
        [16976,    11],
        [17512,    11],
        [18313,    15],
        [19750,    11],
        [19813,    13],
        [20336,    11],
        [20897,    22],
        [20942,     0],
        [20970,     2],
        [21614,     8],
        [21641,     0],
        [22111,     0],
        [22125,     8],
        [22377,     4],
        [22441,     0],
        [22460,     8],
        [22566,     4],
        [22720,     4],
        [22776,     2],
        [22957,    23],
        [23021,     8],
        [23280,     5],
        [23350,     4],
        [23448,     8],
        [23588,     4],
        [23644,     4],
        [23713,    11],
        [23791,     4],
        [23847, 

In [117]:

trigers0= trigers0.gather(1,
                torch.zeros([trigers0.shape[0], 1], dtype=int)
                ).squeeze(1).tolist()


In [118]:
trigers1 = trigers1.gather(1,
                torch.zeros([trigers1.shape[0], 1], dtype=int)
                ).squeeze(1).tolist()


In [119]:
trigers = list(set(trigers0 + trigers1))

In [120]:
trigers

[27172,
 18313,
 21641,
 25870,
 26255,
 26258,
 660,
 23448,
 24603,
 24860,
 24863,
 22776,
 20897,
 23713,
 26144,
 23588,
 1573,
 19750,
 22566,
 23847,
 22441,
 24870,
 7851,
 7852,
 22957,
 25773,
 13615,
 25392,
 26284,
 27571,
 23350,
 7865,
 22460,
 8768,
 22720,
 28839,
 20942,
 25423,
 16976,
 27599,
 28506,
 11612,
 1500,
 23644,
 22111,
 24414,
 24676,
 19813,
 17512,
 22377,
 20970,
 22125,
 9454,
 367,
 20336,
 881,
 21614,
 23021,
 23280,
 23791,
 3960]