In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
%load_ext line_profiler

In [None]:
# %load_ext memory_profiler


In [19]:
import os
import sys
import random
import pickle
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

from eval.evaluate import predict

from nets import deepEM
from loader.prepData import prepdata
from loader.prepNN import prep4nn
from loader.prepNN.prep4nn import torch_data_2_network
from utils import utils
from torch.profiler import profile, record_function, ProfilerActivity


In [3]:
def _get_size_bytes(obj, seen):
    """Return the deep size of ``obj`` in bytes.

    ``seen`` is a set of ``id()`` values of objects that were already counted;
    it is updated in place so that shared or self-referential objects are only
    counted once.
    """
    obj_id = id(obj)
    if obj_id in seen:
        return 0
    # Important: mark as seen *before* entering recursion to gracefully handle
    # self-referential objects.
    seen.add(obj_id)

    size = sys.getsizeof(obj)
    if isinstance(obj, dict):
        size += sum(_get_size_bytes(v, seen) for v in obj.values())
        size += sum(_get_size_bytes(k, seen) for k in obj.keys())
    elif hasattr(obj, '__dict__'):
        size += _get_size_bytes(obj.__dict__, seen)
    elif hasattr(obj, '__iter__') and not isinstance(obj, (str, bytes, bytearray)):
        size += sum(_get_size_bytes(item, seen) for item in obj)
    return size


def get_size(obj, seen=None):
    """Recursively find the size of ``obj`` and return it in MiB.

    The size is accumulated in bytes with ``sys.getsizeof`` over dict keys and
    values, instance ``__dict__`` attributes and the items of non-string
    iterables, and is converted to MiB exactly once at the end. (Dividing at
    every recursion level would mix MiB-valued child sizes into the parent's
    byte count.)

    Args:
        obj: The object to measure.
        seen: Optional set of ``id()`` values that were already counted.
            Objects whose id is in this set contribute 0. The set is updated
            in place.

    Returns:
        float: The deep size of ``obj`` in MiB (bytes / 1024**2).
    """
    if seen is None:
        seen = set()
    return _get_size_bytes(obj, seen) / (1024 * 1024)


In [17]:

def read_test_data(test_data, params):
    """Turn prepared test data into network inputs and a sequential DataLoader.

    Args:
        test_data: Prepared input data, passed to ``prep4nn.data2network``
            together with the mode string ``'predict'``.
        params: Parameter dict; ``params['batchsize']`` is read here and the
            whole dict is forwarded to the ``prep4nn`` helpers.

    Returns:
        tuple: ``(nn_data, dataloader)`` where ``nn_data`` is the result of
        ``prep4nn.torch_data_2_network`` and ``dataloader`` iterates, in order,
        over the integer indices ``0 .. len(nn_data['nn_data']['ids']) - 1``.

    Raises:
        ValueError: If ``prep4nn.data2network`` returns an empty result.
    """
    network_input = prep4nn.data2network(test_data, 'predict', params)
    if not len(network_input):
        raise ValueError("Test set empty.")

    # VERY slow: the profile recorded in this notebook attributes ~93.5% of
    # this function's runtime to the call below.
    nn_data = prep4nn.torch_data_2_network(
        cdata2network=network_input,
        params=params,
        do_get_nn_data=True,
    )

    # The loader only yields sample indices, not the samples themselves.
    n_samples = len(nn_data['nn_data']['ids'])
    index_dataset = TensorDataset(torch.arange(n_samples))
    index_loader = DataLoader(
        index_dataset,
        sampler=SequentialSampler(index_dataset),
        batch_size=params['batchsize'],
    )
    return nn_data, index_loader


In [18]:

# Prediction setup: read the config, fix the seeds, load the saved parameters,
# override them with the prediction settings and prepare the test data.

# inp_args = utils._parsing_jupyter()
# NOTE(review): absolute, user-specific path; adjust it when running on
# another machine.
config_path = '/home/julio/repos/event_finder/DeepEventMine_fork/experiments/pubmed100/configs/predict-pubmed-100.yaml'

# set config path manually
# config_path = 'configs/debug.yaml'

# Parse the config file with the project's loader.
with open(config_path, 'r') as stream:
    pred_params = utils._ordered_load(stream)

# Fix seed for reproducibility
os.environ["PYTHONHASHSEED"] = str(pred_params['seed'])
random.seed(pred_params['seed'])
np.random.seed(pred_params['seed'])
torch.manual_seed(pred_params['seed'])

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Load pre-trained parameters
# SECURITY NOTE: pickle.load can execute arbitrary code; only point
# 'saved_params' at files from a trusted source.
with open(pred_params['saved_params'], "rb") as f:
    parameters = pickle.load(f)

parameters['predict'] = True

# Set predict settings value for params
parameters['gpu'] = pred_params['gpu']
parameters['batchsize'] = pred_params['batchsize']
# print('GPU available:', torch.cuda.is_available())
# A non-negative 'gpu' selects that CUDA device when CUDA is available;
# otherwise (or for a negative value) everything runs on the CPU.
if parameters['gpu'] >= 0:
    device = torch.device(
        "cuda:" + str(parameters['gpu']) if torch.cuda.is_available() else "cpu")
    # torch.cuda.set_device(parameters['gpu'])
else:
    device = torch.device("cpu")
parameters['device'] = device

# Set evaluation settings
parameters['test_data'] = pred_params['test_data']

parameters['bert_model'] = pred_params['bert_model']

result_dir = pred_params['result_dir']
# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(result_dir, exist_ok=True)

parameters['result_dir'] = pred_params['result_dir']

# raw text
parameters['raw_text'] = pred_params['raw_text']
# NOTE(review): 'ner_predict_all' is copied from pred_params['raw_text'];
# confirm this is intended rather than a dedicated config key.
parameters['ner_predict_all'] = pred_params['raw_text']
parameters['a2_entities'] = pred_params['a2_entities']
parameters['json_file'] = pred_params['json_file']

# process data
test_data = prepdata.prep_input_data(
    pred_params['test_data'], parameters, json_file=parameters['json_file'])
# nntest_data, test_dataloader = read_test_data(test_data, parameters)
test = prep4nn.data2network(test_data, 'predict', parameters)

# Fail early instead of profiling / predicting on nothing.
if len(test) == 0:
    raise ValueError("Test set empty.")


2022-11-24 09:17:07.924 | INFO     | bert.tokenization:from_pretrained:171 - loading vocabulary file data/bert/scibert_scivocab_cased/vocab.txt


	Words found in train: 10876
	Words found in pre-trained only: 0
	Words not found anywhere: 2083


In [20]:

#VERY slow
# nntest_data = prep4nn.torch_data_2_network(
#     cdata2network=test, params=parameters, do_get_nn_data=True)
%lprun -f torch_data_2_network torch_data_2_network(cdata2network=test, params=parameters, do_get_nn_data=True)


2022-11-24 09:19:33.159 | INFO     | bert.tokenization:from_pretrained:171 - loading vocabulary file data/bert/scibert_scivocab_cased/vocab.txt


Timer unit: 1e-09 s

Total time: 14.4106 s
File: /home/julio/repos/event_finder/DeepEventMine_fork/loader/prepNN/prep4nn.py
Function: torch_data_2_network at line 69

Line #      Hits         Time  Per Hit   % Time  Line Contents
    69                                           def torch_data_2_network(cdata2network, params, do_get_nn_data):
    70                                               """ Convert object-type data to torch.tensor type data, aim to use with Pytorch
    71                                               """
    72         1     295199.0 295199.0      0.0      etypes = [data['etypes2'] for data in cdata2network]
    73                                           
    74                                               # nner
    75         1      97029.0  97029.0      0.0      entitiess = [data['entities'] for data in cdata2network]
    76         1     119470.0 119470.0      0.0      sw_sentences = [data['sw_sentence'] for data in cdata2network]
    77         1      64

In [None]:

# Build a DataLoader over sample indices for prediction.
# Requires nntest_data (the output of torch_data_2_network) and parameters to
# already exist in the kernel namespace.

# Number of test samples, taken from the length of the 'ids' entry.
te_data_size = len(nntest_data['nn_data']['ids'])

# The dataset holds only the integer indices 0 .. te_data_size - 1.
test_data_ids = TensorDataset(torch.arange(te_data_size))

# Sequential sampling keeps the indices in their original order.
test_sampler = SequentialSampler(test_data_ids)
test_dataloader = DataLoader(
    test_data_ids, sampler=test_sampler, batch_size=parameters['batchsize'])


In [None]:
# %lprun -f datastuff datastuff()

In [8]:


# from predict_no_files import read_test_data as read_test_data2
# %mprun -f read_test_data2 read_test_data2(test_data, parameters)

# %mprun -f read_test_data2 read_test_data2(test_data, parameters)


In [16]:
# %lprun -f read_test_data read_test_data(test_data, parameters)
# nntest_data, test_dataloader = read_test_data(test_data, parameters)


2022-11-24 09:12:24.902 | INFO     | bert.tokenization:from_pretrained:171 - loading vocabulary file data/bert/scibert_scivocab_cased/vocab.txt


	Words found in train: 10876
	Words found in pre-trained only: 0
	Words not found anywhere: 2083


2022-11-24 09:12:25.903 | INFO     | bert.tokenization:from_pretrained:171 - loading vocabulary file data/bert/scibert_scivocab_cased/vocab.txt


Timer unit: 1e-09 s

Total time: 15.5915 s
File: /tmp/ipykernel_31242/2669338896.py
Function: read_test_data at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def read_test_data(test_data, params):
     2                                           
     3         1 1015019737.0 1015019737.0      6.5      test = prep4nn.data2network(test_data, 'predict', params)
     4                                           
     5         1       1950.0   1950.0      0.0      if len(test) == 0:
     6                                                   raise ValueError("Test set empty.")
     7                                           
     8         1 14576032129.0 14576032129.0     93.5      test_data = prep4nn.torch_data_2_network(
     9         1        260.0    260.0      0.0          cdata2network=test, params=params, do_get_nn_data=True)
    10                                           
    11         1       8430.0   843

In [None]:

# model
deepee_model = deepEM.DeepEM(parameters)

model_path = pred_params['model_path']

# Load all models
utils.handle_checkpoints(model=deepee_model,
                            checkpoint_dir=model_path,
                            params={
                                'device': device
                            },
                            resume=True)

deepee_model.to(device)

# with profile(activities=[
#         ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True,with_stack=True) as prof:
#     with record_function("model_inference"):


In [5]:
%load_ext line_profiler


In [8]:
# Line-profile predict on the test set. deepee_model, test_dataloader,
# nntest_data, test_data and parameters must already exist in the kernel.
%lprun -f predict predict(model=deepee_model,result_dir=result_dir, eval_dataloader=test_dataloader,eval_data=nntest_data,g_entity_ids_=test_data['g_entity_ids_'],params=parameters)


Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

NER LOOP: --- 0.0008637905120849609 seconds ---
NER LAYER: --- 0.09203028678894043 seconds ---
REL LAYER: --- 0.01859426498413086 seconds ---
EV LAYER: --- 0.04557323455810547 seconds ---
ALL FOWARD LAYER: --- 0.32856082916259766 seconds ---


Iteration:  20%|██        | 1/5 [00:01<00:04,  1.24s/it]

PREDICT LOOP: --- 1.2396204471588135 seconds ---
NER LOOP: --- 0.0009527206420898438 seconds ---
NER LAYER: --- 0.10408425331115723 seconds ---
REL LAYER: --- 0.022671937942504883 seconds ---
EV LAYER: --- 0.14000391960144043 seconds ---
ALL FOWARD LAYER: --- 0.552802562713623 seconds ---


Iteration:  40%|████      | 2/5 [00:03<00:04,  1.60s/it]

PREDICT LOOP: --- 1.8513519763946533 seconds ---
NER LOOP: --- 0.0010538101196289062 seconds ---
NER LAYER: --- 0.10121560096740723 seconds ---
REL LAYER: --- 0.019904613494873047 seconds ---
EV LAYER: --- 0.11380362510681152 seconds ---
ALL FOWARD LAYER: --- 0.5128979682922363 seconds ---


Iteration:  60%|██████    | 3/5 [00:04<00:03,  1.63s/it]

PREDICT LOOP: --- 1.6618950366973877 seconds ---
NER LOOP: --- 0.0009343624114990234 seconds ---
NER LAYER: --- 0.09099411964416504 seconds ---
REL LAYER: --- 0.019932270050048828 seconds ---
EV LAYER: --- 0.09959864616394043 seconds ---
ALL FOWARD LAYER: --- 0.46517395973205566 seconds ---


                                                        

PREDICT LOOP: --- 1.5450270175933838 seconds ---
NER LOOP: --- 0.00026345252990722656 seconds ---
NER LAYER: --- 0.014961957931518555 seconds ---
REL LAYER: --- 0.006133317947387695 seconds ---
EV LAYER: --- 0.006146669387817383 seconds ---
ALL FOWARD LAYER: --- 0.04220175743103027 seconds ---
PREDICT LOOP: --- 0.0813448429107666 seconds ---




(FILE writing): --- 7.6365134716033936 seconds ---
Timer unit: 1e-09 s

Total time: 13.7251 s
File: /home/julio/repos/event_finder/DeepEventMine_fork/eval/evaluate.py
Function: predict at line 9

Line #      Hits         Time  Per Hit   % Time  Line Contents
     9                                           def predict(model, result_dir, eval_dataloader, eval_data, g_entity_ids_, params):
    10         1       3880.0   3880.0      0.0      mapping_id_tag = params['mappings']['nn_mapping']['id_tag_mapping']
    11                                           
    12                                               # store predicted entities
    13         1        760.0    760.0      0.0      ent_preds = []
    14                                           
    15                                               # store predicted events
    16         1        400.0    400.0      0.0      ev_preds = []
    17                                           
    18         1       1000.0   1000.0      0

In [25]:
# Second line-profiling run of predict with the same arguments as the earlier
# profiling cell. NOTE(review): the recorded totals differ (about 13.7 s there
# vs 47.4 s here); the cause is not visible in this notebook.
%lprun -f predict predict(model=deepee_model, result_dir=result_dir, eval_dataloader=test_dataloader, eval_data=nntest_data, g_entity_ids_=test_data['g_entity_ids_'], params=parameters)


Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

NER LOOP: --- 4.088160753250122 seconds ---
NER LAYER: --- 4.168576955795288 seconds ---
REL LAYER: --- 0.01534891128540039 seconds ---
EV LAYER: --- 0.04376649856567383 seconds ---
ALL FOWARD LAYER: --- 4.417277812957764 seconds ---


Iteration:  20%|██        | 1/5 [00:07<00:28,  7.06s/it]

PREDICT LOOP: --- 7.059613227844238 seconds ---
NER LOOP: --- 6.67397665977478 seconds ---
NER LAYER: --- 6.782450199127197 seconds ---
REL LAYER: --- 0.020621299743652344 seconds ---
EV LAYER: --- 0.1400163173675537 seconds ---
ALL FOWARD LAYER: --- 7.2468602657318115 seconds ---


Iteration:  40%|████      | 2/5 [00:18<00:28,  9.61s/it]

PREDICT LOOP: --- 18.449894428253174 seconds ---
NER LOOP: --- 6.6860737800598145 seconds ---
NER LAYER: --- 6.782872915267944 seconds ---
REL LAYER: --- 0.018086910247802734 seconds ---
EV LAYER: --- 0.11701488494873047 seconds ---
ALL FOWARD LAYER: --- 7.209285259246826 seconds ---


Iteration:  60%|██████    | 3/5 [00:29<00:20, 10.33s/it]

PREDICT LOOP: --- 29.643948793411255 seconds ---
NER LOOP: --- 5.759660959243774 seconds ---
NER LAYER: --- 5.8500213623046875 seconds ---
REL LAYER: --- 0.017380952835083008 seconds ---
EV LAYER: --- 0.09950733184814453 seconds ---
ALL FOWARD LAYER: --- 6.2324864864349365 seconds ---


Iteration:  80%|████████  | 4/5 [00:39<00:10, 10.12s/it]

PREDICT LOOP: --- 39.43154978752136 seconds ---


                                                        

NER LOOP: --- 0.2331676483154297 seconds ---
NER LAYER: --- 0.24660420417785645 seconds ---
REL LAYER: --- 0.01309823989868164 seconds ---
EV LAYER: --- 0.009708166122436523 seconds ---
ALL FOWARD LAYER: --- 0.2849557399749756 seconds ---
PREDICT LOOP: --- 39.852705001831055 seconds ---




(FILE writing): --- 7.950889587402344 seconds ---
Timer unit: 1e-09 s

Total time: 47.3856 s
File: /home/julio/repos/event_finder/DeepEventMine_fork/eval/evaluate.py
Function: predict at line 9

Line #      Hits         Time  Per Hit   % Time  Line Contents
     9                                           def predict(model, result_dir, eval_dataloader, eval_data, g_entity_ids_, params):
    10         1       2630.0   2630.0      0.0      mapping_id_tag = params['mappings']['nn_mapping']['id_tag_mapping']
    11                                           
    12                                               # store predicted entities
    13         1        220.0    220.0      0.0      ent_preds = []
    14                                           
    15                                               # store predicted events
    16         1        170.0    170.0      0.0      ev_preds = []
    17                                           
    18         1        430.0    430.0      0.

In [1]:
import json

In [2]:
# Load the full PubMed JSON file into memory as `pub`.
# NOTE(review): absolute, user-specific path; adjust it when running elsewhere.
name = '/home/julio/repos/event_finder/data/pubmed/pubmed.json'

# The whole file is parsed at once; the size cells below measure the result.
with open(name,'r') as f:
    pub = json.load(f)
    


In [4]:
# Deep size of the loaded PubMed object.
# NOTE(review): the recorded output below is an integer and the next cell treats
# it as a byte count, whereas get_size as defined in this notebook ends with a
# division by 1024*1024 -- the output presumably predates that change; re-run
# this cell to refresh it.
get_size(pub)

49509365820

In [5]:
# Convert the byte count recorded above to MiB (1 MiB = 1024 * 1024 bytes).
size = 49509365820
size/(1024*1024)
# ~47,216 MiB, i.e. about 46.1 GiB (49.5 GB in decimal units)

47215.81060409546

In [9]:
# Keep only the first SUBSET_SIZE entries of `pub` (in iteration order) as a
# small development subset.
SUBSET_SIZE = 1000

backdict = {}
for i, (k, v) in enumerate(pub.items()):
    # Test the bound *before* inserting: checking `i == 1000` after the insert
    # kept 1001 entries, one more than the "pubmed1000" subset is meant to hold.
    if i >= SUBSET_SIZE:
        break
    backdict[k] = v

In [11]:
# Write the subset collected in `backdict` to its own JSON file so later runs
# can load it instead of the full dump.
# NOTE(review): absolute, user-specific path; adjust it when running elsewhere.
import json
with open('/home/julio/repos/event_finder/data/pubmed/pubmed1000.json','w') as file:
    json.dump(backdict,file)

