In [1]:
import random
import os
import json
import pandas as pd
import numpy as np
from collections import defaultdict
from transformers import AutoTokenizer, AutoConfig, AutoProcessor
from transformers import RobertaConfig, RobertaTokenizer, RobertaForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import AutoModel, TrainingArguments, Trainer
# from transformers import RobertaTokenizer, RobertaModel

random.seed(42)
reshuffle_docs = False
percent_to_test = .1
percent_to_dev = .2
percent_to_train =  1 - percent_to_dev - percent_to_test

model_name = 'allenai/biomed_roberta_base'
dropout = .03

annot_types = {'Quantity', 'MeasuredEntity', 'MeasuredProperty', 'Qualifier'}

In [2]:
currentdir = os.getcwd() # ~/MeasEval/baselines

combopath_txt = os.path.join(currentdir, "../data/raw/combo/text/")
combopath_annot = os.path.join(currentdir, "../data/raw/combo/tsv/")

interimpath = os.path.join(currentdir, "../data/interim/")

docIds = []

if reshuffle_docs == True:
    combo_txt = {}
    for fn in os.listdir(combopath_txt):
        docIds.append(fn[:-4])
        path = combopath_txt+fn
        with open(path) as textfile:
                text = textfile.read()
                #[:-4] strips off the .txt to get the id
                combo_txt[fn[:-4]] = text

if reshuffle_docs == True:
    combo_annot = pd.DataFrame()
    for fn in os.listdir(combopath_annot):
        path = combopath_annot+fn
        file = pd.read_csv(path,delimiter='\t',encoding='utf-8')
        combo_annot = pd.concat([combo_annot, file],ignore_index=True)

    random.shuffle(docIds)

    n_doc = len(docIds)
    split_train = int(np.round(n_doc * percent_to_train))
    split_dev = split_train + int(np.round(n_doc * percent_to_dev))

    docs_train = docIds[:split_train]
    docs_dev = docIds[split_train:split_dev]
    docs_test = docIds[split_dev:]

    train_annot = combo_annot.loc[combo_annot['docId'].isin(docs_train)]
    dev_annot = combo_annot.loc[combo_annot['docId'].isin(docs_dev)]
    test_annot = combo_annot.loc[combo_annot['docId'].isin(docs_test)]

    # save data
    train_annot.to_csv(interimpath+'train_annot.csv')
    dev_annot.to_csv(interimpath+'dev_annot.csv')
    test_annot.to_csv(interimpath+'test_annot.csv')

    train_txt = {d: combo_txt[d] for d in docs_train}
    dev_txt = {d: combo_txt[d] for d in docs_dev}
    test_txt = {d: combo_txt[d] for d in docs_test}
    
    with open(interimpath+'train_txt.json','w') as f:
        json.dump(train_txt, f)
    with open(interimpath+'dev_txt.json','w') as f:
        json.dump(dev_txt, f)
    with open(interimpath+'test_txt.json','w') as f:
        json.dump(test_txt, f)

else:
    train_annot = pd.read_csv(interimpath+'train_annot.csv')
    dev_annot = pd.read_csv(interimpath+'dev_annot.csv')
    test_annot = pd.read_csv(interimpath+'test_annot.csv')

    with open(interimpath+'train_txt.json','r') as f:
        train_txt = json.load(f)
    with open(interimpath+'dev_txt.json','r') as f:
        dev_txt = json.load(f)
    with open(interimpath+'test_txt.json','r') as f:
        test_txt = json.load(f)




In [3]:
config = AutoConfig.from_pretrained(model_name)

In [4]:
# tokenizer = AutoTokenizer.from_pretrained(model_name, padding='max_length', max_length=512, truncation=True, return_tensors="pt"))
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer.all_special_ids
# Create a dict listing all the special tokens and their ids
# special_tokens_ref = 

[0, 2, 3, 1, 50264]

In [5]:
special_ids = tokenizer.all_special_ids
tokenizer.decode(special_ids)

'<s></s><unk><pad><mask>'

In [6]:
#todo EDA

ex_type = 'QUANT'
ex_num = 0
ex = train_annot.loc[train_annot['annotType'] == 'Quantity'].loc[ex_num]
ex_doc = ex['docId']

ex_txt = train_txt[ex_doc]
# print(ex_txt)


In [7]:

# function to align annotation labels with tokens

def align_labels(type, encoded_txt, annotation, tokenizer):
    from collections import defaultdict

    label_ids = np.full(len(encoded_txt['input_ids']),0)
    special_ids = tokenizer.all_special_ids

    annot_spans = np.array(annotation[['startOffset','endOffset']])

    for token_idx, token in list(enumerate(encoded_txt['input_ids'])):
        decoded_token = tokenizer.decode(token)
        # print(f"token index: {token_idx}")
        # print(f"decoded token: {decoded_token}")

        if token in special_ids:
            label_ids[token_idx] = -100
            # print('special token')

        else:
            token_start_char = encoded_txt.token_to_chars(token_idx).start
            token_end_char = encoded_txt.token_to_chars(token_idx).end
            # print(f"token span: {[token_start_char,token_end_char]}")
            for start, end in annot_spans:
                if start <= token_start_char <= end:
                    label_ids[token_idx] = 1
                    # print(f'{type} entity found spanning {[start,end]}')
                    break
                else:
                    label_ids[token_idx] = -100
                    # print("no entity found")
        print()

    return list(label_ids)

# Example

ex_doc = 'S0012821X12004384-1610'

type = 'Quantity'

annotation = train_annot.loc[train_annot['docId'] == ex_doc]

encoded_txt = tokenizer(train_txt[ex_doc])

encoded_txt['labels']=align_labels(type, encoded_txt, annotation, tokenizer)

encoded_txt





















































































































































































{'input_ids': [0, 133, 4315, 4996, 11, 83, 13771, 1630, 179, 4031, 6, 83, 3765, 8, 614, 6641, 16838, 5917, 1116, 462, 1580, 890, 877, 19258, 5019, 36, 17425, 1245, 9891, 43, 23, 973, 1360, 4, 306, 475, 36, 44105, 4, 262, 43, 6364, 10, 33122, 3238, 9, 4084, 514, 21862, 700, 3509, 73, 242, 1182, 21130, 14086, 137, 5, 230, 7720, 6, 61, 16, 275, 2002, 30, 41, 712, 11, 2174, 13273, 528, 7, 63, 6379, 2574, 4, 83, 24634, 8009, 4878, 11, 38, 4, 25224, 25299, 2764, 1594, 268, 8711, 678, 20771, 9, 3027, 8095, 11534, 3544, 31, 5681, 36, 7048, 7162, 204, 4, 245, 322, 660, 3059, 4878, 11, 43662, 20024, 1558, 347, 189, 33, 57, 1726, 30, 30789, 5000, 9, 5, 369, 3939, 31, 41, 9094, 12193, 1975, 1902, 6, 33634, 316, 347, 12, 225, 46129, 6523, 4363, 23, 5581, 4, 152, 5665, 189, 67, 3922, 5, 97, 21414, 11, 83, 13771, 1630, 179, 4031, 23, 973, 1646, 4, 401, 8, 973, 1570, 4, 406, 475, 36, 24648, 192, 7162, 204, 4, 134, 322, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [21]:
ex_doc = 'S0012821X12004384-1610'

type = 'Quantity'

annotation = train_annot.loc[train_annot['docId'] == ex_doc]

encoded_txt = tokenizer(train_txt[ex_doc])

align_labels(type, encoded_txt, annotation, tokenizer)

# print(align_labels(type, encoded_txt, annotation, tokenizer))

token index: 0
decoded token: <s>
special token

token index: 1
decoded token: The
token span: [0, 3]
no entity found
no entity found
no entity found
no entity found
no entity found
no entity found

token index: 2
decoded token:  brief
token span: [4, 9]
no entity found
Quantity entity found spanning [4, 14]

token index: 3
decoded token:  peak
token span: [10, 14]
no entity found
Quantity entity found spanning [4, 14]

token index: 4
decoded token:  in
token span: [15, 17]
no entity found
no entity found
Quantity entity found spanning [15, 30]

token index: 5
decoded token:  A
token span: [18, 19]
no entity found
no entity found
Quantity entity found spanning [15, 30]

token index: 6
decoded token: pect
token span: [19, 23]
no entity found
no entity found
Quantity entity found spanning [15, 30]

token index: 7
decoded token: od
token span: [23, 25]
no entity found
no entity found
Quantity entity found spanning [15, 30]

token index: 8
decoded token: in
token span: [25, 27]
no entity f

array([-100, -100,    1,    1,    1,    1,    1,    1,    1,    1,    1,
       -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
       -100, -100, -100, -100, -100, -100, -100, -100, -100,    1,    1,
          1,    1,    1, -100, -100, -100, -100, -100, -100, -100, -100,
       -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
       -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
       -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
       -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
       -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
       -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
       -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
       -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
       -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
       -100, -100, -100, -100, -100, -100,    1,   

In [None]:

print(f"|{ex_txt[lab_start:lab_end]}|")
encoded_label = tokenizer.encode(ex_txt[lab_start:lab_end])
print(f"|{encoded_label}|")
decoded_label = tokenizer.decode(encoded_label)
print(f"|{decoded_label}|")

char2tok_start = lab_start
char2tok_end = lab_end + 1 #tokenizer.char_to_token needs to start one char to the right

encoded_txt = tokenizer(ex_txt)

print(encoded_txt['input_ids'])

labels_to_ids = { 'QUANT': 1 }

lab_start_token_idx = encoded_txt.char_to_token(char2tok_start)
lab_end_token_idx = encoded_txt.char_to_token(char2tok_end)

lab_start_token = encoded_txt['input_ids'][lab_start_token_idx]
print(lab_start_token)
print(tokenizer.decode(lab_start_token))

lab_end_token = encoded_txt['input_ids'][lab_end_token_idx]
print(lab_end_token)
print(tokenizer.decode(lab_end_token))

label_as_tokens = encoded_txt['input_ids'][lab_start_token_idx:lab_end_token_idx]
print(f"|{tokenizer.decode(label_as_tokens)}|")


# print(list(enumerate(encoded_txt['input_ids'])))


special_ids = tokenizer.all_special_ids
label_ids = []
for token_idx, token in list(enumerate(encoded_txt['input_ids'])):
    if token in special_ids:
        label_ids.append(-100)
    else:
        decoded_token = tokenizer.decode(token)
        token_start_char = encoded_txt.token_to_chars(token_idx).start
        print(f"token index: {token_idx} | token id: {token} token: {decoded_token} | token start char: {token_start_char}")
        
        



    # Special tokens have a word id that is None. We set the label to -100 
    # so they are automatically ignored in the loss function.
    


print(label_ids)
# previous_word_idx = None
# label_ids = []
# for word_idx in encoded_txt.word_ids():
#     if word_idx is None:
#         label_ids.append(-100)
#     elif word_idx != previous_word_idx:
#         try:
#             label_ids.append(labels_to_ids[labels[word_idx]])
#         except:
#             label_ids.append(-100)
#     else:
#         label_ids.append(labels_to_ids[labels[word_idx]] if label_all_tokens else -100)
#     previous_word_idx = word_idx

In [138]:
token_charspan = encoded_txt.token_to_chars(7)
token_charspan.start

22

In [31]:
model = RobertaForSequenceClassification.from_pretrained(model_name)

model.

Some weights of the model checkpoint at allenai/biomed_roberta_base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at allenai/biomed_roberta_base and are newly initialized: ['classi

In [27]:
ex_ent = train_txt[ex_doc][ex['startOffset']:ex['endOffset']]

print(ex_ent)

ent_token_ids = tokenizer(ex_ent)['input_ids']

print(ent_token_ids)

decoded_ent = tokenizer.decode(ent_token_ids)

print(decoded_ent)

2617.4 m
[0, 2481, 1360, 4, 306, 475, 2]
<s>2617.4 m</s>


RobertaConfig {
  "_name_or_path": "allenai/biomed_roberta_base",
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

NameError: name 'exampletxt' is not defined

In [None]:
# todo: build models

config = AutoConfig.from_pretrained(model_name)
config

In [12]:

# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# processor = AutoProcessor.from_pretrained(model_name)

config = AutoConfig.from_pretrained(model_name)

print(config)


RobertaConfig {
  "_name_or_path": "allenai/biomed_roberta_base",
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}



In [13]:

model = AutoModel.from_config(config)

model.

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Drop