In [1]:
import random
import os
import numpy as np
from transformers import AutoTokenizer, AutoConfig, AutoProcessor
from transformers import DataCollatorWithPadding
from transformers import AutoModel, TrainingArguments, Trainer
# from transformers import RobertaTokenizer, RobertaModel

random.seed(42)
percent_to_train = .7

model_name = 'allenai/biomed_roberta_base'
dropout = .03



In [21]:
currentdir = os.getcwd() # ~/MeasEval/baselines

trainpaths = [os.path.join(currentdir, "../data/trial/tsv/"),
             os.path.join(currentdir, "../data/train/tsv/")]

evalpath = os.path.join(currentdir, "../data/eval/text/")

textpaths = [os.path.join(currentdir, "../data/trial/txt/"),
            os.path.join(currentdir, "../data/train/text/")]

# Set shorthands for annotation spans
typemap = {"Quantity": "QUANT",
           "MeasuredEntity": "ME", 
           "MeasuredProperty": "MP", 
           "Qualifier": "QUAL"}

# Collect all the ids and all the text files in both the train and trial directories
# Set our train test split for doing initial model development.
docIds = []
textset = {}
for fileset in textpaths:
    for fn in os.listdir(fileset):
        path = fileset+fn
        #print(fn)
        with open(path) as textfile:
            text = textfile.read()
            #[:-4] strips off the .txt to get the id
            textset[fn[:-4]] = text
            docIds.append(fn[:-4])

random.shuffle(docIds)

tt_split = int(np.round(len(docIds) * percent_to_train))

trainIds = docIds[:tt_split]
testIds = docIds[tt_split:]



In [18]:
print(f"first 5 train IDs: \n{trainIds[:5]}")

print(f"first 5 test IDs: \n{testIds[:5]}")

print(f"folders containing data and annotations: \n{textpaths}")


first 5 train IDs: 
['S2213671113000738-647', 'S2213671113001306-1398', 'S0925443913001385-1646', 'S1750583613004192-714', 'S0925443913001385-1429']
first 5 test IDs: 
['S0019103512004009-4350', 'S016412121300188X-5038', 'S0022459611006116-1448', 'S030881461301604X-1002', 'S016412121300188X-4937']
folders containing data and annotations: 
['/home/sam/MeasEval/baselines/../data/trial/txt/', '/home/sam/MeasEval/baselines/../data/train/text/']


In [24]:
# Build training data from TSVs in expected format for spacy NER models...
# We have to train each model separately, because spacy doesn't let us have 
# Multiple entities that overlap, and we have this a lot (Especially in our Qualifiers)
# Unfortunately, we even have a fair bit of overlap within annotation types, 
# and end up needing to throw away a bunch of training data.

# Note that we have data split for train / test, and we also have full training data.

trainents = {"QUANT": [], "ME": [], "MP": [], "QUAL": []}
traindata = {"QUANT": [], "ME": [], "MP": [], "QUAL": []}
testents = {"QUANT": [], "ME": [], "MP": [], "QUAL": []}
testdata = {"QUANT": [], "ME": [], "MP": [], "QUAL": []}

alltrainents = {"QUANT": [], "ME": [], "MP": [], "QUAL": []}
alltraindata = {"QUANT": [], "ME": [], "MP": [], "QUAL": []}

for fileset in trainpaths:
    for fn in os.listdir(fileset):
        entities = {"QUANT": [], "ME": [], "MP": [], "QUAL": []}
        with open(fileset+fn) as annotfile:
            text = textset[fn[:-4]]

            # Overlap handler (not required for RoBERTa??)
            next(annotfile)
            annots = annotfile.read().splitlines()
            for a in annots:
                annot = a.split("\t")
                atype = typemap[annot[2]]
                start = int(annot[3])
                stop = int(annot[4])
                overlap = False
                # This is where we toss out the overlaps:
                # for ent in entities[atype]:
                #     if ((start >= ent[0] and start <= ent[1]) or (stop >= ent[0] and stop <= ent[1]) or
                #         (ent[0] >= start and ent[0] <= stop) or (ent[1] >= start and ent[1] <= stop)):
                #         #print(str(start)+"-"+str(stop)+" overlaps " + str(ent))
                #         overlap = True
                if overlap == False:    
                    entities[atype].append((start, stop, atype))

            if fn[:-4] in trainIds:
                traindata["QUANT"].append((text, {"entities": entities["QUANT"]}))
                traindata["ME"].append((text, {"entities": entities["ME"]}))
                traindata["MP"].append((text, {"entities": entities["MP"]}))
                traindata["QUAL"].append((text, {"entities": entities["QUAL"]}))
                trainents["QUANT"].extend(entities["QUANT"])
                trainents["ME"].extend(entities["ME"])
                trainents["MP"].extend(entities["MP"])
                trainents["QUAL"].extend(entities["QUAL"])
            else:
                testdata["QUANT"].append((text, {"entities": entities["QUANT"]}))
                testdata["ME"].append((text, {"entities": entities["ME"]}))
                testdata["MP"].append((text, {"entities": entities["MP"]}))
                testdata["QUAL"].append((text, {"entities": entities["QUAL"]}))
                testents["QUANT"].extend(entities["QUANT"])
                testents["ME"].extend(entities["ME"])
                testents["MP"].extend(entities["MP"])
                testents["QUAL"].extend(entities["QUAL"])
            alltraindata["QUANT"].append((text, {"entities": entities["QUANT"]}))
            alltraindata["ME"].append((text, {"entities": entities["ME"]}))
            alltraindata["MP"].append((text, {"entities": entities["MP"]}))
            alltraindata["QUAL"].append((text, {"entities": entities["QUAL"]}))
            alltrainents["QUANT"].extend(entities["QUANT"])
            alltrainents["ME"].extend(entities["ME"])
            alltrainents["MP"].extend(entities["MP"])
            alltrainents["QUAL"].extend(entities["QUAL"])

In [61]:
ex = 5
ex_type = 'QUANT'

exampletxt = traindata[ex_type][ex][0]

exampleents = traindata[ex_type][ex][1]['entities']

print(f"example QUANT train data: \n{exampletxt}\n")

print(f"example QUANT entity from train data: \n{exampleents}\n")

for start, stop, type in exampleents:
    print(f'{start}, {stop}, {type}')
    print(f'{exampletxt[start:stop]}')

# Trainents is an ordered list of all entities of a type the the tsv
# this prints those which match the train data
tsvent = 0
for i in range(0,ex):
    tsvent += len(traindata[ex_type][i][1]['entities'])
    #print(tsvent)

print(f"\nThose same entity indexes, but from the tsv: \n{trainents['QUANT'][tsvent:tsvent+len(exampleents)]}\n")

example QUANT train data: 
Cleaned sponge and diatom opal was dissolved via wet alkaline digestion (Cardinal et al., 2007; Ragueneau et al., 2005) in 0.2 M NaOH at 100 °C for 40 min (diatoms) or up to 1 week (sponge spicules). The samples were acidified to pH∼2 with 0.2 M thermally distilled HCl and separated from major ions using cation exchange resin (BioRad AG50W-X12, Georg et al., 2006). Silicon isotope analysis was carried out using a Nu Instruments Nu-Plasma HR multi-collector inductively coupled plasma mass spectrometer run in medium resolution mode (m/Δm∼3500 at 5% and 95%). Samples were introduced via a self-aspirating PFA microconcentric nebuliser (ESI) in a Cetac Aridus II desolvating unit. Measurements included six to eight standard-sample brackets (brackets where the rate of machine drift outstripped bracketing rate were disregarded), each composed of twenty eight-second integrations. Samples were measured relative to the NIST RM 8546 standard. The external diatomite stand

In [75]:
# We don't throw out _that_ many, see counts below.
print("Training:")
trainentscount = 0
trainenttypecount = {"QUANT":0, "ME":0, "MP":0, "QUAL":0}
for t in ["QUANT", "ME", "MP", "QUAL"]:
    trainenttypecount[t]=len(trainents[t])
    print(t + ": " + str(trainenttypecount[t]))
    trainentscount+=trainenttypecount[t]
print("Total: " + str(trainentscount))

testentscount = 0
testenttypecount = {"QUANT":0, "ME":0, "MP":0, "QUAL":0}
print("\nTest:")
for t in ["QUANT", "ME", "MP", "QUAL"]:
    print(t + ": " + str(len(testents[t])))
    testenttypecount[t]=len(testents[t])
    testentscount+=len(testents[t])
print("Total: " + str(testentscount))

allentscount = 0
print("\nAll training:")
for t in ["QUANT", "ME", "MP", "QUAL"]:
    print(t + ": " + str(len(alltrainents[t])))
    allentscount+=len(alltrainents[t])
print("Total: " + str(allentscount))

print("\nTest to train rations:")
print(f"All examples : {(testentscount / trainentscount)}")
for t in ["QUANT", "ME", "MP", "QUAL"]:
    print(f"{t} : {(testenttypecount[t] / trainenttypecount[t])}")



Training:
QUANT: 830
ME: 820
MP: 537
QUAL: 238
Total: 2425

Test:
QUANT: 334
ME: 328
MP: 205
QUAL: 71
Total: 938

All training:
QUANT: 1164
ME: 1148
MP: 742
QUAL: 309
Total: 3363

Test to train rations:
All examples : 0.3868041237113402
QUANT : 0.40240963855421685
ME : 0.4
MP : 0.3817504655493482
QUAL : 0.29831932773109243


In [None]:
# todo: build models

tokenizer = AutoTokenizer.from_pretrained(model_name)

# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# processor = AutoProcessor.from_pretrained(model_name)

config = AutoConfig.from_pretrained(model_name)

model = AutoModel.from_config(config)