In [4]:
import random
import os
import json
import numpy as np
from transformers import AutoTokenizer, AutoConfig, AutoProcessor
from transformers import DataCollatorWithPadding
from transformers import AutoModel, TrainingArguments, Trainer
# from transformers import RobertaTokenizer, RobertaModel

random.seed(42)
percent_to_train = .8

model_name = 'allenai/biomed_roberta_base'
dropout = .03



In [5]:
currentdir = os.getcwd() # ~/MeasEval/baselines

trainpaths = [os.path.join(currentdir, "../data/raw/trial/tsv/"),
             os.path.join(currentdir, "../data/raw/train/tsv/")]

evalpath = os.path.join(currentdir, "../data/raw/eval/text/")

textpaths = [os.path.join(currentdir, "../data/raw/trial/txt/"),
            os.path.join(currentdir, "../data/raw/train/text/")]

# place to save interim data
interimpath = os.path.join(currentdir, "../data/interim/")

# Set shorthands for annotation spans
typemap = {"Quantity": "QUANT",
           "MeasuredEntity": "ME", 
           "MeasuredProperty": "MP", 
           "Qualifier": "QUAL"}

In [6]:

# Collect all the ids and all the text files in both the train and trial directories
# Set our train test split for doing initial model development.
docIds_path = os.path.join(interimpath, "docIds")
textset_path = os.path.join(interimpath, "textset")
trainIds_path = os.path.join(interimpath, "trainIds")
testIds_path = os.path.join(interimpath, "testIds")

# Toggle to re-split the data
do_split_raw_data = True

if do_split_raw_data:
    docIds = []
    textset = {}
    for fileset in textpaths:
        for fn in os.listdir(fileset):
            path = fileset+fn
            #print(fn)
            with open(path) as textfile:
                text = textfile.read()
                #[:-4] strips off the .txt to get the id
                textset[fn[:-4]] = text
                docIds.append(fn[:-4])
    
    print(type(textset))

    random.shuffle(docIds)

    tt_split = int(np.round(len(docIds) * percent_to_train))

    trainIds = docIds[:tt_split]
    testIds = docIds[tt_split:]

    textset_file = open(textset_path,'w')
    json.dump(textset, textset_file)
    textset_file.close()
    np.savetxt(docIds_path, docIds, fmt='%s')
    np.savetxt(trainIds_path, trainIds, fmt='%s')
    np.savetxt(testIds_path, testIds, fmt='%s')
else:
    trainIds = np.loadtxt(trainIds_path, dtype='str')
    testIds = np.loadtxt(testIds_path, dtype='str')
    docIds = np.loadtxt(docIds_path, dtype='str')

    textset_file = open(textset_path,'r')
    textset = textset_file.read()
    textset_file.close()

In [7]:
print(f"first 5 train IDs: \n{trainIds[:5]}")

print(f"first 5 test IDs: \n{testIds[:5]}")

print(f"folders containing data and annotations: \n{textpaths}")


first 5 train IDs: 
['S2213671113000738-647' 'S2213671113001306-1398' 'S0925443913001385-1646'
 'S1750583613004192-714' 'S0925443913001385-1429']
first 5 test IDs: 
['S0960148113002048-3775' 'S0167577X14001256-389' 'S0006322312001096-1230'
 'S0022399913003358-943' 'S0167739X12001525-5094']
folders containing data and annotations: 
['/home/sam/MeasEval/baselines/../data/raw/trial/txt/', '/home/sam/MeasEval/baselines/../data/raw/train/text/']


In [8]:
# Build training data from TSVs in expected format for spacy NER models...
# We have to train each model separately, because spacy doesn't let us have 
# Multiple entities that overlap, and we have this a lot (Especially in our Qualifiers)
# Unfortunately, we even have a fair bit of overlap within annotation types, 
# and end up needing to throw away a bunch of training data.

# Note that we have data split for train / test, and we also have full training data.

trainents = {"QUANT": [], "ME": [], "MP": [], "QUAL": []}
traindata = {"QUANT": [], "ME": [], "MP": [], "QUAL": []}
testents = {"QUANT": [], "ME": [], "MP": [], "QUAL": []}
testdata = {"QUANT": [], "ME": [], "MP": [], "QUAL": []}

alltrainents = {"QUANT": [], "ME": [], "MP": [], "QUAL": []}
alltraindata = {"QUANT": [], "ME": [], "MP": [], "QUAL": []}

for fileset in trainpaths:
    for fn in os.listdir(fileset):
        entities = {"QUANT": [], "ME": [], "MP": [], "QUAL": []}
        with open(fileset+fn) as annotfile:
            text = textset[fn[:-4]]

            # Overlap handler (not required for RoBERTa??)
            next(annotfile)
            annots = annotfile.read().splitlines()
            for a in annots:
                annot = a.split("\t")
                atype = typemap[annot[2]]
                start = int(annot[3])
                stop = int(annot[4])
                overlap = False
                # This is where we toss out the overlaps:
                # for ent in entities[atype]:
                #     if ((start >= ent[0] and start <= ent[1]) or (stop >= ent[0] and stop <= ent[1]) or
                #         (ent[0] >= start and ent[0] <= stop) or (ent[1] >= start and ent[1] <= stop)):
                #         #print(str(start)+"-"+str(stop)+" overlaps " + str(ent))
                #         overlap = True
                if overlap == False:    
                    entities[atype].append((start, stop, atype))

            if fn[:-4] in trainIds:
                traindata["QUANT"].append((text, {"entities": entities["QUANT"]}))
                traindata["ME"].append((text, {"entities": entities["ME"]}))
                traindata["MP"].append((text, {"entities": entities["MP"]}))
                traindata["QUAL"].append((text, {"entities": entities["QUAL"]}))
                trainents["QUANT"].extend(entities["QUANT"])
                trainents["ME"].extend(entities["ME"])
                trainents["MP"].extend(entities["MP"])
                trainents["QUAL"].extend(entities["QUAL"])
            else:
                testdata["QUANT"].append((text, {"entities": entities["QUANT"]}))
                testdata["ME"].append((text, {"entities": entities["ME"]}))
                testdata["MP"].append((text, {"entities": entities["MP"]}))
                testdata["QUAL"].append((text, {"entities": entities["QUAL"]}))
                testents["QUANT"].extend(entities["QUANT"])
                testents["ME"].extend(entities["ME"])
                testents["MP"].extend(entities["MP"])
                testents["QUAL"].extend(entities["QUAL"])
            alltraindata["QUANT"].append((text, {"entities": entities["QUANT"]}))
            alltraindata["ME"].append((text, {"entities": entities["ME"]}))
            alltraindata["MP"].append((text, {"entities": entities["MP"]}))
            alltraindata["QUAL"].append((text, {"entities": entities["QUAL"]}))
            alltrainents["QUANT"].extend(entities["QUANT"])
            alltrainents["ME"].extend(entities["ME"])
            alltrainents["MP"].extend(entities["MP"])
            alltrainents["QUAL"].extend(entities["QUAL"])

TypeError: string indices must be integers

In [None]:
ex = 5
ex_type = 'QUANT'

exampletxt = traindata[ex_type][ex][0]

exampleents = traindata[ex_type][ex][1]['entities']

print(f"example QUANT train data: \n{exampletxt}\n")

print(f"example QUANT entity from train data: \n{exampleents}\n")

for start, stop, type in exampleents:
    print(f'{start}, {stop}, {type}')
    print(f'{exampletxt[start:stop]}')

# Trainents is an ordered list of all entities of a type the the tsv
# this prints those which match the train data
tsvent = 0
for i in range(0,ex):
    tsvent += len(traindata[ex_type][i][1]['entities'])
    #print(tsvent)

print(f"\nThose same entity indexes, but from the tsv: \n{trainents['QUANT'][tsvent:tsvent+len(exampleents)]}\n")

example QUANT train data: 
Sediments were prepared following existing techniques (Hendry et al., 2010), which were refined slightly to better suit the microseparation technique utilised here as a convenient alternative to sieving. Initial cleaning with H2O2 and HCl was carried out to concentrate biogenic opal. From this pre-cleaned sample the >100 μm fraction was separated using microseparation (Minoletti et al., 2009) and between 50 and 100 spicules were hand-picked. A range of spicule morphotypes were included, as it has been shown that neither spicule morphology nor species composition creates any consistent offset in δ30Si (Hendry et al., 2010, 2011). Spicules were sonicated in reagent grade methanol and dried down in 200 μL of concentrated HNO3. Sponge δ30Si data presented here are generally in agreement with the data from this site produced by De la Rocha (2003), despite different methodologies and specific sampling intervals (Fig. S2). XRD analysis showed sponge spicules from th

In [None]:
# We don't throw out _that_ many, see counts below.
print("Training:")
trainentscount = 0
trainenttypecount = {"QUANT":0, "ME":0, "MP":0, "QUAL":0}
for t in ["QUANT", "ME", "MP", "QUAL"]:
    trainenttypecount[t]=len(trainents[t])
    print(t + ": " + str(trainenttypecount[t]))
    trainentscount+=trainenttypecount[t]
print("Total: " + str(trainentscount))

testentscount = 0
testenttypecount = {"QUANT":0, "ME":0, "MP":0, "QUAL":0}
print("\nTest:")
for t in ["QUANT", "ME", "MP", "QUAL"]:
    print(t + ": " + str(len(testents[t])))
    testenttypecount[t]=len(testents[t])
    testentscount+=len(testents[t])
print("Total: " + str(testentscount))

allentscount = 0
print("\nAll training:")
for t in ["QUANT", "ME", "MP", "QUAL"]:
    print(t + ": " + str(len(alltrainents[t])))
    allentscount+=len(alltrainents[t])
print("Total: " + str(allentscount))

print("\nTest to train rations:")
print(f"All examples : {(testentscount / trainentscount)}")
for t in ["QUANT", "ME", "MP", "QUAL"]:
    print(f"{t} : {(testenttypecount[t] / trainenttypecount[t])}")



Training:
QUANT: 900
ME: 884
MP: 562
QUAL: 244
Total: 2590

Test:
QUANT: 264
ME: 264
MP: 180
QUAL: 65
Total: 773

All training:
QUANT: 1164
ME: 1148
MP: 742
QUAL: 309
Total: 3363

Test to train rations:
All examples : 0.29845559845559844
QUANT : 0.29333333333333333
ME : 0.2986425339366516
MP : 0.3202846975088968
QUAL : 0.26639344262295084


In [None]:
# todo: build models

tokenizer = AutoTokenizer.from_pretrained(model_name)

# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# processor = AutoProcessor.from_pretrained(model_name)

config = AutoConfig.from_pretrained(model_name)

model = AutoModel.from_config(config)