In [22]:
from prodigy.components.db import connect

def dataset_to_train_data(dataset, db=None):
    """Convert accepted Prodigy annotations into ``(text, annotations)`` pairs.

    Args:
        dataset: Name of the Prodigy dataset to read.
        db: Optional object exposing ``get_dataset(name)``. When omitted, a
            connection is opened with ``connect()``; pass one in to reuse an
            already-open connection.

    Returns:
        A list of ``(text, annotations)`` tuples, one per example whose
        ``answer`` is ``'accept'``. ``annotations["entities"]`` is a list of
        ``(start, end, label)`` tuples; the remaining keys hold fixed
        placeholder values.
    """
    if db is None:
        db = connect()
    train_data = []
    for eg in db.get_dataset(dataset):
        if eg.get('answer') != 'accept':
            continue
        # An accepted example is not guaranteed to carry a 'spans' key;
        # treat a missing key as "no entities" instead of raising KeyError.
        entities = [(span['start'], span['end'], span['label'])
                    for span in eg.get('spans', [])]
        train_data.append(
            (
                eg['text'],
                {
                    "entities": entities,
                    "dep": "",      # dependency label
                    "head": 0,      # offset of token head relative to token index
                    "tag": "",      # part-of-speech tag
                    "orth": "",     # verbatim text of the token
                },
            ))
    return train_data

In [23]:
# `dt` holds the examples from the gold *eval* dataset and `de` those from
# the gold *train* dataset (loaded in that order).
dt, de = (
    dataset_to_train_data(name)
    for name in ("medical-signs-gold-eval", 'FULL_SIGN_gold_train')
)

In [24]:
# Open a database handle (prodigy.components.db.connect) for interactive inspection.
db = connect()

In [25]:
# Display the names of the datasets stored in the database.
db.datasets

['medical_signs_relation_indicators',
 'medical_signs_disease_terms',
 'medical-signs-gold-eval',
 'classify_relevant_sentences',
 'FULL_SIGN_gold_train',
 'FULL_SIGN_binary_train',
 'gold_standard_updated_review_eval']

In [35]:
# Imported in this cell because it sits above the notebook's main import
# cell; on a fresh top-to-bottom run `spacy` would otherwise be undefined here.
import spacy

# Load the model with its "ner" component disabled.
nlp = spacy.load('en_core_sci_md', disable=["ner"])

In [26]:
import json

In [28]:
# `dt` was built from "medical-signs-gold-eval" and `de` from
# 'FULL_SIGN_gold_train', so the eval file must receive `dt` (previously both
# files were written from `de`). The context managers make sure each file is
# flushed and closed.
with open("./data/annotate/spacy_gold_data_eval.json", "w+") as eval_file:
    json.dump(dt, eval_file)
with open("./data/annotate/spacy_gold_data_train.json", "w+") as train_file:
    json.dump(de, train_file)

In [31]:
# Peek at the raw text of the first example in `dt`.
dt[0][0]

'Oxidative and nitrosative stress were greater in patients with sickle cell anemia compared with control patients, but the rate of vaso-occlusive crisis events in sickle cell anemia was not associated with the level of oxidative stress.'

In [37]:
import json

import spacy
from prodigy.components.db import connect
from prodigy.util import split_evals
from spacy.gold import GoldCorpus, minibatch, biluo_tags_from_offsets, tags_to_entities


def prodigy_to_spacy(nlp, dataset, db=None):
    """Create spaCy JSON training data from a Prodigy dataset.

    See https://spacy.io/api/annotation#json-input.

    Args:
        nlp: Pipeline handed to ``docs_from_offsets`` to process each text.
        dataset: Name of the Prodigy dataset to read.
        db: Optional object exposing ``get_dataset(name)``. When omitted, a
            connection is opened with ``connect()``; pass one in to reuse an
            already-open connection.

    Returns:
        Whatever ``docs_to_trees`` produces for the examples whose ``answer``
        is ``'accept'``.
    """
    if db is None:
        db = connect()

    offsets = []
    for eg in db.get_dataset(dataset):
        if eg.get('answer') != 'accept':
            continue
        # An accepted example is not guaranteed to carry a 'spans' key;
        # treat a missing key as "no entities" instead of raising KeyError.
        entities = [(span['start'], span['end'], span['label'])
                    for span in eg.get('spans', [])]
        offsets.append((eg['text'], {'entities': entities}))

    docs = docs_from_offsets(nlp, offsets)
    return docs_to_trees(docs)


def docs_from_offsets(nlp, gold):
    """Turn ``(text, {'entities': [...]})`` pairs into ``(doc, tags)`` pairs.

    Each text is run through ``nlp``; BILUO tags are computed from the
    character offsets, and every offset that ``doc.char_span`` can resolve is
    appended to ``doc.ents``. Documents that end up with no entities are
    left out of the result.
    """
    annotated = []
    for raw_text, annotation in gold:
        parsed = nlp(raw_text)
        char_offsets = annotation['entities']
        biluo = biluo_tags_from_offsets(parsed, char_offsets)
        for char_start, char_end, ent_label in (char_offsets or ()):
            candidate = parsed.char_span(char_start, char_end, label=ent_label)
            if not candidate:
                # No usable span for these offsets; skip this annotation.
                continue
            parsed.ents = [*parsed.ents, candidate]
        if not parsed.ents:  # drop this check to keep entity-free documents too
            continue
        annotated.append((parsed, biluo))
    return annotated


def docs_to_trees(docs):
    """Serialise ``(doc, tags)`` pairs into spaCy JSON training records.

    A pair is skipped (with a ``Dropping <index>`` message) when
    ``tags_to_entities`` raises ``AssertionError`` for its tags or when the
    tag sequence is empty. Every kept document becomes one record with a
    single paragraph whose sentences list the per-token annotations. Record
    ids are positions in the input sequence, so skipped documents leave gaps.
    """
    trees = []
    for position, (doc, ner_tags) in enumerate(docs):
        # tags_to_entities is called only as a validity check; its result is unused.
        try:
            tags_to_entities(ner_tags)
        except AssertionError:
            usable = False
        else:
            usable = bool(ner_tags)
        if not usable:
            print('Dropping {}'.format(position))
            continue

        sentences = [
            {
                'tokens': [
                    {
                        'id': token.i,
                        'orth': token.orth_,
                        'tag': token.tag_,
                        # head is stored as an offset relative to the token itself
                        'head': token.head.i - token.i,
                        'dep': token.dep_,
                        'ner': ner_tags[token.i],
                    }
                    for token in sent
                ]
            }
            for sent in doc.sents
        ]
        paragraph = {'raw': doc.text, 'sentences': sentences}
        trees.append({'id': position, 'paragraphs': [paragraph]})
    return trees

# Convert both gold datasets with the shared pipeline: the eval dataset
# becomes `dev`, the train dataset becomes `train`.
dev, train = (
    prodigy_to_spacy(nlp, source)
    for source in ('medical-signs-gold-eval', 'FULL_SIGN_gold_train')
)

# Write each converted split to its own JSON file.
with open('data/annotate/train.json', 'w+') as f:
    json.dump(train, f)
with open('data/annotate/dev.json', 'w+') as f:
    json.dump(dev, f)