In [1]:
import os
import tqdm
import os.path as osp
import pandas as pd
import glob
import spacy
%run env.py
%run src/lib.py
%run src/integration.py
fix_jupyter_spacy_config()
corpus_dir = osp.join(DATA_DIR, 'articles', 'corpus', 'corpus_00')
collection_dir = osp.join(REPO_DATA_DIR, 'brat', 'collection_01')
corpus_docs_path = osp.join(corpus_dir, 'docs')
corpus_links_dir = osp.join(corpus_dir, 'links')

In [2]:
files = []
# Add unannotated text files and THEN annotated text files as links will overwrite
f = glob.glob(osp.join(corpus_docs_path, '*.txt'))
assert f, 'No unannotated text files found'
print('Found {} unnannotated text files'.format(len(f)))
files.extend(f)
f = glob.glob(osp.join(collection_dir, '*.txt'))
assert f, 'No annotated text files found'
print('Found {} annotated text files'.format(len(f)))
files.extend(f)
len(files)

Found 500 unnannotated text files
Found 10 annotated text files


510

In [3]:
corpus_files = []
for f in files:
    if not osp.exists(corpus_links_dir):
        os.makedirs(corpus_links_dir)
    path = osp.join(corpus_links_dir, osp.basename(f))
    if osp.exists(path):
        os.unlink(path)
    os.symlink(f, path)
    corpus_files.append(path)
print('Linked {} files in {}'.format(len(corpus_files), corpus_links_dir))

Linked 510 files in /Users/eczech/data/research/hammer/nlp/20190311-pubmed-tcell-relation/articles/corpus/corpus_00/links


In [5]:
nlp = get_scispacy_pipeline()
nlp.pipeline

[('tagger', <spacy.pipeline.Tagger at 0x11f53a4a8>),
 ('parser', <spacy.pipeline.DependencyParser at 0x11f566d58>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x12a4068e0>)]

In [10]:
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher
from interlap import InterLap

if not Span.has_extension('ent_typ_id'):
    Span.set_extension('ent_typ_id', default=None)
if not Span.has_extension('ent_typ_lbl'):
    Span.set_extension('ent_typ_lbl', default=None)
    
def merge_fn_keep_longest_interval(intervals):
    """Choose longest interval from given list of (start, end, data) tuples"""
    return sorted(intervals, key=lambda v: v[1]-v[0])[-1]

class EntityMatcher(object):
    """Entity matcher with overlap resolution"""
    
    def __init__(self, nlp, terms, label):
        patterns = [nlp.make_doc(text) for text in terms]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add(label, None, *patterns)

    def __call__(self, doc):
        # Loop through tokens and do exact lookup on normalized, known labels
        matches = self.matcher(doc)
        
        spans = IntervalMergingDict(merge_fn_keep_longest_interval)
        for match_id, start, end in matches:
            # Note that inverval overlap detection is done with inclusive right and left
            # boundaries and the end index from matching is always exclusive so for interval
            # overlap detection to work, 1 should be subtracted from right side index
            spans.add(start, end-1, Span(doc, start, end, label=match_id)) 
        # Sort results by start index and extract span objects
        spans = [s[1] for s in sorted(spans.items(), key=lambda e: e[0])]
        doc.ents = list(doc.ents) + spans
        return doc 
    
def _get_matcher_terms(values):
    return [v for s in values for v in [s.lower(), s.upper(), s.title(), s]]

class EntityTagger(object):

    def __init__(self, nlp, entity_type, label):
        df = get_entity_meta_data(entity_type)
        self.matcher = EntityMatcher(nlp, df['sym'].unique(), label)
        self.label = nlp.vocab.strings[label]
        self.entity_type = entity_type
        self.df = df.set_index('sym')

    def __call__(self, doc):
        # Attach metadata properties to matched entities
        doc = self.matcher(doc)
        for ent in doc.ents:
            if ent.label != self.label:
                continue
            #print(self.entity_type, ent.text, ent.label)
            r = self.df.loc[ent.text]
            ent._.ent_typ_id = r['id']
            ent._.ent_typ_lbl = r['lbl']
        return doc 
    
class EntityRefiner(object):
    name = 'refiner'
    
    def __init__(self, nlp):
        self.ck_tagger = EntityTagger(nlp, 'cytokines', 'CYTOKINE')
        self.tf_tagger = EntityTagger(nlp, 'transcription_factors', 'TF')
        self.ct_tagger = EntityTagger(nlp, 'cell_types', 'CELL_TYPE')
        
    def __call__(self, doc):
        # Reset entities
        doc.ents = list()
        doc = self.ck_tagger(doc)
        doc = self.tf_tagger(doc)
        doc = self.ct_tagger(doc)
        return doc 

if nlp.has_pipe(EntityRefiner.name):
    nlp.remove_pipe(EntityRefiner.name)
nlp.add_pipe(EntityRefiner(nlp), last=True)

In [11]:
#nlp.pipeline[-1][1].ck_tagger.df.loc['TGF-β']

In [12]:
# with open(osp.join(output_corpus_dir, 'PMC3925027.txt'), 'r') as fd:
#     doc = fd.read()
#doc = nlp(doc)
#spacy.displacy.render(doc, style='ent', jupyter=True)

In [13]:
from spacy.tokens import Doc
if not Doc.has_extension('pmc_id'):
    Doc.set_extension('pmc_id', default=None)
    
def get_docs():
    for f in tqdm.tqdm(corpus_files):
        with open(f, 'r') as fd:
            text = fd.read()
            try:
                doc = nlp(text)
                doc._.pmc_id = osp.basename(f).replace('.txt', '')
                yield doc
            except:
                print('Failed to process doc at {}'.format(f))
                raise
docs = [d for d in get_docs()]

100%|██████████| 510/510 [05:48<00:00,  3.21it/s]


In [19]:
# spacy.displacy.render(docs[3], style='ent', jupyter=True)

### Tag Export

In [20]:
from collections import OrderedDict
df = pd.DataFrame([
    OrderedDict(
        id=doc._.pmc_id, type=ent.label_, 
        ent_typ_id=ent._.ent_typ_id, ent_typ_lbl=ent._.ent_typ_lbl,
        start_chr=ent.start_char, end_chr=ent.end_char, 
        start_wrd=ent.start, end_wrd=ent.end, 
        text=ent.text
    ) 
    for doc in docs 
    for ent in doc.ents
])
df.head()

Unnamed: 0,id,type,ent_typ_id,ent_typ_lbl,start_chr,end_chr,start_wrd,end_wrd,text
0,PMC5743442,CELL_TYPE,CTBFBDE5121B6748D1,Th17,65,69,8,9,Th17
1,PMC5743442,CELL_TYPE,CTBFBDE5121B6748D1,Th17,87,91,11,12,Th17
2,PMC5743442,CYTOKINE,CK1C5A7BDC959D1365,TGF-β1,174,179,25,26,TGF-β
3,PMC5743442,CELL_TYPE,CTBFBDE5121B6748D1,Th17,199,203,29,30,Th17
4,PMC5743442,CYTOKINE,CK1C5A7BDC959D1365,TGF-β1,275,280,42,43,TGF-β


In [21]:
path = osp.join(corpus_dir, 'tags.csv')
df.to_csv(path, index=False)
path

'/Users/eczech/data/research/hammer/nlp/20190311-pubmed-tcell-relation/articles/corpus/corpus_00/tags.csv'

### Relation Export

In [22]:
from bratreader.repomodel import RepoModel

repo_model = RepoModel(collection_dir)

In [23]:
repo_model.documents.keys()

dict_keys(['PMC2646571', 'PMC3304099', 'PMC2634967', 'PMC2193209', 'PMC3095633', 'PMC3189223', 'PMC3235500', 'PMC3173465', 'PMC3046151', 'PMC2938478'])

In [24]:
doc = repo_model.documents['PMC2646571']
annot = doc.annotations[1]

In [25]:
annot.labels, annot.links, annot.realspan, annot.spans, annot.repr, annot.words

(defaultdict(list, {'CYTOKINE': []}),
 defaultdict(list, {}),
 (39, 44),
 [[39, 44]],
 'TGF-β',
 [<bratreader.word.Word at 0x202958f60>])

In [26]:
doc.key

'PMC2646571'

In [46]:
def get_relations(doc):
    relations = []
    for a2 in doc.annotations:
        # Links only exist as INCOMING links meaning that a2 is the target of relations
        # and a1 will be the source
        if a2.links:
            for relation_type, annots in a2.links.items():
                for a1 in annots:
                    assert len(a1.labels) == len(a2.labels) == 1
                    relations.append(dict(
                        id=doc.key,
                        rel_typ=relation_type, 
                        e1_typ=list(a1.labels.keys())[0], e1_start_chr=a1.realspan[0], e1_end_chr=a1.realspan[1], e1_text=a1.repr,
                        e2_typ=list(a2.labels.keys())[0], e2_start_chr=a2.realspan[0], e2_end_chr=a2.realspan[1], e2_text=a2.repr
                    ))
    return relations
df = pd.DataFrame([r for k, doc in repo_model.documents.items() for r in get_relations(doc)])
df = df.drop_duplicates()
df.head()

Unnamed: 0,e1_end_chr,e1_start_chr,e1_text,e1_typ,e2_end_chr,e2_start_chr,e2_text,e2_typ,id,rel_typ
0,24,19,Gfi-1,TF,85,81,Th17,CELL_TYPE,PMC2646571,Differentiation
1,44,39,TGF-β,CYTOKINE,85,81,Th17,CELL_TYPE,PMC2646571,Induction
2,44,39,TGF-β,CYTOKINE,125,90,CD103+ inducible regulatory T cells,CELL_TYPE,PMC2646571,Induction
3,24,19,Gfi-1,TF,125,90,CD103+ inducible regulatory T cells,CELL_TYPE,PMC2646571,Differentiation
12,371,366,Gfi-1,TF,436,433,Th2,CELL_TYPE,PMC2646571,Differentiation


In [49]:
df.groupby(['e1_typ', 'e2_typ', 'rel_typ']).size()

e1_typ     e2_typ     rel_typ        
CELL_TYPE  CYTOKINE   Secretion          58
CYTOKINE   CELL_TYPE  Induction          84
TF         CELL_TYPE  Differentiation    70
dtype: int64

In [36]:
#df_exp[df_exp['id'] == 'PMC3304099']

In [50]:
path = osp.join(corpus_dir, 'relations.csv')
df.to_csv(path, index=False)
path

'/Users/eczech/data/research/hammer/nlp/20190311-pubmed-tcell-relation/articles/corpus/corpus_00/relations.csv'