In [1]:
import os
import tqdm
import os.path as osp
import pandas as pd
import glob
import spacy
%run env.py
%run src/lib.py
%run src/integration.py
fix_jupyter_spacy_config()
corpus_dir = osp.join(DATA_DIR, 'articles', 'corpus', 'corpus_00')
#collection_dir = osp.join(REPO_DATA_DIR, 'brat', 'collection_01')
collection_dir = osp.join(REPO_DATA_DIR, 'brat', 'collection_02')
corpus_docs_path = osp.join(corpus_dir, 'docs')
corpus_links_dir = osp.join(corpus_dir, 'links')

### Link Docs

Link annotated documents (which should ALL be annotated when present) and unannotated docs in a single directory:

In [2]:
files = []
# Add unannotated text files and THEN annotated text files as links will overwrite
f = glob.glob(osp.join(corpus_docs_path, '*.txt'))
assert f, 'No unannotated text files found'
print('Found {} unnannotated text files'.format(len(f)))
files.extend(f)
f = glob.glob(osp.join(collection_dir, '*.txt'))
assert f, 'No annotated text files found'
print('Found {} annotated text files'.format(len(f)))
files.extend(f)
len(files)

Found 500 unnannotated text files
Found 109 annotated text files


609

In [3]:
corpus_files = []
for f in files:
    if not osp.exists(corpus_links_dir):
        os.makedirs(corpus_links_dir)
    path = osp.join(corpus_links_dir, osp.basename(f))
    if osp.exists(path):
        os.unlink(path)
    os.symlink(f, path)
    corpus_files.append(path)
print('Linked {} files in {}'.format(len(corpus_files), corpus_links_dir))

Linked 609 files in /Users/eczech/data/research/hammer/nlp/20190311-pubmed-tcell-relation/articles/corpus/corpus_00/links


### Define Pipeline

In [4]:
nlp = get_scispacy_pipeline()
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x1147f5390>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x114937708>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x11f512648>)]

In [5]:
from spacy.pipeline import EntityRuler
from spacy.tokens import Span

if not Span.has_extension('id'):
    Span.set_extension('id', default=None)
if not Span.has_extension('lbl'):
    Span.set_extension('lbl', default=None)

class EntityRefiner(object):
    name = 'refiner'
    
    def __init__(self, nlp, df):
        self.ruler = EntityRuler(nlp, overwrite_ents=True)
        self.df = df.copy().set_index('id')
        patterns = []
        for rid, r in self.df.iterrows():
            tokens = nlp.tokenizer(r['sym']) # returns Doc
            pattern = [{'lower': t.lower_} for t in tokens]
            # Assign label as record id (e.g. "CKBF6003C60D23BA0D")
            patterns.append({'label': rid, 'pattern': pattern})
        self.ruler.add_patterns(patterns)
        
    def __call__(self, doc):
        doc = self.ruler(doc)
        ents = []
        for ent in doc.ents:
            if ent.label_ not in self.df.index:
                ents.append(ent)
                continue
            # Convert label of span from record id to entity type 
            # (e.g. CKBF6003C60D23BA0D -> CYTOKINE)
            r = self.df.loc[ent.label_]
            rid = ent.label_
            ent = Span(doc, ent.start, ent.end, label=r['type'])
            ent._.id = rid
            ent._.lbl = r['lbl']
            ents.append(ent)
        doc.ents = ents
        return doc 

df = pd.concat([
    get_entity_meta_data(CYTOKINES).assign(type='CYTOKINE'),
    get_entity_meta_data(TRANSCRIPTION_FACTORS).assign(type='TRANSCRIPTION_FACTOR'),
    get_entity_meta_data(CELL_TYPES).assign(type='IMMUNE_CELL_TYPE')
], sort=True)
if nlp.has_pipe(EntityRefiner.name):
    nlp.remove_pipe(EntityRefiner.name)
nlp.add_pipe(EntityRefiner(nlp, df), last=True)

In [7]:
#doc = nlp('The Vγ4 T lymphocyte subset predominated among the IL-17+ cell populations present in the lungs of CLP mice (unlike Vγ1 and αβ T lymphocytes) and was strongly biased toward IL-17 rather than toward IFN-γ production')
#doc.ents

In [8]:
# #path = '/Users/eczech/data/research/hammer/nlp/20190311-pubmed-tcell-relation/articles/corpus/corpus_00/links/PMC4767817.txt'
# path = '/Users/eczech/data/research/hammer/nlp/20190311-pubmed-tcell-relation/articles/corpus/corpus_00/links/PMC3235500.txt'
# doc = None
# with open(path, 'r') as fd:
#     text = fd.read()
#     doc = nlp(text)
#     #doc._.pmc_id = osp.basename(path).replace('.txt', '')

In [12]:
#nlp.pipeline[-1][1].ck_tagger.df.loc['TGF-β']

In [13]:
# with open(osp.join(output_corpus_dir, 'PMC3925027.txt'), 'r') as fd:
#     doc = fd.read()
#doc = nlp(doc)
#spacy.displacy.render(doc, style='ent', jupyter=True)

### Run Pipeline

In [None]:
from spacy.tokens import Doc
if not Doc.has_extension('pmc_id'):
    Doc.set_extension('pmc_id', default=None)
    
def get_docs():
    for f in tqdm.tqdm_notebook(corpus_files):
        with open(f, 'r') as fd:
            text = fd.read()
            try:
                doc = nlp(text)
                doc._.pmc_id = osp.basename(f).replace('.txt', '')
                yield doc
            except:
                print('Failed to process doc at {}'.format(f))
                raise
docs = [d for d in get_docs()]

HBox(children=(IntProgress(value=0, max=609), HTML(value='')))

In [19]:
# spacy.displacy.render(docs[3], style='ent', jupyter=True)

### Tag Export

In [21]:
from collections import OrderedDict
df = pd.DataFrame([
    OrderedDict(
        id=doc._.pmc_id, type=ent.label_, 
        ent_id=ent._.id, ent_lbl=ent._.lbl,
        start_chr=ent.start_char, end_chr=ent.end_char, 
        start_wrd=ent.start, end_wrd=ent.end, 
        text=ent.text
    ) 
    for doc in docs 
    for ent in doc.ents
])
df.head()

Unnamed: 0,id,type,ent_id,ent_lbl,start_chr,end_chr,start_wrd,end_wrd,text
0,PMC5743442,PROTEIN,,,10,19,1,2,Ski-Smad4
1,PMC5743442,TCELL_TYPE,CTBFBDE5121B6748D1,Th17,65,69,8,9,Th17
2,PMC5743442,TCELL_TYPE,CTBFBDE5121B6748D1,Th17,87,91,11,12,Th17
3,PMC5743442,CYTOKINE,CK1C5A7BDC959D1365,TGF-β1,174,179,25,26,TGF-β
4,PMC5743442,TCELL_TYPE,CTBFBDE5121B6748D1,Th17,199,203,29,30,Th17


In [22]:
path = osp.join(corpus_dir, 'tags.csv')
df.to_csv(path, index=False)
path

'/Users/eczech/data/research/hammer/nlp/20190311-pubmed-tcell-relation/articles/corpus/corpus_00/tags.csv'

### Relation Export

In [3]:
from bratreader.repomodel import RepoModel
repo_model = RepoModel(collection_dir)

In [13]:
len(repo_model.documents)

110

In [None]:
# repo_model.documents.keys()

In [None]:
# doc = repo_model.documents['PMC2646571']
# annot = doc.annotations[1]

In [None]:
# annot.labels, annot.links, annot.realspan, annot.spans, annot.repr, annot.words

In [None]:
# doc.key

In [8]:
def get_relations(doc):
    relations = []
    for a2 in doc.annotations:
        # Links only exist as INCOMING links meaning that a2 is the target of relations
        # and a1 will be the source
        if a2.links:
            for relation_type, annots in a2.links.items():
                for a1 in annots:
                    assert len(a1.labels) == len(a2.labels) == 1
                    relations.append(dict(
                        id=doc.key,
                        rel_typ=relation_type, 
                        e1_typ=list(a1.labels.keys())[0], e1_start_chr=a1.realspan[0], e1_end_chr=a1.realspan[1], e1_text=a1.repr,
                        e2_typ=list(a2.labels.keys())[0], e2_start_chr=a2.realspan[0], e2_end_chr=a2.realspan[1], e2_text=a2.repr
                    ))
    return relations
df = pd.DataFrame([r for k, doc in repo_model.documents.items() for r in get_relations(doc)])
df = df.drop_duplicates()
df.head()

Unnamed: 0,e1_end_chr,e1_start_chr,e1_text,e1_typ,e2_end_chr,e2_start_chr,e2_text,e2_typ,id,rel_typ
0,24,19,Gfi-1,TF,85,81,Th17,CELL_TYPE,PMC2646571,Differentiation
1,44,39,TGF-β,CYTOKINE,85,81,Th17,CELL_TYPE,PMC2646571,Induction
2,44,39,TGF-β,CYTOKINE,125,90,CD103+ inducible regulatory T cells,CELL_TYPE,PMC2646571,Induction
3,24,19,Gfi-1,TF,125,90,CD103+ inducible regulatory T cells,CELL_TYPE,PMC2646571,Differentiation
12,371,366,Gfi-1,TF,436,433,Th2,CELL_TYPE,PMC2646571,Differentiation


In [14]:
df.groupby(['e1_typ', 'e2_typ', 'rel_typ']).size()

e1_typ     e2_typ     rel_typ        
CELL_TYPE  CYTOKINE   Secretion          79
CYTOKINE   CELL_TYPE  Induction          96
TF         CELL_TYPE  Differentiation    83
dtype: int64

In [15]:
#df_exp[df_exp['id'] == 'PMC3304099']

In [16]:
path = osp.join(corpus_dir, 'relations.csv')
df.to_csv(path, index=False)
path

'/Users/eczech/data/research/hammer/nlp/20190311-pubmed-tcell-relation/articles/corpus/corpus_00/relations.csv'