In [1]:
import os
import tqdm
import os.path as osp
import pandas as pd
import glob
import spacy
%run env.py
%run src/lib.py
%run src/integration.py
%run src/supervision.py

# NOTE(review): not defined in this notebook — presumably provided by one of the
# scripts loaded via %run above; confirm what it configures
fix_jupyter_spacy_config()

# Alternative corpus/collection configuration, currently disabled
#corpus_dir = osp.join(DATA_DIR, 'articles', 'corpus', 'corpus_00')
#collection_dir = osp.join(REPO_DATA_DIR, 'brat', 'collection_02')

# Root directory of the article corpus and directory of the brat annotation collection
corpus_dir = osp.join(DATA_DIR, 'articles', 'corpus', 'corpus_01')
collection_dir = osp.join(REPO_DATA_DIR, 'brat', 'collection_02')

# Unannotated text files are read from 'docs'; symlinks to every document
# (annotated and unannotated) are collected under 'links'
corpus_docs_path = osp.join(corpus_dir, 'docs')
corpus_links_dir = osp.join(corpus_dir, 'links')

### Link Docs

Symlink the annotated documents (every document present in the annotation collection is expected to be fully annotated) and the unannotated documents into a single directory:

In [2]:
# Order matters: unannotated files come first and annotated files second,
# because links created later overwrite earlier ones with the same name
unannotated_files = glob.glob(osp.join(corpus_docs_path, '*.txt'))
assert unannotated_files, 'No unannotated text files found'
print('Found {} unnannotated text files'.format(len(unannotated_files)))

annotated_files = glob.glob(osp.join(collection_dir, '*.txt'))
assert annotated_files, 'No annotated text files found'
print('Found {} annotated text files'.format(len(annotated_files)))

files = unannotated_files + annotated_files
len(files)

Found 10000 unnannotated text files
Found 89 annotated text files


10089

In [3]:
# Clear existing linked files
# Only symlinks are removed; regular .txt files in the links directory are left untouched
for f in glob.glob(osp.join(corpus_links_dir, '*.txt')):
    if osp.islink(f):
        os.unlink(f)
# List any .txt files still present in the links directory (stderr from ls is discarded)
!ls -q $corpus_links_dir/*.txt 2> /dev/null

In [4]:
# Add new links
# Create the link directory once up front instead of re-checking it for every file
os.makedirs(corpus_links_dir, exist_ok=True)
corpus_files = []
seen_paths = set()
for f in files:
    path = osp.join(corpus_links_dir, osp.basename(f))
    # Overwrite despite initial clearing as docs from annotation
    # directory are preferred. lexists (unlike exists) is also true for
    # dangling symlinks, which would otherwise make os.symlink fail.
    if osp.lexists(path):
        os.unlink(path)
    os.symlink(f, path)
    # A file name present in both source directories maps to a single link,
    # so record it only once to avoid processing the same document twice
    if path not in seen_paths:
        seen_paths.add(path)
        corpus_files.append(path)
print('Linked {} files in {}'.format(len(corpus_files), corpus_links_dir))

Linked 10089 files in /Users/eczech/data/research/hammer/nlp/20190311-pubmed-tcell-relation/articles/corpus/corpus_01/links


### Define Pipeline

In [8]:
# Build the base pipeline and display its components.
# NOTE(review): get_scispacy_pipeline is not defined in this notebook — presumably
# it comes from one of the scripts loaded via %run; confirm
nlp = get_scispacy_pipeline()
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x11dfb0208>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x11e0f5528>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x129229468>)]

In [9]:
from spacy.pipeline import EntityRuler
from spacy.tokens import Span

# Register the custom span attributes once; the guard keeps the cell re-runnable
for ext_name in ('id', 'prefid', 'lbl'):
    if not Span.has_extension(ext_name):
        Span.set_extension(ext_name, default=None)


class EntityRefiner(object):
    """Pipeline component that tags known entity symbols and attaches their metadata.

    An ``EntityRuler`` (with ``overwrite_ents=True``) is loaded with one pattern
    per row of the metadata frame, labelled with the row's ``id``. When called on
    a doc, every entity whose label is one of those ids is replaced by a span
    labelled with the row's ``type``, and the ``id``, ``prefid`` and ``lbl``
    values are stored on the span's custom attributes. All other entities are
    passed through unchanged.
    """
    name = 'refiner'

    # TODO: add match types into meta data frames
    # Symbols matched case-sensitively (in upper-case form only) by default
    CASE_SENSITIVE_SYMS = frozenset(['NOT', 'GENESIS', 'MINOR', 'AIM', 'OUT'])

    def __init__(self, nlp, df, case_sensitive_syms=None):
        """Build the matching patterns from the metadata frame.

        Args:
            nlp: Pipeline object; passed to ``EntityRuler`` and used for its
                ``tokenizer`` to split symbols into tokens.
            df: Data frame with columns ``id``, ``sym``, ``type``, ``prefid``
                and ``lbl``; a copy indexed by ``id`` is kept on the instance.
            case_sensitive_syms: Optional iterable of symbols to match
                case-sensitively (compared in upper case). Defaults to
                ``CASE_SENSITIVE_SYMS``. Each such symbol becomes a single-token
                pattern, so it should tokenize to one token.
        """
        self.ruler = EntityRuler(nlp, overwrite_ents=True)
        self.df = df.copy().set_index('id')
        if case_sensitive_syms is None:
            case_sensitive_syms = self.CASE_SENSITIVE_SYMS
        self.case_sensitive_syms = frozenset(s.upper() for s in case_sensitive_syms)

        patterns = []
        # Only the symbol column is needed here, so iterate it directly
        for rid, sym in self.df['sym'].items():
            sym_upper = sym.upper()
            if sym_upper in self.case_sensitive_syms:
                # Exact-text match on the upper-case form
                pattern = [{'text': sym_upper}]
            else:
                # Otherwise, use case-insensitive match on each token
                tokens = nlp.tokenizer(sym)  # returns Doc
                pattern = [{'lower': t.lower_} for t in tokens]

            # Assign label as record id (e.g. "CKBF6003C60D23BA0D")
            patterns.append({'label': rid, 'pattern': pattern})
        self.ruler.add_patterns(patterns)

    def __call__(self, doc):
        """Apply the ruler to ``doc`` and relabel matched entities.

        Returns the same doc with ``doc.ents`` replaced by the refined entities.
        """
        doc = self.ruler(doc)
        ents = []
        for ent in doc.ents:
            if ent.label_ not in self.df.index:
                # Not one of our record ids: keep the entity as it is
                ents.append(ent)
                continue
            # Convert label of span from record id to entity type
            # (e.g. CKBF6003C60D23BA0D -> CYTOKINE)
            rid = ent.label_
            r = self.df.loc[rid]
            ent = Span(doc, ent.start, ent.end, label=r['type'])
            ent._.id = rid
            ent._.prefid = r['prefid']
            ent._.lbl = r['lbl']
            ents.append(ent)
        doc.ents = ents
        return doc

# Combine the metadata of all entity groups, tagging each row with its entity type
entity_groups = [
    (CYTOKINES, ENT_TYP_CK),
    (TRANSCRIPTION_FACTORS, ENT_TYP_TF),
    (CELL_TYPES, ENT_TYP_CT),
]
df = pd.concat(
    [get_entity_meta_data(group).assign(type=ent_typ) for group, ent_typ in entity_groups],
    sort=True
)

# Drop any refiner left over from a previous run of this cell, then append a fresh one
if nlp.has_pipe(EntityRefiner.name):
    nlp.remove_pipe(EntityRefiner.name)
nlp.add_pipe(EntityRefiner(nlp, df), last=True)

In [7]:
#doc = nlp('The Vγ4 T lymphocyte subset predominated among the IL-17+ cell populations present in the lungs of CLP mice (unlike Vγ1 and αβ T lymphocytes) and was strongly biased toward IL-17 rather than toward IFN-γ production')
#doc.ents

In [8]:
# #path = '/Users/eczech/data/research/hammer/nlp/20190311-pubmed-tcell-relation/articles/corpus/corpus_00/links/PMC4767817.txt'
# path = '/Users/eczech/data/research/hammer/nlp/20190311-pubmed-tcell-relation/articles/corpus/corpus_00/links/PMC3235500.txt'
# doc = None
# with open(path, 'r') as fd:
#     text = fd.read()
#     doc = nlp(text)
#     #doc._.pmc_id = osp.basename(path).replace('.txt', '')

In [9]:
#nlp.pipeline[-1][1].ck_tagger.df.loc['TGF-β']

In [10]:
# with open(osp.join(output_corpus_dir, 'PMC3925027.txt'), 'r') as fd:
#     doc = fd.read()
#doc = nlp(doc)
#spacy.displacy.render(doc, style='ent', jupyter=True)

### Run Pipeline

In [None]:
from spacy.tokens import Doc
# Register the custom doc attribute that holds the source document id;
# the guard keeps the cell re-runnable
if not Doc.has_extension('pmc_id'):
    Doc.set_extension('pmc_id', default=None)
    
def text_stream():
    """Yield ``(text, context)`` tuples for every file in ``corpus_files``.

    The context is a dict whose ``doc_id`` is the file name without its
    extension. Texts longer than ``nlp.max_length`` are truncated to that length.
    """
    for f in tqdm.tqdm_notebook(corpus_files):
        # Strip only the trailing extension (str.replace would also remove
        # any '.txt' occurring elsewhere in the file name)
        doc_id = osp.splitext(osp.basename(f))[0]
        # Read with an explicit encoding so non-ASCII characters do not depend
        # on the platform default; the handle is closed before yielding
        with open(f, 'r', encoding='utf-8') as fd:
            text = fd.read()
        # Clip to max length to avoid error like
        # ValueError: [E088] Text of length 1290071 exceeds maximum of 1000000
        if len(text) > nlp.max_length:
            text = text[:nlp.max_length]
        yield text, {'doc_id': doc_id}
    
def doc_stream():
    """Run the pipeline over ``text_stream()`` and yield each resulting doc.

    The ``doc_id`` from the context yielded alongside each text is stored on
    the doc's ``pmc_id`` extension attribute before the doc is yielded.
    """
    # NOTE(review): n_threads appears to be deprecated in newer spaCy releases — confirm before upgrading
    for doc, ctx in nlp.pipe(text_stream(), as_tuples=True, n_threads=3, batch_size=10):
        doc._.pmc_id = ctx['doc_id']
        yield doc
        
# Materialise the whole stream so the parsed docs can be reused by later cells
docs = list(doc_stream())

HBox(children=(IntProgress(value=0, max=10089), HTML(value='')))

In [11]:
# Number of parsed documents
len(docs)

10089

In [12]:
# spacy.displacy.render(docs[3], style='ent', jupyter=True)
# Render the entity highlights for one example article, selected by its id
example_docs = [d for d in docs if d._.pmc_id == 'PMC5591438']
spacy.displacy.render(
    example_docs, style='ent', jupyter=True, options=DISPLACY_ENT_OPTS
)

### Tag Export

In [13]:
from collections import OrderedDict
def _tag_record(doc, ent):
    """Return one export row describing entity ``ent`` found in ``doc``."""
    return OrderedDict([
        ('id', doc._.pmc_id),
        ('type', ent.label_),
        ('ent_id', ent._.id),
        ('ent_lbl', ent._.lbl),
        ('ent_prefid', ent._.prefid),
        ('start_chr', ent.start_char),
        ('end_chr', ent.end_char),
        ('start_wrd', ent.start),
        ('end_wrd', ent.end),
        ('text', ent.text),
    ])

# One row per entity across all parsed documents
df = pd.DataFrame([_tag_record(doc, ent) for doc in docs for ent in doc.ents])
df.head()

Unnamed: 0,id,type,ent_id,ent_lbl,ent_prefid,start_chr,end_chr,start_wrd,end_wrd,text
0,PMC5704053,PROTEIN,,,,15,18,2,3,CD4
1,PMC5704053,IMMUNE_CELL_TYPE,CT30BC86BDEF7B1410,Treg,CTB574584AD019ABB8,26,38,7,9,Regulatory T
2,PMC5704053,IMMUNE_CELL_TYPE,CT30BC86BDEF7B1410,Treg,CTB574584AD019ABB8,125,137,29,31,regulatory T
3,PMC5704053,IMMUNE_CELL_TYPE,CTB574584AD019ABB8,Treg,CTB574584AD019ABB8,145,149,33,34,Treg
4,PMC5704053,IMMUNE_CELL_TYPE,CTB574584AD019ABB8,Treg,CTB574584AD019ABB8,327,331,63,64,Treg


In [15]:
# Summarise column dtypes, row count and memory usage of the tag data frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4007030 entries, 0 to 4007029
Data columns (total 10 columns):
id            object
type          object
ent_id        object
ent_lbl       object
ent_prefid    object
start_chr     int64
end_chr       int64
start_wrd     int64
end_wrd       int64
text          object
dtypes: int64(4), object(6)
memory usage: 305.7+ MB


In [16]:
# Write the current data frame to 'tags.csv' in the corpus directory
# (without the row index) and display the output path
path = osp.join(corpus_dir, 'tags.csv')
df.to_csv(path, index=False)
path

'/Users/eczech/data/research/hammer/nlp/20190311-pubmed-tcell-relation/articles/corpus/corpus_01/tags.csv'

### Relation Export

Export relations to be used to generate gold labellings within Snorkel

In [18]:
from bratreader.repomodel import RepoModel
# Load the annotation collection found in collection_dir
repo_model = RepoModel(collection_dir)

In [19]:
# Number of documents loaded from the annotation collection
len(repo_model.documents)

89

In [24]:
# repo_model.documents.keys()

In [25]:
# doc = repo_model.documents['PMC2646571']
# annot = doc.annotations[1]

In [26]:
# annot.labels, annot.links, annot.realspan, annot.spans, annot.repr, annot.words

In [27]:
# doc.key

In [20]:
def get_relations(doc):
    """Collect the relations annotated in ``doc`` as a list of flat dicts.

    Each dict holds the document key, the relation type and, for both related
    entities, the mapped entity type, character span and text. The cell type
    entity is placed second (``e2``) whenever the link target is a cell type;
    otherwise the link target is placed first (``e1``).
    """
    # Map from entity type label used in annotation to label used within snorkel
    label_to_type = {
        'TF': ENT_TYP_TF,
        'CELL_TYPE': ENT_TYP_CT,
        'CYTOKINE': ENT_TYP_CK
    }

    def mapped_type(annot):
        # Annotations carry exactly one label (asserted below); map its name
        return label_to_type[list(annot.labels.keys())[0]]

    rows = []
    for target in doc.annotations:
        # Links only exist as INCOMING links, so the annotation holding them
        # is the target of the relation and the linked annotations are sources
        if not target.links:
            continue
        for rel_name, sources in target.links.items():
            for source in sources:
                assert len(source.labels) == len(target.labels) == 1

                # For consistency with snorkel convention, ensure that the second
                # value in relation entry is the cell type and all others are first
                if mapped_type(target) == ENT_TYP_CT:
                    first, second = source, target
                else:
                    first, second = target, source

                rows.append({
                    'id': doc.key,
                    'rel_typ': rel_name,
                    'e1_typ': mapped_type(first),
                    'e1_start_chr': first.realspan[0],
                    'e1_end_chr': first.realspan[1],
                    'e1_text': first.repr,
                    'e2_typ': mapped_type(second),
                    'e2_start_chr': second.realspan[0],
                    'e2_end_chr': second.realspan[1],
                    'e2_text': second.repr,
                })
    return rows
# Flatten the relations of every annotated document into one frame and drop exact duplicates
relation_rows = [
    rel
    for brat_doc in repo_model.documents.values()
    for rel in get_relations(brat_doc)
]
df = pd.DataFrame(relation_rows).drop_duplicates()
df.head()

Unnamed: 0,e1_end_chr,e1_start_chr,e1_text,e1_typ,e2_end_chr,e2_start_chr,e2_text,e2_typ,id,rel_typ
0,24,19,Gfi-1,TRANSCRIPTION_FACTOR,85,81,Th17,IMMUNE_CELL_TYPE,PMC2646571,Differentiation
1,44,39,TGF-β,CYTOKINE,85,81,Th17,IMMUNE_CELL_TYPE,PMC2646571,Induction
2,44,39,TGF-β,CYTOKINE,119,97,inducible regulatory T,IMMUNE_CELL_TYPE,PMC2646571,Induction
3,24,19,Gfi-1,TRANSCRIPTION_FACTOR,119,97,inducible regulatory T,IMMUNE_CELL_TYPE,PMC2646571,Differentiation
8,371,366,Gfi-1,TRANSCRIPTION_FACTOR,436,433,Th2,IMMUNE_CELL_TYPE,PMC2646571,Differentiation


In [21]:
# Count relations per combination of first entity type, second entity type and relation type
df.groupby(['e1_typ', 'e2_typ', 'rel_typ']).size()

e1_typ                e2_typ            rel_typ        
CYTOKINE              IMMUNE_CELL_TYPE  Induction          150
                                        Secretion          131
TRANSCRIPTION_FACTOR  IMMUNE_CELL_TYPE  Differentiation    119
dtype: int64

In [22]:
#df_exp[df_exp['id'] == 'PMC3304099']

In [23]:
# Write the current data frame to 'relations.csv' in the corpus directory
# (without the row index) and display the output path
path = osp.join(corpus_dir, 'relations.csv')
df.to_csv(path, index=False)
path

'/Users/eczech/data/research/hammer/nlp/20190311-pubmed-tcell-relation/articles/corpus/corpus_01/relations.csv'