In [1]:
import os
import tqdm
import os.path as osp
import pandas as pd
import glob
import spacy
%run env.py
%run src/lib.py
%run src/integration.py
%run src/supervision.py

fix_jupyter_spacy_config()

#corpus_dir = osp.join(DATA_DIR, 'articles', 'corpus', 'corpus_00')
#collection_dir = osp.join(REPO_DATA_DIR, 'brat', 'collection_02')

corpus_dir = osp.join(DATA_DIR, 'articles', 'corpus', 'corpus_01')
collection_dir = osp.join(REPO_DATA_DIR, 'brat', 'collection_02')

corpus_docs_path = osp.join(corpus_dir, 'docs')
corpus_links_dir = osp.join(corpus_dir, 'links')

### Link Docs

Link annotated documents (which should ALL be annotated when present) and unannotated docs in a single directory:

In [2]:
files = []
# Add unannotated text files and THEN annotated text files as links will overwrite
f = glob.glob(osp.join(corpus_docs_path, '*.txt'))
assert f, 'No unannotated text files found'
print('Found {} unnannotated text files'.format(len(f)))
files.extend(f)
f = glob.glob(osp.join(collection_dir, '*.txt'))
assert f, 'No annotated text files found'
print('Found {} annotated text files'.format(len(f)))
files.extend(f)
len(files)

Found 10000 unnannotated text files
Found 89 annotated text files


10089

In [3]:
# Clear existing linked files
for f in glob.glob(osp.join(corpus_links_dir, '*.txt')):
    if osp.islink(f):
        os.unlink(f)
!ls -q $corpus_links_dir/*.txt 2> /dev/null

In [4]:
# Add new links
corpus_files = []
for f in files:
    if not osp.exists(corpus_links_dir):
        os.makedirs(corpus_links_dir)
    path = osp.join(corpus_links_dir, osp.basename(f))
    # Overwrite despite initial clearing as docs from annotation
    # directory are preferred
    if osp.exists(path):
        os.unlink(path)
    os.symlink(f, path)
    corpus_files.append(path)
print('Linked {} files in {}'.format(len(corpus_files), corpus_links_dir))

Linked 10089 files in /Users/eczech/data/research/hammer/nlp/20190311-pubmed-tcell-relation/articles/corpus/corpus_01/links


### Define Pipeline

In [56]:
nlp = get_scispacy_pipeline()
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x155dffbe0>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x138297588>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x14504c408>)]

In [57]:
class EntityFilter(object):
    name = 'filter'
    
    def __init__(self, types):
        self.types = types
        
    def __call__(self, doc):
        if self.types is None:
            return doc
        doc.ents = [e for e in doc.ents if e.label_ in self.types]
        return doc
    
if nlp.has_pipe(EntityFilter.name):
    nlp.remove_pipe(EntityFilter.name)
#nlp.add_pipe(EntityFilter(['CELL_TYPE', 'CELL_LINE', 'PROTEIN']), last=True)
nlp.add_pipe(EntityFilter(['CELL_TYPE', 'CELL_LINE']), after='ner')

In [98]:
from spacy.tokens import Doc

def copy_ent(ent):
    return dict(
        type=ent.label_,
        meta=ent._.meta if hasattr(ent._, 'meta') else None,
        start_chr=ent.start_char,
        end_chr=ent.end_char,
        start_wrd=ent.start,
        end_wrd=ent.end,
        text=ent.text
    )

class EntityRelocator(object):
    
    def __init__(self, attr):
        self.attr = attr
        if not Doc.has_extension(attr):
            Doc.set_extension(attr, default=None)

    def __call__(self, doc):
        setattr(doc._, self.attr, [copy_ent(e) for e in doc.ents])
        doc.ents = []
        return doc

if nlp.has_pipe('reloc_jnlpba'):
    nlp.remove_pipe('reloc_jnlpba')
nlp.add_pipe(EntityRelocator('ents_jnlpba'), after=EntityFilter.name, name='reloc_jnlpba')

In [99]:
from spacy.pipeline import EntityRuler
from spacy.tokens import Span

if not Span.has_extension('meta'):
    Span.set_extension('meta', default=None)

class EntityRefiner(object):
    name = 'refiner'
    
    def __init__(self, nlp, df):
        self.ruler = EntityRuler(nlp, overwrite_ents=True)
        self.df = df.copy().set_index('id')
        patterns = []
        for rid, r in self.df.iterrows():
            # TODO: add match types into meta data frames
            tokens = nlp.tokenizer(r['sym']) # returns Doc
            pattern = [{'lower': t.lower_} for t in tokens]
                
            # Assign label as record id (e.g. "CKBF6003C60D23BA0D")
            patterns.append({'label': rid, 'pattern': pattern})
        self.ruler.add_patterns(patterns)
        
    def __call__(self, doc):
        doc = self.ruler(doc)
        ents = []
        for ent in doc.ents:
            if ent.label_ not in self.df.index:
                ents.append(ent)
                continue
            # Convert label of span from record id to entity type 
            # (e.g. CKBF6003C60D23BA0D -> CYTOKINE)
            r = self.df.loc[ent.label_]
            rid = ent.label_
            ent = Span(doc, ent.start, ent.end, label=r['type'])
            ent._.meta = dict(id=rid, prefid=r['prefid'], lbl=r['lbl'])
            ents.append(ent)
        doc.ents = ents
        return doc 

df = pd.concat([
    get_entity_meta_data(CYTOKINES).assign(type=ENT_TYP_CK),
    get_entity_meta_data(TRANSCRIPTION_FACTORS).assign(type=ENT_TYP_TF),
    get_entity_meta_data(CELL_TYPES).assign(type=ENT_TYP_CT)
], sort=True)
if nlp.has_pipe(EntityRefiner.name):
    nlp.remove_pipe(EntityRefiner.name)
nlp.add_pipe(EntityRefiner(nlp, df), after='reloc_jnlpba')

In [100]:
if nlp.has_pipe('reloc_lkp'):
    nlp.remove_pipe('reloc_lkp')
nlp.add_pipe(EntityRelocator('ents_lkp'), after=EntityRefiner.name, name='reloc_lkp')

In [101]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x155dffbe0>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x138297588>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x14504c408>),
 ('filter', <__main__.EntityFilter at 0x148c124e0>),
 ('reloc_jnlpba', <__main__.EntityRelocator at 0x14bbc0c88>),
 ('refiner', <__main__.EntityRefiner at 0x14bba1ba8>),
 ('reloc_lkp', <__main__.EntityRelocator at 0x156090be0>)]

In [102]:
# doc = nlp('Some T-helper-17 cells secreting IL2 and STAT3 where CD4 is expressed on surface of antigen-presenting DCs')

In [103]:
from collections import OrderedDict
from spacy.tokens import Doc
if not Doc.has_extension('pmc_id'):
    Doc.set_extension('pmc_id', default=None)
    
def to_df(doc):
    df = []
    ents = [(e, 'jnlpba') for e in doc._.ents_jnlpba] + [(e, 'lkp') for e in doc._.ents_lkp]
    for ent, src in ents:
        meta = ent['meta'] or {}
        df.append({
            **dict(
                id=doc._.pmc_id, 
                ent_id=meta.get('id'),
                ent_lbl=meta.get('lbl'),
                ent_prefid=meta.get('prefid'),
                ent_src=src
            ),
            **{k:v for k, v in ent.items() if k != 'meta'}
        })
    return pd.DataFrame(df)

In [105]:
# #doc = nlp('The Vγ4 T lymphocyte subset predominated among the IL-17+ cell populations and the TCM in the lungs of CLP mice (unlike Vγ1 and αβ T lymphocytes) and was strongly biased toward IL-17 rather than toward IFN-γ production')
# doc = nlp(u'We found this occurred through the activation of tumor-infiltrating, exhausted CD8+ T cells (CD8+ TILs) that had lost most of the original functions, such as the ability to produce multiple cytokines and cytotoxicity. CD8+ TILs of mice exposed to Met begin to rapidly produce multiple cytokines, including interleukin-2 (IL-2), tumor necrosis factor alpha (TNFα), and interferon gamma (IFNγ), and differentiate into effector memory T cells (TEM); otherwise, central memory T cells (TCM) are dominant in the tumor microenvironment')
# to_df(doc)

In [106]:
#df[(df['ent_src'] == 'jnlpba') & (df['ent_id'].notnull())].head()

In [35]:
# path = '/Users/eczech/data/research/hammer/nlp/20190311-pubmed-tcell-relation/articles/corpus/corpus_01/links/PMC5704053.txt'
# doc = None
# with open(path, 'r') as fd:
#     text = fd.read()
#     doc = nlp(text)
#     #doc._.pmc_id = osp.basename(path).replace('.txt', '')

In [9]:
#nlp.pipeline[-1][1].ck_tagger.df.loc['TGF-β']

In [10]:
# with open(osp.join(output_corpus_dir, 'PMC3925027.txt'), 'r') as fd:
#     doc = fd.read()
#doc = nlp(doc)
#spacy.displacy.render(doc, style='ent', jupyter=True)

### Run Pipeline

In [15]:
def text_stream(files):
    for f in tqdm.tqdm_notebook(files):
        doc_id = osp.basename(f).replace('.txt', '')
        with open(f, 'r') as fd:
            text = fd.read()
            # Clip to max length to avoid error like
            # ValueError: [E088] Text of length 1290071 exceeds maximum of 1000000
            if len(text) > nlp.max_length:
                text = text[:nlp.max_length]
            yield text, {'doc_id': doc_id}
    
def doc_stream(files):
    for doc, ctx in nlp.pipe(text_stream(files), as_tuples=True, n_threads=3, batch_size=10):
        doc._.pmc_id = ctx['doc_id']
        yield doc

def frame_stream(files):
    for doc in doc_stream(files):
        yield to_df(doc)

In [19]:
pd.set_options('display.max_info_rows', 10000000)
df = pd.concat(list(frame_stream(corpus_files)))
df.info()

HBox(children=(IntProgress(value=0, max=10089), HTML(value='')))


<class 'pandas.core.frame.DataFrame'>
Int64Index: 2311011 entries, 0 to 170
Data columns (total 11 columns):
id            object
type          object
ent_id        object
ent_lbl       object
ent_prefid    object
ent_src       object
start_chr     int64
end_chr       int64
start_wrd     int64
end_wrd       int64
text          object
dtypes: int64(4), object(7)
memory usage: 211.6+ MB


In [25]:
# df[(df['ent_src'] == 'jnlpba') & (df['ent_id'].notnull())].head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2311011 entries, 0 to 170
Data columns (total 11 columns):
id            2311011 non-null object
type          2311011 non-null object
ent_id        987781 non-null object
ent_lbl       987781 non-null object
ent_prefid    987781 non-null object
ent_src       2311011 non-null object
start_chr     2311011 non-null int64
end_chr       2311011 non-null int64
start_wrd     2311011 non-null int64
end_wrd       2311011 non-null int64
text          2311011 non-null object
dtypes: int64(4), object(7)
memory usage: 211.6+ MB


In [74]:
# files = [f for f in corpus_files if 'PMC5591438' in f]
# docs = [d for d in doc_stream(files)]

In [73]:
# # spacy.displacy.render(docs[3], style='ent', jupyter=True)
# spacy.displacy.render(
#     [doc for doc in docs], 
#     style='ent', jupyter=True, options=DISPLACY_ENT_OPTS
# )

### Tag Export

In [27]:
pd.set_option('display.max_info_rows', 10000000)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2311011 entries, 0 to 170
Data columns (total 11 columns):
id            2311011 non-null object
type          2311011 non-null object
ent_id        987781 non-null object
ent_lbl       987781 non-null object
ent_prefid    987781 non-null object
ent_src       2311011 non-null object
start_chr     2311011 non-null int64
end_chr       2311011 non-null int64
start_wrd     2311011 non-null int64
end_wrd       2311011 non-null int64
text          2311011 non-null object
dtypes: int64(4), object(7)
memory usage: 211.6+ MB


In [24]:
path = osp.join(corpus_dir, 'tags.csv')
df.to_csv(path, index=False)
path

'/Users/eczech/data/research/hammer/nlp/20190311-pubmed-tcell-relation/articles/corpus/corpus_01/tags-union.csv'

### Relation Export

Export relations to be used to generate gold labellings within Snorkel

In [18]:
from bratreader.repomodel import RepoModel
repo_model = RepoModel(collection_dir)

In [19]:
len(repo_model.documents)

89

In [24]:
# repo_model.documents.keys()

In [25]:
# doc = repo_model.documents['PMC2646571']
# annot = doc.annotations[1]

In [26]:
# annot.labels, annot.links, annot.realspan, annot.spans, annot.repr, annot.words

In [27]:
# doc.key

In [20]:
def get_relations(doc):
    relations = []
    
    # Map from entity type label used in annotation to label used within snorkel
    ent_typ_map = {
        'TF': ENT_TYP_TF,
        'CELL_TYPE': ENT_TYP_CT,
        'CYTOKINE': ENT_TYP_CK
    }
    
    for a2 in doc.annotations:
        # Links only exist as INCOMING links meaning that a2 is the target of relations
        # and a1 will be the source
        if a2.links:
            for relation_type, annots in a2.links.items():
                for a1 in annots:
                    assert len(a1.labels) == len(a2.labels) == 1
                    a2_typ = ent_typ_map[list(a2.labels.keys())[0]]
                    
                    # For consistency with snorkel convention, ensure that the second
                    # value in relation entry is the cell type and all others are first
                    e1, e2 = a1, a2
                    if a2_typ != ENT_TYP_CT:
                        e1, e2 = a2, a1
                    
                    relations.append(dict(
                        id=doc.key,
                        rel_typ=relation_type, 
                        e1_typ=ent_typ_map[list(e1.labels.keys())[0]], 
                        e1_start_chr=e1.realspan[0], 
                        e1_end_chr=e1.realspan[1], 
                        e1_text=e1.repr,
                        e2_typ=ent_typ_map[list(e2.labels.keys())[0]], 
                        e2_start_chr=e2.realspan[0], 
                        e2_end_chr=e2.realspan[1], 
                        e2_text=e2.repr
                    ))
    return relations
df = pd.DataFrame([r for k, doc in repo_model.documents.items() for r in get_relations(doc)])
df = df.drop_duplicates()
df.head()

Unnamed: 0,e1_end_chr,e1_start_chr,e1_text,e1_typ,e2_end_chr,e2_start_chr,e2_text,e2_typ,id,rel_typ
0,24,19,Gfi-1,TRANSCRIPTION_FACTOR,85,81,Th17,IMMUNE_CELL_TYPE,PMC2646571,Differentiation
1,44,39,TGF-β,CYTOKINE,85,81,Th17,IMMUNE_CELL_TYPE,PMC2646571,Induction
2,44,39,TGF-β,CYTOKINE,119,97,inducible regulatory T,IMMUNE_CELL_TYPE,PMC2646571,Induction
3,24,19,Gfi-1,TRANSCRIPTION_FACTOR,119,97,inducible regulatory T,IMMUNE_CELL_TYPE,PMC2646571,Differentiation
8,371,366,Gfi-1,TRANSCRIPTION_FACTOR,436,433,Th2,IMMUNE_CELL_TYPE,PMC2646571,Differentiation


In [21]:
df.groupby(['e1_typ', 'e2_typ', 'rel_typ']).size()

e1_typ                e2_typ            rel_typ        
CYTOKINE              IMMUNE_CELL_TYPE  Induction          150
                                        Secretion          131
TRANSCRIPTION_FACTOR  IMMUNE_CELL_TYPE  Differentiation    119
dtype: int64

In [22]:
#df_exp[df_exp['id'] == 'PMC3304099']

# Export

In [23]:
path = osp.join(corpus_dir, 'relations.csv')
df.to_csv(path, index=False)
path

'/Users/eczech/data/research/hammer/nlp/20190311-pubmed-tcell-relation/articles/corpus/corpus_01/relations.csv'

<hr>

### Tag EDA

In [17]:
import pandas as pd
import os.path as osp
df = pd.read_csv(osp.join(corpus_dir, 'tags.csv'))
df.head()

Unnamed: 0,id,type,ent_id,ent_lbl,ent_prefid,start_chr,end_chr,start_wrd,end_wrd,text
0,PMC5704053,PROTEIN,,,,15,18,2,3,CD4
1,PMC5704053,IMMUNE_CELL_TYPE,CT30BC86BDEF7B1410,Treg,CTB574584AD019ABB8,26,38,7,9,Regulatory T
2,PMC5704053,IMMUNE_CELL_TYPE,CT30BC86BDEF7B1410,Treg,CTB574584AD019ABB8,125,137,29,31,regulatory T
3,PMC5704053,IMMUNE_CELL_TYPE,CTB574584AD019ABB8,Treg,CTB574584AD019ABB8,145,149,33,34,Treg
4,PMC5704053,IMMUNE_CELL_TYPE,CTB574584AD019ABB8,Treg,CTB574584AD019ABB8,327,331,63,64,Treg


In [18]:
df['type'].value_counts()

PROTEIN                 1606895
CELL_TYPE                884900
CYTOKINE                 484597
IMMUNE_CELL_TYPE         314554
CELL_LINE                260458
TRANSCRIPTION_FACTOR     245369
DNA                      188429
RNA                       21828
Name: type, dtype: int64

In [8]:
(
    df
    .pipe(lambda df: df[df['type'] == ENT_TYP_TF])
    .groupby('ent_lbl').size()
    .sort_values(ascending=False)
    .head(35)
)

ent_lbl
FOXP3     23967
IRF6      16963
TCF23     11217
STAT3      9437
TBX21      9248
MSC        9005
RORC       7044
NR1H4      5626
NR1I3      5278
AHR        5193
GATA3      4762
STAT1      3398
BCL6       2709
PRDM6      2656
MYC        2523
IRF4       2504
TP53       2418
EOMES      2390
PRDM1      2241
MIXL1      2172
SPI1       2016
STAT5A     1879
STAT6      1846
FOXO1      1824
NFATC1     1774
STAT5B     1767
CHAMP1     1760
RUNX1      1739
IRF8       1721
VDR        1706
RELA       1659
STAT4      1604
MAF        1526
JUN        1520
TWIST1     1466
dtype: int64

In [18]:
(
    df
    .pipe(lambda df: df[df['type'] == ENT_TYP_TF])
    .pipe(lambda df: df[df['ent_lbl'] == 'RELA'])
    .assign(text=lambda df: df['text'].str.upper())
    .groupby(['id', 'ent_lbl', 'text'])
    .size().sort_values(ascending=False)   
    .head(15)
)

id          ent_lbl  text
PMC5653749  RELA     P65     95
PMC4734616  RELA     RELA    41
PMC5425596  RELA     P65     40
PMC3959705  RELA     P65     31
PMC2196335  RELA     RELA    25
PMC3201209  RELA     P65     25
PMC4698632  RELA     P65     23
PMC4920892  RELA     RELA    22
PMC5758647  RELA     P65     21
PMC3024268  RELA     P65     21
PMC3911989  RELA     P65     20
PMC4450757  RELA     P65     18
PMC4821646  RELA     RELA    18
PMC4506511  RELA     P65     16
PMC6369217  RELA     P65     16
dtype: int64

In [21]:
#!cat $corpus_dir/docs/PMC4734616.txt