In [9]:
import os
import tqdm
import os.path as osp
import pandas as pd
import spacy
%run env.py
%run src/lib.py
%run src/integration.py
# Helper brought into scope by the %run scripts above; presumably adjusts spaCy
# settings for notebook use -- TODO confirm in src/lib.py / src/integration.py.
fix_jupyter_spacy_config()
# Root folder of the corpus processed by this notebook. DATA_DIR is not defined
# here (presumably set by env.py -- verify).
corpus_dir = osp.join(DATA_DIR, 'articles', 'corpus', 'corpus_00')
# Folder that is later listed for the corpus' .txt documents.
corpus_docs_path = osp.join(corpus_dir, 'docs')

In [2]:
# Build the base pipeline. get_scispacy_pipeline is not defined in this notebook
# (presumably provided by one of the %run scripts in the first cell -- verify).
nlp = get_scispacy_pipeline()
# Show the pipeline components as the cell output.
nlp.pipeline

[('tagger', <spacy.pipeline.Tagger at 0x112864588>),
 ('parser', <spacy.pipeline.DependencyParser at 0x122626780>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x12d4d20f8>)]

In [3]:
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

# Register the custom Span attributes that carry matched-entity metadata.
# An attribute that is already registered is left untouched.
for _attr_name in ('ent_typ_id', 'ent_typ_lbl'):
    if Span.has_extension(_attr_name):
        continue
    Span.set_extension(_attr_name, default=None)
del _attr_name  # do not leave the loop variable behind in the notebook namespace

class EntityMatcher(object):
    """Add phrase-matched spans to a document's entities under a single label.

    Args:
        nlp: Pipeline object; its ``make_doc`` tokenizes each term into a
            pattern and its ``vocab`` backs the ``PhraseMatcher``.
        terms: Iterable of strings to match.
        label: Key under which all patterns are registered; the id the matcher
            reports for a hit is used as the label of the created spans.
    """

    def __init__(self, nlp, terms, label):
        patterns = [nlp.make_doc(text) for text in terms]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add(label, None, *patterns)

    def __call__(self, doc):
        """Append a span for every phrase match to ``doc.ents`` and return ``doc``.

        Entities already on the document are kept. Where spans overlap (nested
        terms, or tokens an earlier component already claimed) only one span
        survives, because a token can belong to at most one entity.
        """
        spans = [
            Span(doc, start, end, label=match_id)
            for match_id, start, end in self.matcher(doc)
        ]
        doc.ents = self._drop_overlaps(list(doc.ents) + spans)
        return doc

    @staticmethod
    def _drop_overlaps(spans):
        """Return a non-overlapping subset of ``spans`` ordered by start token.

        Longer spans win; ties go to the earlier start and then to the span
        listed first (the sort is stable, so pre-existing entities beat new
        matches covering the same tokens).
        """
        ranked = sorted(spans, key=lambda s: (s.start - s.end, s.start))
        claimed = set()
        kept = []
        for span in ranked:
            tokens = range(span.start, span.end)
            if any(i in claimed for i in tokens):
                continue
            claimed.update(tokens)
            kept.append(span)
        return sorted(kept, key=lambda s: s.start)
    
def _get_matcher_terms(values):
    return [v for s in values for v in [s.lower(), s.upper(), s.title(), s]]

class EntityTagger(object):
    """Match the symbols of one entity type and attach metadata to each match.

    Args:
        nlp: Pipeline object handed to ``EntityMatcher``; ``nlp.vocab.strings``
            is used to resolve ``label`` to the value compared with ``ent.label``.
        entity_type: Key passed to ``get_entity_meta_data``; the returned frame
            must provide ``sym``, ``id`` and ``lbl`` columns.
        label: Entity label assigned to matches of this type.
    """

    def __init__(self, nlp, entity_type, label):
        df = get_entity_meta_data(entity_type)
        self.matcher = EntityMatcher(nlp, df['sym'].unique(), label)
        self.label = nlp.vocab.strings[label]
        self.entity_type = entity_type
        # Keep one row per symbol (the first). With a repeated symbol in the
        # index, ``.loc[sym]`` returns a DataFrame and the id/lbl values stored
        # on the entity would be whole Series instead of scalars.
        self.df = df.drop_duplicates(subset='sym').set_index('sym')

    def __call__(self, doc):
        """Run the matcher, then set ``ent_typ_id``/``ent_typ_lbl`` on own-label entities.

        Entities with a different label are left alone. Raises ``KeyError`` if
        an entity's text is not a known symbol.
        """
        doc = self.matcher(doc)
        for ent in doc.ents:
            if ent.label != self.label:
                continue
            r = self.df.loc[ent.text]
            ent._.ent_typ_id = r['id']
            ent._.ent_typ_lbl = r['lbl']
        return doc
    
class EntityRefiner(object):
    """Pipeline component that replaces a document's entities with tagged ones.

    Any entities already on the document are discarded, then the cytokine,
    transcription-factor and cell-type taggers are applied in that order.
    """
    name = 'refiner'

    def __init__(self, nlp):
        self.ck_tagger = EntityTagger(nlp, 'cytokines', 'CYTOKINE')
        self.tf_tagger = EntityTagger(nlp, 'transcription_factors', 'TF')
        self.ct_tagger = EntityTagger(nlp, 'cell_types', 'CELL_TYPE')

    def __call__(self, doc):
        """Clear ``doc.ents``, run the three taggers in sequence and return the result."""
        doc.ents = []
        for tagger in (self.ck_tagger, self.tf_tagger, self.ct_tagger):
            doc = tagger(doc)
        return doc

# Drop a previously registered refiner first so that re-running this cell
# replaces the component instead of adding it a second time.
if nlp.has_pipe(EntityRefiner.name):
    nlp.remove_pipe(EntityRefiner.name)
# Append the refiner as the final pipeline step.
nlp.add_pipe(EntityRefiner(nlp), last=True)

In [4]:
#nlp.pipeline[-1][1].ck_tagger.df.loc['TGF-β']

In [5]:
# with open(osp.join(output_corpus_dir, 'PMC3925027.txt'), 'r') as fd:
#     doc = fd.read()
#doc = nlp(doc)
#spacy.displacy.render(doc, style='ent', jupyter=True)

In [6]:
# for t in doc:
#     if 'IL-10' in t.text:
#         print(t)

In [7]:
from spacy.tokens import Doc
# Register a custom Doc attribute for the source article id (set in get_docs);
# skipped when the attribute is already registered.
if not Doc.has_extension('pmc_id'):
    Doc.set_extension('pmc_id', default=None)
    
def get_docs():
    """Lazily run ``nlp`` over every ``.txt`` file in ``corpus_docs_path``.

    Files are visited in sorted path order so repeated runs produce the same
    sequence. Each yielded object is the result of ``nlp(text)`` with
    ``_.pmc_id`` set to the file name minus its extension. Progress is
    reported through ``tqdm``.
    """
    paths = sorted(
        osp.join(corpus_docs_path, name)
        for name in os.listdir(corpus_docs_path)
        if name.endswith('.txt')
    )
    for path in tqdm.tqdm(paths):
        # Decode explicitly as UTF-8: the articles contain non-ASCII characters
        # and the platform default encoding is not the same everywhere.
        with open(path, 'r', encoding='utf-8') as fd:
            text = fd.read()
        # The file is closed before the (slow) pipeline call.
        doc = nlp(text)
        # splitext removes only the trailing extension, unlike str.replace,
        # which would also strip '.txt' occurring elsewhere in the name.
        doc._.pmc_id = osp.splitext(osp.basename(path))[0]
        yield doc
# Run the full corpus through the pipeline and keep every parsed document in memory.
docs = list(get_docs())

100%|██████████| 500/500 [05:41<00:00,  1.46it/s]


In [11]:
from collections import OrderedDict
def _entity_record(doc, ent):
    """Flatten one entity of ``doc`` into an ordered record for export."""
    record = OrderedDict()
    record['id'] = doc._.pmc_id
    record['type'] = ent.label_
    record['ent_typ_id'] = ent._.ent_typ_id
    record['ent_typ_lbl'] = ent._.ent_typ_lbl
    record['start_chr'] = ent.start_char
    record['end_chr'] = ent.end_char
    record['start_wrd'] = ent.start
    record['end_wrd'] = ent.end
    record['text'] = ent.text
    return record

# One row per entity, documents in corpus order and entities in document order.
df_exp = pd.DataFrame([
    _entity_record(doc, ent)
    for doc in docs
    for ent in doc.ents
])
df_exp.head()

Unnamed: 0,id,type,ent_typ_id,ent_typ_lbl,start_chr,end_chr,start_wrd,end_wrd,text
0,PMC5743442,CELL_TYPE,4,Th17,65,69,8,9,Th17
1,PMC5743442,CELL_TYPE,4,Th17,87,91,11,12,Th17
2,PMC5743442,CYTOKINE,81,TGF-β1,174,179,25,26,TGF-β
3,PMC5743442,CELL_TYPE,4,Th17,199,203,29,30,Th17
4,PMC5743442,CYTOKINE,81,TGF-β1,275,280,42,43,TGF-β


In [12]:
# Write the entity table into the corpus folder; the DataFrame index is not exported.
path = osp.join(corpus_dir, 'tags.csv')
df_exp.to_csv(path, index=False)
# Echo the destination so it appears in the cell output.
path

'/Users/eczech/data/research/hammer/nlp/20190311-pubmed-tcell-relation/articles/corpus/corpus_00/tags.csv'