In [100]:
import spacy
import pandas as pd
import json
import os
import re
from tqdm.notebook import tqdm
from rake_nltk import Rake
from spacy.matcher import Matcher 
import en_core_web_lg

# Load the large English spaCy pipeline once; the parsing functions in this
# notebook all go through this module-level `nlp` object.
nlp = spacy.load('en_core_web_lg')


In [101]:
# Root folders that are walked for input files.
directories = [
    './data/biorxiv_medrxiv/',
    './data/comm_use_subset/',
    './data/custom_license/',
    './data/noncomm_use_subset/',
]

In [102]:
# For every folder (at any depth below `directories`) that holds at least one
# JSON file, record the folder path together with its JSON file names.
files = []
for directory in directories:
    for dirpath, _dirnames, filenames in os.walk(directory):
        # Test the extension rather than a substring: `'.json' in name` would
        # also accept names that merely contain ".json" (e.g. "x.json.bak").
        # Sorting makes the processing order reproducible between runs.
        json_names = sorted(name for name in filenames if name.endswith('.json'))
        if json_names:
            files.append({'dirpath': dirpath, 'filenames': json_names})
        

In [103]:
def clean(txt):
    """Apply a fixed sequence of regex substitutions to `txt` and return the result.

    In order: newlines are removed, runs of spaces are collapsed to one space,
    commas are removed, parenthesised groups that contain no nested parentheses
    are removed, and spans running from an http(s) URL through a following
    whitespace character and the word "doi" are removed.
    """
    substitutions = (
        (r'\n', ''),
        (' +', ' '),
        (',', ''),
        (r'\([^()]*\)', ''),
        (r'https?:\S+\sdoi', ''),
    )
    for pattern, replacement in substitutions:
        txt = re.sub(pattern, replacement, txt)
    return txt

In [104]:
def pub_extract(data):
    """Pull the id, title, cleaned abstract and cleaned body text out of one record.

    data (dict): Indexed as data['abstract'] and data['body_text'] (iterables of
        mappings with a 'text' entry), data['paper_id'] and
        data['metadata']['title'].
    RETURNS (tuple): (paper id, title, cleaned abstract, cleaned body text).
    """
    def joined(sections):
        # Each section text is prefixed with two spaces before the whole
        # string is passed through clean().
        return clean(''.join('  ' + section['text'] for section in sections))

    abstract = joined(data['abstract'])
    text = joined(data['body_text'])
    return data['paper_id'], data['metadata']['title'], abstract, text



def phrases_extract(text):
    """Return the RAKE keyword phrases of `text`, ranked highest to lowest."""
    extractor = Rake()
    extractor.extract_keywords_from_text(text)
    return extractor.get_ranked_phrases()

In [105]:
def get_entities(sent):
    """Extract a rough [subject, object] pair from one sentence.

    The sentence is parsed with the module-level `nlp` pipeline and its tokens
    are scanned left to right. Compound and modifier tokens seen before a
    subject/object token are prepended to it.

    sent (str): The sentence text.
    RETURNS (list): [subject, object]. An element is '' when no token with a
        matching dependency label was seen; when several match, the last wins.
    """
    ent1 = ""
    ent2 = ""

    prv_tok_dep = ""    # dependency tag of the previous non-punctuation token
    prv_tok_text = ""   # text of the previous non-punctuation token

    prefix = ""
    modifier = ""

    for tok in nlp(sent):
        # Punctuation is skipped entirely, so it neither triggers a check nor
        # becomes the "previous token" used for chaining compounds below.
        if tok.dep_ == "punct":
            continue

        # Compound word: remember it, chained onto a directly preceding compound.
        if tok.dep_ == "compound":
            prefix = tok.text
            if prv_tok_dep == "compound":
                prefix = prv_tok_text + " " + tok.text

        # Modifier (dependency label ending in "mod"): same treatment.
        if tok.dep_.endswith("mod"):
            modifier = tok.text
            if prv_tok_dep == "compound":
                modifier = prv_tok_text + " " + tok.text

        # Subject. A substring test is used on purpose: `str.find(...) == True`
        # only held when the match started at index 1 ("nsubj"), so a bare
        # "subj" label was silently ignored.
        if "subj" in tok.dep_:
            # Join only the non-empty parts so no double spaces are produced.
            ent1 = " ".join(part for part in (modifier, prefix, tok.text) if part)
            prefix = ""
            modifier = ""

        # Object: same matching rule as for the subject. Prefix and modifier
        # are not reset here.
        if "obj" in tok.dep_:
            ent2 = " ".join(part for part in (modifier, prefix, tok.text) if part)

        prv_tok_dep = tok.dep_
        prv_tok_text = tok.text

    return [ent1.strip(), ent2.strip()]

In [106]:
def filter_spans(spans):
    """Filter a sequence of spans and remove duplicates or overlaps. Useful for
    creating named entities (where one token can only be part of one entity) or
    when merging spans with `Retokenizer.merge`. When spans overlap, the (first)
    longest span is preferred over shorter spans.
    spans (iterable): The spans to filter; each needs integer `start` and `end`
        attributes, with `end` exclusive.
    RETURNS (list): The filtered spans, ordered by `start`.
    """
    def sort_key(span):
        # Longest first; the negated start makes the EARLIEST of equally long
        # spans come first once the list is sorted in reverse.
        return (span.end - span.start, -span.start)

    result = []
    seen_tokens = set()
    for span in sorted(spans, key=sort_key, reverse=True):
        # Check for end - 1 here because boundaries are inclusive
        if span.start not in seen_tokens and span.end - 1 not in seen_tokens:
            result.append(span)
            # Only accepted spans claim their tokens. Marking the tokens of a
            # rejected span would wrongly block later spans that overlap the
            # rejected one but none of the accepted ones.
            seen_tokens.update(range(span.start, span.end))
    return sorted(result, key=lambda span: span.start)

In [107]:
def refine_ent(ent, sent):
    """Normalise one subject/object node and report its type.

    ent: Token-like object exposing `ent_type_`, `i` and `nbor(offset)`.
    sent: The parsed sentence `ent` belongs to; only `len(sent)` is used.
    RETURNS (tuple): (refined entity, entity type).
        - No entity type: type becomes 'NOUN_CHUNK' and the entity becomes a
          string with unwanted parts of speech and stop words removed.
        - Single-word 'NOMINAL'/'CARDINAL'/'ORDINAL': the entity becomes a
          string that extends to the right up to, but excluding, the next verb
          or punctuation token, or up to the end of the sentence.
        - Anything else: `ent` is returned unchanged.
    """
    unwanted_tokens = (
        'PRON',  # pronouns
        'PART',  # particle
        'DET',  # determiner
        'SCONJ',  # subordinating conjunction
        'PUNCT',  # punctuation
        'SYM',  # symbol
        'X',  # other
        )
    ent_type = ent.ent_type_  # get entity type
    if ent_type == '':
        ent_type = 'NOUN_CHUNK'
        ent = ' '.join(str(t.text) for t in nlp(str(ent))
                       if t.pos_ not in unwanted_tokens and not t.is_stop)
    elif ent_type in ('NOMINAL', 'CARDINAL', 'ORDINAL') and ' ' not in str(ent):
        words = []
        # Offset 0 is the entity itself, so the phrase always starts with it.
        for offset in range(len(sent) - ent.i):
            neighbour = ent.nbor(offset)
            if neighbour.pos_ in ('VERB', 'PUNCT'):
                break
            words.append(str(neighbour))
        # Assign after the loop so the phrase is also kept when the sentence
        # ends before any verb/punctuation is met; it used to be discarded in
        # that case because the assignment lived only in the `break` branch.
        ent = ' '.join(words).strip()
    return ent, ent_type

In [114]:
def entity_pairs(text, coref=True):
    """Extract subject / relation / object triples from free text.

    The text is split into sentences and each sentence is re-parsed on its own.
    Its named entities and noun chunks are merged into single tokens. Only
    sentences with exactly one ('obj'/'dobj') object and exactly one
    ('subj'/'nsubj') subject are considered. The relation is the ROOT ancestor
    of the object, extended by a directly following adposition or particle, or
    'unknown' when the object has no ROOT ancestor.

    text (str): Raw text.
    coref (bool): If True, parse the coreference-resolved version of the text.
    RETURNS (pandas.DataFrame): Columns 'subject', 'relation', 'object',
        'subject_type', 'object_type'. Rows with any empty field are dropped.
    """
    text = re.sub(r'\n+', '.', text)  # replace multiple newlines with period
    text = re.sub(r'\[\d+\]', ' ', text)  # remove reference numbers
    text = nlp(text)
    if coref:
        # NOTE(review): `_.coref_resolved` has to be registered by a
        # coreference component; none is added to `nlp` in the visible code,
        # so confirm one is installed before calling with coref=True.
        text = nlp(text._.coref_resolved)  # resolve coreference clusters
    # `Span.text` replaces the legacy `Span.string` (no longer available in
    # spaCy v3); after strip() both yield the same sentence text.
    sentences = [sent.text.strip() for sent in text.sents]
    ent_pairs = []
    for sent in sentences:
        sent = nlp(sent)
        # collect nodes: entities and noun chunks, overlaps removed
        spans = filter_spans(list(sent.ents) + list(sent.noun_chunks))
        with sent.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)
        dep = [token.dep_ for token in sent]
        if dep.count('obj') + dep.count('dobj') != 1 \
                or dep.count('subj') + dep.count('nsubj') != 1:
            continue
        try:
            for token in sent:
                if token.dep_ not in ('obj', 'dobj'):  # identify object nodes
                    continue
                subjects = [w for w in token.head.lefts
                            if w.dep_ in ('subj', 'nsubj')]  # identify subject nodes
                if not subjects:
                    continue
                # identify relationship by root dependency
                roots = [w for w in token.ancestors if w.dep_ == 'ROOT']
                if roots:
                    relation = roots[0]
                    # add adposition or particle to relationship; the bounds
                    # check avoids the IndexError nbor(1) raises when the
                    # root is the last token of the sentence
                    if relation.i + 1 < len(sent) \
                            and relation.nbor(1).pos_ in ('ADP', 'PART'):
                        relation = ' '.join((str(relation), str(relation.nbor(1))))
                else:
                    relation = 'unknown'
                subject, subject_type = refine_ent(subjects[0], sent)
                obj, object_type = refine_ent(token, sent)
                ent_pairs.append([str(subject), str(relation), str(obj),
                                  str(subject_type), str(object_type)])
        except Exception:
            # Best effort: a sentence that fails is reported and skipped.
            # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt
            # working during long runs.
            print('error')
    filtered_ent_pairs = [sublist for sublist in ent_pairs
                          if not any(str(x) == '' for x in sublist)]
    pairs = pd.DataFrame(filtered_ent_pairs, columns=['subject',
                         'relation', 'object', 'subject_type',
                         'object_type'])
    return pairs

In [115]:
def get_relation(sent):
    """Return the text of the sentence's ROOT token plus optional followers.

    The pattern matches the ROOT token, optionally followed (in this order) by
    a 'prep' token, an 'agent' token and an adjective. The text of the last
    match reported by the matcher is returned.

    sent (str): The sentence text.
    RETURNS (str): The matched span text, or '' when nothing matched (for
        example for an empty sentence).
    """
    doc = nlp(sent)

    # Matcher class object
    matcher = Matcher(nlp.vocab)

    # define the pattern
    pattern = [{'DEP': 'ROOT'},
               {'DEP': 'prep', 'OP': "?"},
               {'DEP': 'agent', 'OP': "?"},
               {'POS': 'ADJ', 'OP': "?"}]

    # NOTE(review): (name, on_match, pattern) looks like the spaCy v2 call
    # signature - confirm against the installed spaCy version.
    matcher.add("matching_1", None, pattern)

    matches = matcher(doc)
    if not matches:
        # Indexing the last match of an empty list would raise IndexError.
        return ''

    _match_id, start, end = matches[-1]
    return doc[start:end].text

In [116]:
def prepare_df(text_list):
    """Build a subject / relation / object table with at most one row per sentence.

    The text is split into sentences; each sentence is cleaned, then
    get_entities() supplies the subject and object and get_relation() the
    relation. Sentences where any of the three is missing, or where extraction
    raises, are skipped.

    text_list (str): The text to process (it is passed to `nlp` as one string).
    RETURNS (pandas.DataFrame): Columns 'subject', 'relation', 'object'.
    """
    doc = nlp(text_list)
    records = []
    for sent in doc.sents:
        sent = clean(str(sent))
        try:
            # get_entities() returns exactly [subject, object]. The previous
            # call to entity_pairs() returned a 5-column DataFrame, whose
            # unpacking into two names always raised and, hidden by a bare
            # except, left the result permanently empty.
            sub, obj = get_entities(sent)
            relation = get_relation(sent)
        except Exception:
            # Best effort: skip sentences that cannot be processed.
            continue

        if relation and sub and obj:
            records.append({'subject': sub, 'relation': relation, 'object': obj})

    # Build the frame once instead of appending row by row.
    df = pd.DataFrame(records, columns=['subject', 'relation', 'object'])
    # keep=False (unchanged from the original) drops EVERY member of a group
    # of identical rows, not just the repeats.
    return df.drop_duplicates(keep=False)

In [117]:
# For every folder found by the scan: parse each paper, extract key phrases and
# abstract triples, and write one CSV and one pickle of results into that folder.

# NOTE(review): the CSV is written without a header, so column order is part of
# the file format. Alphabetical order is presumably what the former row-wise
# DataFrame.append produced - verify against existing consumers of the CSV.
OUTPUT_COLUMNS = ['Abs Tri', 'Abstract', 'ID', 'Key Phrases', 'Repo', 'Text', 'Title']

for file in tqdm(files):
    directory = file['dirpath']
    records = []
    for filename in tqdm(file['filenames']):
        with open(os.path.join(directory, filename), "r", encoding="utf-8") as read_file:
            data = json.load(read_file)

        ID, title, abstract, text = pub_extract(data)
        key_phrases = phrases_extract(abstract)[0:5]  # five top-ranked phrases
        # Triples are extracted from the abstract only; the body text is
        # stored but not parsed.
        abstract_tripples = entity_pairs(abstract, False).to_json()
        records.append({'ID': ID,
                        # './data/<repo>/...' -> '<repo>'. Index 1 was always
                        # the literal 'data' folder, not the repository name.
                        'Repo': directory.split('/')[2],
                        'Title': title,
                        'Abstract': abstract,
                        'Text': text,
                        'Key Phrases': key_phrases,
                        'Abs Tri': abstract_tripples})

    # Build the frame once per folder instead of appending row by row.
    df = pd.DataFrame(records, columns=OUTPUT_COLUMNS)
    # os.path.join places the outputs inside `directory` even when the walked
    # path has no trailing slash (nested folders); plain concatenation glued
    # the file name onto the folder name in that case.
    df.to_csv(os.path.join(directory, 'CleanResults_1.1.csv'), index=False, header=False)
    df.to_pickle(os.path.join(directory, 'CleanResults_1.1.pickle'))
        
        
        
    
    
        



HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=885.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9118.0), HTML(value='')))

error
error



HBox(children=(FloatProgress(value=0.0, max=16959.0), HTML(value='')))

error
error
error



HBox(children=(FloatProgress(value=0.0, max=2353.0), HTML(value='')))





TODO:
Replace the general-purpose `en_core_web_lg` model loaded into `nlp` with a biomedical model from:

[1] scispacy

@inproceedings{Neumann2019ScispaCyFA,
  title={ScispaCy: Fast and Robust Models for Biomedical Natural Language Processing},
  author={Mark Neumann and Daniel King and Iz Beltagy and Waleed Ammar},
  year={2019},
  Eprint={arXiv:1902.07669}
}