In [1]:
import sys

In [None]:
# This cell is only to make it run for Luca, ignore.
# Each removal is guarded: list.remove() raises ValueError when the entry is
# missing, which would abort a "Restart & Run All" on any other machine.
for _stale_path in (
    '/home/ldorigo/scripts/biopython',
    '/home/ldorigo/scripts/spaCy',
):
    if _stale_path in sys.path:
        sys.path.remove(_stale_path)

In [2]:
# Put the project checkout on the import path (presumably this is where the
# `lib` package imported by this notebook lives -- confirm).
# Guarded so that re-running the cell does not add duplicate entries.
# NOTE(review): hardcoded absolute path; adjust it if the checkout lives elsewhere.
if '/workspace/medrel' not in sys.path:
    sys.path.append('/workspace/medrel')

In [3]:
# imports
import itertools
import argparse
from typing import Generator
from pathlib import Path
from tqdm.std import tqdm
import lib.pipe_pubmed as pipe_pubmed
import lib.pipe_preprocessing as pipe_preprocessing
import lib.pipe_spacy as pipe_spacy
import lib.grammar_analysis as grammar_analysis
import spacy

In [4]:
# The autoreload extension automatically reloads modules that have changed, so you don't need to restart Jupyter after editing code.

In [5]:
%load_ext autoreload

In [6]:
%autoreload 2

## Choose the query to submit to PubMed


In [7]:
# Search expression sent to PubMed; the inner double quotes keep "vitamin D"
# together as a single quoted term in the query string.
query = 'spondylarthritis AND "vitamin D"'

## Set up the pipeline generators

In [None]:
# Pubmed IDs corresponding to the query.
# The call returns a pair: `total_ids` is used as the progress-bar length in
# inner_gen, and `ids_generator` is the iterable that inner_gen consumes
# (its items are unpacked there as 2-tuples).
total_ids, ids_generator = pipe_pubmed.get_pmids_generator(query=query)

# Progress-tracking wrapper around the ID stream:
def inner_gen() -> Generator[str, None, None]:
    """Yield the IDs from ``ids_generator`` while showing a tqdm progress bar.

    Every item of ``ids_generator`` is unpacked as a 2-tuple; only its second
    element is yielded and the first one is discarded. ``total_ids`` sets the
    length of the bar.
    """
    tracked_ids = tqdm(ids_generator, total=total_ids)
    for _progress, pmid in tracked_ids:
        yield pmid
# NOTE(review): every stage below is named as a generator and the notebook
# later wraps the final one in list(), so presumably nothing is downloaded or
# parsed until that point -- confirm against the lib.* implementations.

# Get raw abstracts from pubmed (fed through inner_gen() so the progress bar
# advances as IDs are consumed)
raw_abstracts_generator = pipe_pubmed.get_raw_abstracts_generator(inner_gen())

# Convert them to text + metadata
ag = pipe_pubmed.get_abstracts_generator(raw_abstracts_generator)

# Preprocess (normalize unicode and whitespace and remove brackets)
preprocessed_generator = pipe_preprocessing.get_preprocessed_abstracts_generator(ag)

# Load language model; the "ner" pipeline component is excluded at load time
nlp = spacy.load("en_core_sci_md", exclude=["ner"])

# Parse abstracts into spacy docs
raw_docs_generator = pipe_spacy.get_raw_doc_generator(preprocessed_generator, nlp)

# Add metadata to the doc objects
docs_generator = pipe_spacy.get_extended_doc_generator(raw_docs_generator)

# First pass through the sentence to only consider sentences that may contain a relation
relevant_sentences_generator = pipe_spacy.get_relevant_sentence_numbers_generator(
    docs_generator
)

Materialize the generator into a list so you can experiment with the parsing without re-downloading everything each time:

In [None]:
# Drain the whole pipeline once and keep the results in memory.
# NOTE(review): if relevant_sentences_generator is a one-shot generator (as its
# name suggests), re-running this cell alone yields an empty list -- re-run
# the setup cell first.
docs_and_sents = list(relevant_sentences_generator)

And finally, extract relations from those sentences:

In [None]:
# Run relation extraction over the cached (doc, sentence numbers) results.
doc_relations_generator = grammar_analysis.get_relations_generator(docs_and_sents)

The last iterator yields tuples of `(doc, Dict[int, Relations])` — i.e., the Doc for one abstract, plus a dictionary that maps the number of each sentence containing relations to the relations found in that sentence. Example of how to inspect them:



In [None]:
# Materialize the (doc, relations_dict) tuples so they can be iterated more
# than once (presumably doc_relations_generator is a one-shot generator -- confirm).
docrels = list(doc_relations_generator)

In [None]:

# Print, for every abstract, the sentences in which relations were found
# together with those relations.
for doc, relations_dict in docrels:
    # Keep only the sentences that actually produced relations, so the header
    # is never printed for an abstract that has nothing to show under it.
    found = {i: rels for i, rels in relations_dict.items() if rels}
    if not found:
        continue

    print(f"\nAbstract: {doc._.title}")
    # print(doc)  # uncomment to dump the full abstract
    print("\nFound relations:\n")

    # doc.sents is an iterator: materialize it once per abstract instead of
    # rebuilding the list for every sentence that gets printed.
    sentences = list(doc.sents)
    for i, relations in found.items():
        print(f'For sentence "{sentences[i]}":')
        for relation in relations:
            # I'm using a function to pretty-print the relation, but you should just access the objects directly :-)
            print(grammar_analysis.pretty_print_relation(relation))