In [1]:
# This cell is only to make it run for Luca, ignore.
# The two directories below are specific to Luca's machine. list.remove() raises
# ValueError when the entry is missing, so each removal is guarded: on any other
# setup this cell is a no-op instead of aborting "Run All" at the first cell.
import sys

for _luca_only_path in (
    '/home/ldorigo/scripts/biopython',
    '/home/ldorigo/scripts/spaCy',
):
    if _luca_only_path in sys.path:
        sys.path.remove(_luca_only_path)

In [2]:
# imports
import itertools
import argparse
from typing import Generator
from pathlib import Path
from tqdm.std import tqdm
import lib.pipe_pubmed as pipe_pubmed
import lib.pipe_preprocessing as pipe_preprocessing
import lib.pipe_spacy as pipe_spacy
import lib.grammar_analysis as grammar_analysis
import spacy

## Choose the query to submit to PubMed


In [3]:
# Search expression handed to pipe_pubmed below; the inner double quotes are
# part of the query text itself.
query = 'spondylarthritis AND "vitamin D"'

## Set up the pipeline generators

In [18]:
# Submit `query` through pipe_pubmed. The call returns a pair that is unpacked
# into `total_ids` (passed to tqdm as `total=` in inner_gen) and `ids_generator`
# (the iterable that inner_gen loops over).
total_ids, ids_generator = pipe_pubmed.get_pmids_generator(query=query)

def inner_gen() -> Generator[str, None, None]:
    """Yield the ids from ``ids_generator`` while displaying a tqdm progress bar.

    ``ids_generator`` produces 2-tuples. Only the second element (the id) is
    passed on; the first element is not used here. The bar length is taken
    from ``total_ids``.
    """
    progress_bar = tqdm(ids_generator, total=total_ids)
    for _progress, pmid in progress_bar:
        yield pmid

# Each stage below receives the previous stage's generator as its input, so the
# assignments have to stay in this order.

# PubMed stages: inner_gen() (ids + progress bar) -> raw abstracts -> abstracts.
raw_abstracts_generator = pipe_pubmed.get_raw_abstracts_generator(inner_gen())
ag = pipe_pubmed.get_abstracts_generator(raw_abstracts_generator)


# Preprocessing stage, fed with the abstracts generator `ag`.
preprocessed_generator = pipe_preprocessing.get_preprocessed_abstracts_generator(ag)

# Load the spaCy pipeline package "en_core_sci_md" with its "ner" component excluded.
nlp = spacy.load("en_core_sci_md", exclude=["ner"])
# spaCy stages: preprocessed abstracts + nlp -> raw docs -> extended docs
# -> relevant sentence numbers.
raw_docs_generator = pipe_spacy.get_raw_doc_generator(preprocessed_generator, nlp)
docs_generator = pipe_spacy.get_extended_doc_generator(raw_docs_generator)
relevant_sentences_generator = pipe_spacy.get_relevant_sentence_numbers_generator(
    docs_generator
)
# Final stage: this is the generator consumed by the display loop further down.
# NOTE(review): presumably nothing is fetched or parsed until this generator is
# iterated, and it can only be consumed once -- confirm in the lib modules.
doc_relations_generator = grammar_analysis.get_relations_generator(
    relevant_sentences_generator
)


The last generator yields tuples of `(doc, Dict[int, Relations])`: the spaCy `Doc` for one abstract, together with a dictionary that maps the index of each relevant sentence in that `Doc` to the relations found in that sentence. The next cell shows an example of how to inspect them:



In [19]:

# Upper bound on how many abstracts are pulled from the pipeline for this demo.
MAX_ABSTRACTS_SHOWN = 100

for doc, relations_dict in itertools.islice(doc_relations_generator, MAX_ABSTRACTS_SHOWN):
    # Keep only the sentences for which at least one relation was extracted, so
    # the "Found relations" header is never printed above an empty listing.
    found = {i: relations for i, relations in relations_dict.items() if relations}
    if not found:
        continue

    print(f"Abstract: {doc._.title}\n\n")
    # print(doc)  # uncomment to also show the full abstract text
    print("\nFound relations:\n")

    # Build the sentence list once per abstract rather than once per printed
    # sentence; the keys of the dict are indices into this list.
    sentences = list(doc.sents)
    for i, relations in found.items():
        print(f"For sentence \"{sentences[i]}\":")
        for relation in relations:
            # pretty_print_relation is used here for display only; for real
            # work, access the relation objects directly.
            print(grammar_analysis.pretty_print_relation(relation))

 different from those in Western patients. This is the first study showing the differences in dietary habits between psoriatic patients with arthritis and those without. Further studies should elucidate the relationships of these results with skin and joint lesions in psoriatic patients.

Found relations:


For sentence "Some patients are associated with arthritis.":
[[patients] (~)] <-> [[arthritis] (~)]
For sentence "The logistic regression analysis showed that psoriasis was associated with high body mass index and low intake of meat.":
[[psoriasis] (~)] <-> [[body, mass, index] (↑)]
Abstract: Polymorphisms of vitamin D receptor gene in Turkish familial psoriasis patients.


Psoriasis is characterized by hyperproliferation and abnormal differentiation of keratinocytes, and inflammation. 1,25-Dihydroxyvitamin D3, which is used for the treatment of psoriasis, binds to vitamin D receptor (VDR) and modulates gene transcription. We analyzed VDR gene FokI, ApaI and TaqI polymorphisms in 51