# For posterity, these are the attempts at matching concepts in raw text and extracting relations by co-occurrence counts.


In [1]:
import logging
import pprint
import re
import pandas
import math
import numpy as np
import spacy
import pickle

from spacy import displacy, symbols
from IPython.display import display

from os.path import join
from tqdm.auto import tqdm
from QuickUMLS.quickumls import QuickUMLS

tqdm.pandas()
# from utils import quickumls_location_widget, extract_UMLS_types, logger, save_df, load_if_not_present, datalocation_widget, sessionlocation_widget, data_dir
# from prepare_subset import get_pubmed_articles, open_local_article_set, querywidget, downloaded_articles_path, convert_articles_to_dataframe
# from abstract_manipulations import parse_dates, unicode2ascii, load_unicode_mappings, get_full_text, replace_abstract_abbreviations, get_full_text,get_relevant_text
import constants
from abstracts_spacy import set_extensions,ner_on_abstract,ner_on_text

from utils import (
    load_if_not_present,
    extract_UMLS_types,
    QUICKUMLS_LOCATION_WIDGET,
    SESSIONLOCATION_WIDGET,
    get_abstract_relevant_text
)

raw_abstracts = None
timestamped_abstracts = None
ascii_abstracts = None
unabbreviated_abstracts = None
abstracts = None
co_occurrences = None

Created logger with name: utils


## We start by testing QuickUMLS

We first need to choose the semantic types we're interested in. 
Files containing descriptions of the UMLS semantic types can be found at https://semanticnetwork.nlm.nih.gov/download/sn_current.tgz.


We use regular expressions to extract the UIs and descriptions for each category, and store the results in constants.py.

This gives a list of all existing semantic types, which we then manually choose.
For a first experiment, we simply treat all sensible categories as equally relevant. This will very likely lead to a lot of irrelevant/wrong relations, but it's just an experiment.

In [5]:
display(SESSIONLOCATION_WIDGET)

Text(value='small_subset', description='Session location:')

In [7]:
unabbreviated_abstracts = load_if_not_present(
    unabbreviated_abstracts, "unabbreviated_abstracts.csv")
lines, concepts = extract_UMLS_types("/Users/ldorigo/Downloads/2018AA/SU")

display(QUICKUMLS_LOCATION_WIDGET)

Loaded unabbreviated_abstracts.csv from csv.


Text(value='/Users/ldorigo/Documents/QuickUMLS', description='QuickUMLS path:')

In [8]:
matcher = QuickUMLS(QUICKUMLS_LOCATION_WIDGET.value,
                    accepted_semtypes=constants.ACCEPTED_CATEGORIES, overlapping_criteria='score')
nlp = spacy.load("en_core_web_lg", disable=["ner"])

In [9]:
set_extensions()
matcher.set_threshold(0.7)

### Let's run the custom NER function on a couple of abstracts to see how things look like. 



In [10]:
unabbreviated_abstracts = load_if_not_present(
    unabbreviated_abstracts, "unabbreviated_abstracts.csv")
unabbreviated_abstracts["date"] = 0
unabbreviated_abstracts["pmid"] = 0


Reusing previously defined df.


In [11]:
test_row = unabbreviated_abstracts.iloc[16]
test_sent= "Congenital etiology, pre-operative norepinephrine requirement, higher serum C-reactive protein, and central venous pressure were associated with increased in-hospital mortality in children with ventricular assist device support."
# test_doc = ner_on_abstract(test_row,matcher, nlp)
test_doc = ner_on_text(test_sent,matcher,nlp)
displacy.render(test_doc, style="ent")
# for ent in test_doc.ents:
#     print("Entity: {}, \n\t Category: {},\n\t CUI: {}".format(
#         ent, ent.label_, ent._.CUI))

Processing text: Congenital etiology, pre-operative norepinephrine requirement, higher serum C-reactive protein, and central venous pressure were associated with increased in-hospital mortality in children with ventricular assist device support.


We see that there are a large amount of false positives, which would lead to a largely useless network of correlations.
We first try to increase the matcher's accuracy to see if that improves things.

In [12]:
print("Set matcher threshold: {}\n".format(matcher.threshold))

test_highthresh = ner_on_abstract(test_row,matcher, nlp)

ents1 = set(map(lambda x: str(x), test_doc.ents))
ents2 = set(map(lambda x: str(x), test_highthresh.ents))

print("Difference (entities that are not included with the higher accuracy):")

ents1.difference(ents2)

Processing abstract: Body mass index-independent metabolic alterations in narcolepsy with cataplexy.  Metabolic parameters were evaluated by measuring body mass index (body mass index), waist circumference (also with abdominal computed tomography), blood pressure, and daily calorie intake (3-day diary). Chronotypes were assessed through the morningness-eveningness questionnaire. Lumbar puncture for cerebrospinal fluid orexin-A determination and HLA typing were performed. Patients with narcolepsy with cataplexy (all HLA DQB1*0602 positive and with cerebrospinal fluid orexin-A levels < 110 pg/mL) had a higher body mass index and body mass index-independent metabolic alterations, namely waist circumference, high-density lipoprotein cholesterol, and glucose/insulin ratio (an insulin resistance index), with respect to patients with idiopathic hypersomnia without long sleep time (cerebrospinal fluid orexin-A levels > 300 pg/mL). Despite lower daily food intake, patients with narcolepsy with 

Set matcher threshold: 0.7

Difference (entities that are not included with the higher accuracy):


{'C-reactive protein',
 'Congenital',
 'assist',
 'associated',
 'central venous pressure',
 'children',
 'device',
 'increased',
 'norepinephrine',
 'pre',
 'serum',
 'support',
 'ventricular'}

There is still a huge amount of false positives.
Let's see how the scispacy matchers fare.

In [13]:
nlp_sci_sm = spacy.load("en_core_sci_sm")
nlp_sci_md = spacy.load("en_core_sci_md")
nlp_sci_craft = spacy.load("en_ner_craft_md")
nlp_sci_jnlpba = spacy.load("en_ner_jnlpba_md")
nlp_sci_bc5cdr = spacy.load("en_ner_bc5cdr_md")
nlp_sci_bionlp = spacy.load("en_ner_bionlp13cg_md")

OSError: [E050] Can't find model 'en_core_sci_sm'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

In [14]:
parsers = {
    "nlp_sci_sm": nlp_sci_sm, 
    "nlp_sci_md": nlp_sci_md,
    "nlp_sci_craft": nlp_sci_craft,
    "nlp_sci_jnlpba": nlp_sci_jnlpba,
    "nlp_sci_bc5cdr": nlp_sci_bc5cdr,
    "nlp_sci_bionlp": nlp_sci_bionlp
}

example_abstract = unabbreviated_abstracts.iloc[2]
example_text = get_relevant_text(example_abstract)

for parser in parsers:
    print("\nResult of ner with {}: \n".format(parser))
    
    doc = parsers[parser](example_text)
    
    displacy.render(doc, style='ent',jupyter=True)

NameError: name 'nlp_sci_sm' is not defined

It seems like the baseline model is great at identifying generic entities that could be further identified by quickumls. The bc5cdr model is good for identifying diseases and chemicals, which are interesting for our purpose.



### This is ok for a first shot. Let's apply the entity recognition to our subset of articles.

In [15]:

abstracts = [ner_on_abstract(a,matcher, nlp) for _, a in tqdm(
    unabbreviated_abstracts[0:1000].iterrows(), total=unabbreviated_abstracts[0:1000].shape[0])]

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Processing abstract: Sleep and daytime sleepiness in methylphenidate medicated and un-medicated children with attention-deficit/hyperactivity disorder (attention-deficit/hyperactivity disorder).  Significantly higher levels of daytime sleepiness were reported by the parents of the un-medicated children between the hours of 13:00 and 15:00, compared to the medicated children. The medicated children became increasingly sleepier from the first to the second measurement in both the morning and afternoon. There was no significant difference in the number of sleep disorders/disruptions reported by the parents of either group. In a group of children with attention-deficit/hyperactivity disorder taking methylphenidate, there was a significant increase in sleepiness a few hours after taking the medication, which may then have a significant impact on their learning. The data also imply that part of the mechanism of action of methylphenidate effects in these children may be by reduction of daytim




In [None]:
pickle.dump(abstracts, open("abstracts_with_ner.pickle", "wb+"))

This is very far from optimal, but we focus on extracting relations now.

## Relation Extraction

In [None]:

# cui_all_names = get_all_cui_dict()
unique_cuis = get_unique_cuis(abstracts)
cuis_dict = cuis_to_dict(unique_cuis)
cui_names = get_cui_name_dict(unique_cuis)

The amount of unique CUIs is relatively low for this dataset, so we can afford to make a large dataframe. This will probably become untractable for larger sets. Need to think of a better way.

In [None]:
from itertools import product


def count_co_occurrences_and_occurrences_in_sentences(abstract_list, cuis_dict):
    co_occurrences = np.zeros((len(cuis_dict), len(cuis_dict)), dtype=np.int16)
    occurrences = np.zeros(len(cuis_dict))
    count = 0
    ents = 0
    sents = 0
    for abstract in tqdm(abstract_list):
        for sentence in abstract.sents:
            sents += 1
            cuis = list(map(lambda x: cuis_dict[x._.CUI], sentence.ents))
            ents += len(sentence.ents)
            for cui in cuis:
                occurrences[cui] += 1
            indexes = product(cuis, cuis)
            for index in indexes:
                count += 1
                if index[0] != index[1]:
                    co_occurrences[index[0], index[1]] += 1
    print("Counted {} co-occurrences of {} entities in {} sentences.".format(count, ents, sents))
    return (co_occurrences, occurrences, sents)


def count_co_occurrences_and_occurrences_in_abstracts(abstract_list, cuis_dict):
    co_occurrences = np.zeros((len(cuis_dict), len(cuis_dict)), dtype=np.int16)
    occurrences = np.zeros(len(cuis_dict))
    count = 0
    ents = 0
    for abstract in tqdm(abstract_list):
        cuis = list(map(lambda x: cuis_dict[x._.CUI], abstract.ents))
        ents += len(abstract.ents)
        for cui in cuis:
            occurrences[cui] += 1
        indexes = set(product(cuis, cuis))
        for index in indexes:
            count += 1
            if index[0] != index[1]:
                co_occurrences[index[0], index[1]] += 1
    print("Counted {} co-occurrences of {} entities in {} abstracts.".format(count,
                                                                             ents, len(abstract_list)))
    return (co_occurrences, occurrences, len(abstract_list))


co_occurrences_sent, occurrences_sent, sents_num = count_co_occurrences_and_occurrences_in_sentences(
    abstracts, cuis_dict)
co_occurrences_abs, occurrences_abs, abs_num = count_co_occurrences_and_occurrences_in_abstracts(
    abstracts, cuis_dict)

In [None]:
names = list(map(lambda x: cui_names[x], unique_cuis))

In [None]:
# co_occ_df = pandas.DataFrame(co_occurrences, columns=names)
# co_occ_df.index = names
# co_occ_df['totals'] = co_occ_df.sum()
# co_occ_df.loc['totals'] = pandas.Series(co_occ_df.sum())
# co_occ_df

Let's get a look at what the most commonly co-occurring entities are:

In [None]:
max_co_occ = np.unravel_index(
    np.argmax(co_occurrences_abs, axis=None), co_occurrences_abs.shape)
max_co_occ

In [None]:
co_occ_copy = np.copy(co_occurrences_sent)
print("Most common co-occurring entities in sentences: \n")
for i in range(1, 21):
    (k, l) = np.unravel_index(np.argmax(co_occ_copy, axis=None), co_occ_copy.shape)
    print("Rank {}: {} ({}), {} ({})".format(
        i, unique_cuis[k], names[k], unique_cuis[l], names[l]))
    co_occ_copy[k, l] = 0
    co_occ_copy[l, k] = 0

In [None]:
co_occ_copy = np.copy(co_occurrences_abs)
print("Most common co-occurring entities in abstracts: \n")
for i in range(1, 21):
    (k, l) = np.unravel_index(np.argmax(co_occ_copy, axis=None), co_occ_copy.shape)
    print("Rank {}: {} ({}), {} ({})".format(
        i, unique_cuis[k], names[k], unique_cuis[l], names[l]))
    co_occ_copy[k, l] = 0
    co_occ_copy[l, k] = 0

https://www.aclweb.org/anthology/D11-1098

We compute the probability for each entity to occur in a given sentence.This is equal to (occurrence count)/(amount of sentences). We then compute the probability for every two-word pairs to occur together if they were intependent (that is the null hypothesis).

In [None]:
# This is the probability for each entity to appear on its own in a sentence:
entity_probabilities_sentence = occurrences_sent/sents_num

# This is the expected probability for each word pair to appear in a sentence
independent_co_occurrence_prob = np.outer(
    entity_probabilities_sentence, entity_probabilities_sentence)

# And the expected frequency count for each pair (if the distribution was random):
independent_co_occurence_freq = independent_co_occurrence_prob * sents_num

We now perform a Chi-square test to determine which co-occurrences are significant. (example in https://www.researchgate.net/publication/34442010_The_study_on_automatic_Chinese_collocation_extraction)

In [None]:
# We count only cases where the count is larger than 5 - otherwise chi-square is not appliable.
co_occurrences_sent_relevant = co_occurrences_sent
co_occurrences_sent_relevant[co_occurrences_sent_relevant<15] = 0
diff_count = co_occurrences_sent_relevant - independent_co_occurence_freq

# We are only interested in instances where the frequency is higher than the predicted frequency.
diff_count[diff_count < 0] = 0

square = diff_count*diff_count
statistic = square/independent_co_occurence_freq

In [None]:
co_occurrences_sent[co_occurrences_sent!= 0].size

In [None]:
def print_vals(x,y):
    print("Observed co-occurrence: {} \nPredicted co-occurrence: {} \nSquare of the difference: {}\nStatistic: {}".format(co_occurrences_sent_relevant[x,y],independent_co_occurence_freq[x,y], square[x,y],statistic[x,y]))
    
print_vals(55,795)

We compute the p-value for each of these. 

In [None]:
from scipy import stats

In [None]:
pvals = stats.chi2.sf(statistic,1)

In [None]:
pvals_copy = np.copy(pvals)
print("Most common co-occurring entities in sentences when normalizing for random occurrences: \n")
for i in range(1, 21):
    (k, l) = np.unravel_index(np.argmin(pvals_copy, axis=None), pvals_copy.shape)
    print("Rank {}: {} ({}), {} ({})".format(
        i, unique_cuis[k], names[k], unique_cuis[l], names[l]))
    print("P-value (%d,%d): %.32f" %(k,l,pvals_copy[k, l]))
    pvals_copy[k, l] = float("inf")
    pvals_copy[l, k] = float("inf")

Because of floating-point arithmetic all of these are equal to 0, so we don't really get a representative ranking. Let's rank by the test statistic instead.

In [None]:
csv = pandas.read_csv("./chisquare_adjacency.csv",sep=";")
vals = csv.values
names = vals[:,0]
vals = vals[:,1:]
print("names.shape: {}, vals.shape:{}".format(names.shape, vals.shape))

In [32]:
stat_copy = np.copy(vals)
print("Most common co-occurring entities in sentences when normalizing for random occurrences: \n")
for i in range(1, 21):
    (k, l) = np.unravel_index(np.argmax(stat_copy, axis=None), stat_copy.shape)
    print("Rank {}: {} <-> {} ".format(
        i, names[k], names[l]))
#     print("Chi-square value (%d,%d): %.8f (expexted %f occurrences, got %f)" %(k,l,stat_copy[k, l],independent_co_occurence_freq[k,l],co_occurrences_sent_relevant[k,l]))
    stat_copy[k, l] = 0
    stat_copy[l, k] = 0

Most common co-occurring entities in sentences when normalizing for random occurrences: 

Rank 1: Creatine [Chemical/Ingredient] <-> Phosphocreatine [Chemical/Ingredient] 
Rank 2: Inositol [Chemical/Ingredient] <-> Phosphocreatine [Chemical/Ingredient] 
Rank 3: Amg <-> Phosphocreatine [Chemical/Ingredient] 
Rank 4: Ultraviolet light therapy (regime/therapy) <-> Gifted (observable entity) 
Rank 5: cyclobenzaprine [Chemical/Ingredient] <-> releases 
Rank 6: Apnea [Disease/Finding] <-> Slow shallow breathing (finding) 
Rank 7: Acute brain syndrome (disorder) <-> Injection of therapeutic substance NOS (procedure) 
Rank 8: Vomiting, unspecified <-> olanzapine (Zyprexa) 
Rank 9: serotonins <-> reuptake 
Rank 10: rndx nausea <-> Vomiting, unspecified 
Rank 11: Attention functions, unspecified <-> attention deficit syndrome with hyperactivity 
Rank 12: gamma-Aminobutyric Acid [Chemical/Ingredient] <-> Receptor (substance) 
Rank 13: rndx nausea <-> olanzapine (Zyprexa) 
Rank 14: Injection of th

These are starting to look relatively interesting, however the very low quality of the data (NER) makes them mostly useless.

In [None]:
stat_df = pandas.DataFrame(statistic, columns=names)
stat_df.index = names
stat_df.to_csv("chisquare_adjacency.csv",sep=';')

In [None]:
loaded_csv = pandas.read_csv("chisquare_adjacency.csb")