In [1]:
import pandas as pd
import numpy as np
import os
import regex as re
import ast
from collections import defaultdict
from snorkel.labeling import labeling_function

## Data cleaning

In [2]:
# Remove the column-width limit so long text cells would be displayed untruncated ...
pd.set_option('display.max_colwidth', None)
# ... and immediately restore pandas' default width.
# NOTE(review): the reset undoes the line above, so the net effect of this cell is
# "default column width" — confirm which of the two settings is actually intended.
pd.reset_option('display.max_colwidth')

In [3]:
# Load the split-6 BRENDA sentence annotations and the BRENDA reactions-with-DOIs table;
# both frames are indexed by their "pubmedId" column.
# NOTE(review): the output recorded under this cell looks like the tail of a pandas
# warning (presumably a mixed-dtype warning from read_csv) — TODO confirm and, if so,
# pass explicit dtypes.
brenda_split6_df = pd.read_csv("../training_data_csvs/output/full_merge_sentence_annots_2_14/sentence_annotations_elsevier_pmid_split6_brenda_data.csv").set_index("pubmedId")
brenda_dois_df = pd.read_csv("../training_data/brenda_reactions_with_dois.csv").set_index("pubmedId")

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# List the columns available in the split-6 BRENDA sentence annotations.
brenda_split6_df.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'literatureProducts',
       'literatureSubstrates', 'organismSubstrates', 'commentarySubstrates',
       'reversibility', 'organismProducts', 'products', 'commentaryProducts',
       'substrates', 'ecNumber', 'doi', 'lit_id', 'indices', 'start', 'end',
       'sentence', 'sentence_pos', 'enzymes', 'enzyme_locations',
       'chemical_entities_full', 'chemical_names', 'chemical_smiles',
       'name_smile_tuples'],
      dtype='object')

In [5]:
#TAKES A WHILE
# Paths to the ChemProt entity annotations and the word-level labeled ChemProt table.
chemprot_entities_path = "../scibert/mar12_NER/chemprot_corpus/chemprot_training_entities.tsv"
chemprot_labeled_clean_path = "../scibert/mar12_NER/labeled_chemprot_data_all_cols_clean.csv"

# Entity annotations: the file has no header row, so column names are supplied here.
# Indexed by (doc, entity_name) so it can be joined against the word-level table below.
chemprot_entities_df = pd.read_csv(chemprot_entities_path, sep='\t', names = ["doc", "token_id", "entity_type", "start", "end", "entity_name"])
chemprot_entities_df = chemprot_entities_df.set_index(["doc", "entity_name"])

# Word-level table (tab-separated despite the .csv extension).
chemprot_word_df = pd.read_csv(chemprot_labeled_clean_path, sep='\t')
# Shift the word offsets by len(title) + 1.
# Presumably this converts abstract-relative offsets into title+abstract offsets, the +1
# accounting for a separator character — TODO confirm against the ChemProt offsets.
# Vectorised equivalent of the former per-row Python loops.
chemprot_word_df["start"] = chemprot_word_df["title"].str.len() + chemprot_word_df["start"] + 1
chemprot_word_df["end"] = chemprot_word_df["title"].str.len() + chemprot_word_df["end"] + 1
# Word-level table keyed by (doc, word); the previous positional index is kept as a column.
chemprot_sentence_rxn = chemprot_word_df.reset_index().set_index(["doc","word"])
# chemprot_entities_df = chemprot_entities_df.set_index(["doc"])
# chemprot_sentence_rxn = chemprot_sentence_rxn.reset_index().set_index(["doc"])

In [6]:
# Preview the word-level ChemProt frame: rows up to index label 100, selected columns only.
chemprot_word_df.loc[:100, ["doc", "sentence", "level_1", "spans", "word", "abstract_clean"]]

Unnamed: 0,doc,sentence,level_1,spans,word,abstract_clean
0,7832763,The enzyme cyclo-oxygenase catalyses the oxyge...,920,"(0, 3)",The,The enzyme cyclo-oxygenase catalyses the oxyge...
1,7832763,The enzyme cyclo-oxygenase catalyses the oxyge...,920,"(4, 10)",enzyme,The enzyme cyclo-oxygenase catalyses the oxyge...
2,7832763,The enzyme cyclo-oxygenase catalyses the oxyge...,920,"(11, 26)",cyclo-oxygenase,The enzyme cyclo-oxygenase catalyses the oxyge...
3,7832763,The enzyme cyclo-oxygenase catalyses the oxyge...,920,"(27, 36)",catalyses,The enzyme cyclo-oxygenase catalyses the oxyge...
4,7832763,The enzyme cyclo-oxygenase catalyses the oxyge...,920,"(37, 40)",the,The enzyme cyclo-oxygenase catalyses the oxyge...
...,...,...,...,...,...,...
96,7832763,hCOX-1 had a specific activity of 18.8 mumol o...,920,"(574, 578)",13.8,The enzyme cyclo-oxygenase catalyses the oxyge...
97,7832763,hCOX-1 had a specific activity of 18.8 mumol o...,920,"(579, 585)",microM,The enzyme cyclo-oxygenase catalyses the oxyge...
98,7832763,hCOX-1 had a specific activity of 18.8 mumol o...,920,"(586, 589)",for,The enzyme cyclo-oxygenase catalyses the oxyge...
99,7832763,hCOX-1 had a specific activity of 18.8 mumol o...,920,"(590, 602)",arachidonate,The enzyme cyclo-oxygenase catalyses the oxyge...


In [7]:
# Row counts of the word-level frame and of the entity-annotation frame.
len(chemprot_word_df), len(chemprot_entities_df)

(37802, 25752)

In [8]:
# Sanity check before merging: show a handful of (doc, word/entity_name) keys that occur
# in both the word-level index and the entity-annotation index.
# The intersection is a set, so which keys are shown is not ordered.
s1 = chemprot_sentence_rxn.index
s2 = chemprot_entities_df.index
pd.Series(list(set(s2).intersection(set(s1)))).loc[:4]

0        (19429089, serotonin)
1       (23221006, phenformin)
2       (23221006, Phenformin)
3    (23318471, 17β-estradiol)
4               (9928259, NET)
dtype: object

In [9]:
# Left-join the word-level frame (chemprot_sentence_rxn) with the entity annotations
# (chemprot_entities_df): the left frame's (doc, word) index is matched against the
# right frame's "doc" / "entity_name" keys, so words without an entity annotation
# are kept with null entity columns. The merged frame is then saved to CSV.
merged_chemprot_annot = pd.merge(chemprot_sentence_rxn, chemprot_entities_df, how = "left", left_index = True, right_on = ["doc","entity_name"])
merged_chemprot_annot.to_csv("chemprot_annotations_merged.csv")

### Clean duplicate words from the merge, repeated nonsense sentences, and null values

In [10]:
# Keep only the annotation columns needed downstream and replace missing values with the
# literal string "null", so every cell can later be joined into comma-separated strings.
merged_chemprot_annot_strings = merged_chemprot_annot.loc[:,["token_id","entity_type", "label", "sentence_clean"]].fillna("null")

In [11]:
# Clean duplicate word-level entities that appear due to the weird merge: a row is kept
# only when its entity_name differs from the entity_name of the row immediately before it.
merged_chemprot_entities = merged_chemprot_annot_strings.reset_index()["entity_name"]
# Despite its name, this mask is True for rows to KEEP (the first row, or a row whose
# entity differs from its predecessor). Vectorised form of the former Python loop.
bool_duplicate_entity = merged_chemprot_entities.ne(merged_chemprot_entities.shift())
# .copy() makes the filtered frame independent, so the column assignment below does not
# write into a slice of a temporary (avoids SettingWithCopyWarning).
merged_chemprot_cleaned = merged_chemprot_annot_strings.reset_index()[bool_duplicate_entity].copy()
# doc IDs become strings so they can be ", ".join-ed when grouping by sentence.
merged_chemprot_cleaned["doc"] = merged_chemprot_cleaned["doc"].apply(str)

In [12]:
def clean_redundant_sentence_noise(entry):
    """Tell whether a ", "-joined string of docIDs names exactly one distinct doc.

    Used as a row filter on the grouped-by-sentence frame: a sentence whose docIDs
    are not all identical is a non-unique sentence in Chemprot (e.g. "2." or
    "(2004) J. Biol.") and should be dropped.

    Returns True when all docIDs in `entry` are the same, False otherwise.
    """
    distinct_doc_ids = {doc_id for doc_id in entry.split(", ")}
    return len(distinct_doc_ids) == 1

In [13]:
# def remove_null(series_entry):
#     list_series_entry = series_entry.split(", ")
#     cleaned_ls = [entry for entry in list_series_entry if entry != "null"]
#     if len(cleaned_ls) > 0:
#         return ", ".join(cleaned_ls)
#     return np.nan

def extract_chemical_entities_indices(series_entry, query):
    """Find the positions of the items in a ", "-joined string that contain `query`.

    series_entry: string of items separated by ", ".
    query: substring to look for inside each item.

    Returns the list of 0-based positions whose item contains `query`,
    or np.nan when no item matches.
    """
    matching_positions = [
        position
        for position, item in enumerate(series_entry.split(", "))
        if query in item
    ]
    return matching_positions if matching_positions else np.nan
def get_entities_series_from_index_series(index_series, sentence_series):
    """Look up, row by row, the items of a ", "-joined string at the given positions.

    index_series: Series whose entries are either a list of positions or something
        else (e.g. NaN); only entries that are exactly of type `list` are used.
    sentence_series: Series of ", "-joined strings, positionally aligned with
        `index_series`.

    Returns a new Series (default integer index) holding, per row, the selected items
    re-joined with ", ", or np.nan when the row selects nothing.
    """
    split_items = sentence_series.apply(lambda joined: joined.split(", "))
    joined_entities = []
    for row_pos, positions in enumerate(index_series):
        if type(positions) is list:
            picked = [str(split_items.iloc[row_pos][pos]) for pos in positions]
        else:
            picked = []
        joined_entities.append(', '.join(picked) if picked else np.nan)
    return pd.Series(joined_entities)
def extract_labeled_entities_dict(series_entry):
    """Map the position of every non-"O" label in a ", "-joined label string to that label.

    Returns a dict {position: label} for the labels that are not "O",
    or np.nan when every label is "O".
    """
    labeled_positions = {
        position: label
        for position, label in enumerate(series_entry.split(", "))
        if label != "O"
    }
    return labeled_positions if labeled_positions else np.nan
def extract_docID(doc_string_comma_sep):
    """Return the first docID of a ", "-joined docID string, converted to float.

    The float conversion is what makes the ID comparable with the float-valued
    index of the brenda data.
    """
    first_doc_id, *_ = doc_string_comma_sep.split(", ")
    return float(first_doc_id)

In [14]:
# Collapse the word-level rows into one row per sentence: every column is aggregated
# by joining its values with ", ".
# NOTE(review): the join only works on string values; how non-string columns (e.g. the
# integer "index" column added by reset_index) are treated depends on the pandas
# version — TODO confirm on the version in use.
# clean_redundant_sentence_noise removes the following sentences: ("2.", "3.", "5.", "Abstract 1.")
chemprot_annot_sentence = merged_chemprot_cleaned.reset_index().groupby("sentence_clean").agg(lambda x : ", ".join(x))
chemprot_annot_sentence = chemprot_annot_sentence[chemprot_annot_sentence["doc"].apply(clean_redundant_sentence_noise)]

#reset_index to make adding series easy
# (the helper below returns Series with a default integer index, so the frame needs one too)
chemprot_annot_sentence = chemprot_annot_sentence.reset_index()

#extract chemicals and enzymes from entity_type
# *_ind columns hold word positions within the sentence; the name columns hold the
# corresponding words taken from the joined "entity_name" string.
chemprot_annot_sentence["chemical_ind"] = chemprot_annot_sentence["entity_type"].apply(lambda x: extract_chemical_entities_indices(x, "CHEMICAL"))
chemprot_annot_sentence["gene_ind"] = chemprot_annot_sentence["entity_type"].apply(lambda x: extract_chemical_entities_indices(x, "GENE"))
chemprot_annot_sentence["chemicals"] = get_entities_series_from_index_series(chemprot_annot_sentence["chemical_ind"], chemprot_annot_sentence["entity_name"])
chemprot_annot_sentence["genes"] = get_entities_series_from_index_series(chemprot_annot_sentence["gene_ind"], chemprot_annot_sentence["entity_name"])

# extract substrates, product-of, enzyme
# Same position lookup, but driven by the "label" column (substring match on the tag).
chemprot_annot_sentence["substrate_ind"] = chemprot_annot_sentence["label"].apply(lambda x: extract_chemical_entities_indices(x, "SUBSTRATE"))
chemprot_annot_sentence["product_ind"] = chemprot_annot_sentence["label"].apply(lambda x: extract_chemical_entities_indices(x, "PRODUCT"))
chemprot_annot_sentence["enzyme_ind"] = chemprot_annot_sentence["label"].apply(lambda x: extract_chemical_entities_indices(x, "enzyme"))

chemprot_annot_sentence["substrates"] = get_entities_series_from_index_series(chemprot_annot_sentence["substrate_ind"], chemprot_annot_sentence["entity_name"])
chemprot_annot_sentence["products"] = get_entities_series_from_index_series(chemprot_annot_sentence["product_ind"], chemprot_annot_sentence["entity_name"])
chemprot_annot_sentence["enzymes"] = get_entities_series_from_index_series(chemprot_annot_sentence["enzyme_ind"], chemprot_annot_sentence["entity_name"])

#extract labels: dict between word index and label
chemprot_annot_sentence["label_dict"] = chemprot_annot_sentence["label"].apply(extract_labeled_entities_dict)

#reindex on doc (unique paper-level ID)
# The joined docID string is reduced to its first ID (as a float) before becoming the index.
chemprot_annot_sentence['doc'] = chemprot_annot_sentence['doc'].apply(extract_docID)
chemprot_annot_sentence = chemprot_annot_sentence.reset_index().set_index("doc")

#rename and drop columns
# Helper position columns and the raw joined entity string are no longer needed.
chemprot_annot_sentence = chemprot_annot_sentence.rename(columns = {"sentence_clean": "sentence", "entity_name": "sentence entities", "label_dict": "label index dict"})
chemprot_annot_sentence = chemprot_annot_sentence.drop(["index", "token_id", "sentence entities", "chemical_ind", "gene_ind", "substrate_ind", "product_ind", "enzyme_ind"], axis = 1)

In [16]:
# helper function for abstract-sentence featurization
def sort_list_match_ref(abstract_ref, extracted_s):
    """Order each row's extracted sentences by where they occur in that row's abstract.

    abstract_ref: Series of abstract texts.
    extracted_s: Series, positionally aligned with `abstract_ref`, whose entries are
        lists of sentences extracted from the corresponding abstract.

    Returns a Series indexed like `abstract_ref`. Each entry is the row's sentence list
    sorted by the position of the sentence's first occurrence in the abstract.
    When a row cannot be sorted (a sentence is not found in the abstract, or the
    abstract / sentence list is not text / a list), a message is printed and the
    row's sentences are kept in their original order, so the result always has
    exactly one entry per input row.
    """
    rv = []
    for i in range(len(abstract_ref)):
        text = abstract_ref.iloc[i]
        extracted_ls = extracted_s.iloc[i]
        try:
            # str.index raises ValueError when the sentence is absent from the abstract.
            rv.append(sorted(extracted_ls, key = lambda x: text.index(x)))
        except (ValueError, AttributeError, TypeError):
            print("Extracted sentence in doc " + str(abstract_ref.index[i]) + " does not match the abstract.")
            # Keep a placeholder for this row; skipping it would make rv shorter than
            # the index and break the Series construction below.
            rv.append(extracted_ls)
    # Use the parameter's own index (not a module-level variable) so the function
    # works for any caller.
    return pd.Series(rv, index = abstract_ref.index)
def get_sentence_index(sentence_s, sentence_list_s):
    """Locate each sentence inside its row's sentence list.

    sentence_s: Series of sentences.
    sentence_list_s: Series, positionally aligned with `sentence_s`, of sentence lists.

    Returns a Series indexed like `sentence_s` holding, per row, the position of the
    sentence in the corresponding list. Raises ValueError (from list.index) when a
    sentence is missing from its list.
    """
    positions = [
        sentence_list_s.iloc[row].index(sentence_s.iloc[row])
        for row in range(len(sentence_s))
    ]
    return pd.Series(positions, index = sentence_s.index)
#sort_list_match_ref(ref, abstract_expand)
#get_sentence_index(sentence_s,abstract_ordered)

In [17]:
# One row per doc with its abstract text: duplicate (doc, abstract_clean) pairs are dropped
# first; if a doc still has several distinct abstract strings they are concatenated by sum().
chemprot_doc_abstract = pd.DataFrame(chemprot_word_df.loc[:,["doc", "abstract_clean"]].drop_duplicates().groupby("doc")["abstract_clean"].sum()).reset_index()
# Attach the abstract to every annotated sentence of the same doc, then index/sort by doc.
chemprot_abstract_annot = pd.merge(chemprot_annot_sentence, chemprot_doc_abstract, how = "left", left_index = True, right_on = ["doc"]).set_index("doc").sort_index()
# Per doc, the list of its annotated sentences (in frame order, not yet in abstract order).
expanded_abstract_df = pd.DataFrame(chemprot_abstract_annot.groupby("doc")['sentence'].apply(list)).rename(columns = {"sentence": "abstract_expand"})

# Give every sentence row the full sentence list of its doc.
chemprot_final_annot = pd.merge(chemprot_abstract_annot, expanded_abstract_df, how = "left", left_index = True, right_on = ["doc"])
# define parameters for featurization helper functions
ref = chemprot_final_annot.loc[:,"abstract_clean"]
abstract_expand = chemprot_final_annot.loc[:, "abstract_expand"]
# Sentence lists re-ordered by where each sentence occurs in the abstract text.
abstract_ordered = pd.Series(sort_list_match_ref(ref, abstract_expand))
sentence_s = chemprot_final_annot.loc[:, "sentence"]
# add final features to define sentence positions within abstracts
chemprot_final_annot["abstract_ordered"] = abstract_ordered
# sentence_index = position of the row's sentence within its doc's ordered sentence list.
chemprot_final_annot["sentence_index"] = get_sentence_index(sentence_s, abstract_ordered)
chemprot_final_annot = chemprot_final_annot.reset_index().set_index(["doc", "sentence_index"]).sort_index()
# Preview the first four rows.
chemprot_final_annot.iloc[:4,:]

Unnamed: 0_level_0,Unnamed: 1_level_0,sentence,entity_type,label,chemicals,genes,substrates,products,enzymes,label index dict,abstract_clean,abstract_expand,abstract_ordered
doc,sentence_index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
7832763,0,The enzyme cyclo-oxygenase catalyses the oxyge...,"null, null, GENE-N, null, null, null, null, nu...","O, O, B-enzyme, O, O, O, O, B-SUBSTRATE, I-SUB...",prostaglandins,cyclo-oxygenase,"arachidonic, acid",prostaglandins,cyclo-oxygenase,"{2: 'B-enzyme', 7: 'B-SUBSTRATE', 8: 'I-SUBSTR...",The enzyme cyclo-oxygenase catalyses the oxyge...,[The enzyme cyclo-oxygenase catalyses the oxyg...,[The enzyme cyclo-oxygenase catalyses the oxyg...
7832763,1,Recently two forms of cyclo-oxygenase have bee...,"null, null, null, null, GENE-N, null, null, nu...","O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O...",,"cyclo-oxygenase, COX-1, COX-2, cytokines",,,,,The enzyme cyclo-oxygenase catalyses the oxyge...,[The enzyme cyclo-oxygenase catalyses the oxyg...,[The enzyme cyclo-oxygenase catalyses the oxyg...
7832763,2,Constitutive and inducible forms of human cycl...,"null, null, null, null, null, null, GENE-N, nu...","O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O...",,"cyclo-oxygenase, hCOX-1, hCOX-2",,,,,The enzyme cyclo-oxygenase catalyses the oxyge...,[The enzyme cyclo-oxygenase catalyses the oxyg...,[The enzyme cyclo-oxygenase catalyses the oxyg...
7832763,3,hCOX-1 had a specific activity of 18.8 mumol o...,"GENE-Y, null, null, null, null, null, null, nu...","B-enzyme, O, O, O, O, O, O, O, O, O, O, O, O, ...",arachidonate,hCOX-1,arachidonate,,hCOX-1,"{0: 'B-enzyme', 17: 'B-SUBSTRATE'}",The enzyme cyclo-oxygenase catalyses the oxyge...,[The enzyme cyclo-oxygenase catalyses the oxyg...,[The enzyme cyclo-oxygenase catalyses the oxyg...


In [23]:
# Save the sentence-level ChemProt data, annotated to resemble the BRENDA data,
# to a CSV file in the current working directory.
chemprot_final_annot.to_csv("chemprot_sentence_level_cleaned.csv")

## brenda data exploration

In [18]:
# Which paper IDs occur in both the ChemProt sentence annotations (index: doc) and the
# split-6 BRENDA annotations (index: pubmedId)? Shows at most the first five shared IDs.
# Note: s1 / s2 are re-used names from the earlier ChemProt key-overlap cell.
s1 = chemprot_annot_sentence.index
s2 = brenda_split6_df.index
pd.Series(list(set(s2).intersection(set(s1)))).loc[:4]

0    12067524.0
dtype: float64

In [19]:
# Preview the first five rows of the BRENDA columns used in the rest of this section.
brenda_split6_df.loc[:, ["sentence", "indices", "substrates", "products", "chemical_names", "enzymes"]].iloc[:5]

Unnamed: 0_level_0,sentence,indices,substrates,products,chemical_names,enzymes
pubmedId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
18585373.0,serial JL 272543 291210 291734 291852 291854 3...,0,more,?,,
18585373.0,All rights reserved.,1,more,?,,
18585373.0,RHOGTPASECONTROLSINVAGINATIONCOHESIVEMIGRATION...,2,more,?,,"GTPase, GTPase"
18585373.0,Rho1 K heterozygous salivary glands invaginate...,3,more,?,,
18585373.0,Rho1 K homozygous glands begin to invaginate (...,4,more,?,,


In [20]:
# Keep the columns of interest, replace missing values with the string "null", and move
# pubmedId from the index back into a column.
brenda_important_features = brenda_split6_df.loc[:, ["sentence", "indices", "substrates", "products", "chemical_names", "enzymes"]].fillna("null").reset_index()
# Cast the ID and sentence-index columns to strings so they can be ", ".join-ed later.
brenda_important_features["pubmedId"] = brenda_important_features["pubmedId"].apply(str)
brenda_important_features["indices"] = brenda_important_features["indices"].apply(str)


In [21]:
# Index by (pubmedId, sentence) in preparation for grouping duplicate sentences;
# the grouping itself happens in the next cell.
brenda_index_sentence_per_article = brenda_important_features.set_index(["pubmedId", "sentence"])

In [22]:
#TAKES A WHILE
# Merge rows that share the same (pubmedId, sentence) by joining each column's values
# with ", ".
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# brenda_sentences_clean may never have been created in the saved session.
brenda_sentences_clean = brenda_index_sentence_per_article.groupby(["pubmedId", "sentence"]).agg(lambda x : ", ".join(x))

KeyboardInterrupt: 

In [None]:
# Display the de-duplicated BRENDA sentences.
# NOTE(review): this name only exists if the aggregation cell above ran to completion.
brenda_sentences_clean

In [None]:
# List the BRENDA columns again for reference while writing the helpers below.
brenda_split6_df.columns

In [None]:
#utilities
def get_df_one_pmid(df, pmid):
    """Select from `df` everything stored under the index label `pmid`.

    Returns whatever `df.loc[pmid]` yields: a DataFrame when several rows share the
    label, a Series when exactly one row does. Raises KeyError for an unknown label.
    """
    rows_for_pmid = df.loc[pmid]
    return rows_for_pmid
def get_df_random_pmid(df):
    """Select everything stored under the index label of one randomly sampled row.

    One row is drawn with `df.sample(1)`; the return value is `df.loc[label]` for that
    row's index label (all rows sharing the label, or a Series if the label is unique).
    """
    sampled_row = df.sample(1)
    return df.loc[sampled_row.index[0]]
def get_sentence_chemicals_df(df):
    """Return only the sentence / chemistry columns of `df`, in a fixed order.

    Raises KeyError when any of the six columns is missing from `df`.
    """
    wanted_columns = [
        "sentence",
        "indices",
        "substrates",
        "products",
        "chemical_names",
        "enzymes",
    ]
    return df[wanted_columns]
def common_value_counts(s):
    """Summarise a Series as {value: count} for its ten most frequent values.

    The returned dict additionally carries the key "null" with the number of
    missing entries in `s`.
    """
    counts = s.value_counts()
    summary = counts[:10].to_dict()
    summary["null"] = int(s.isnull().sum())
    return summary

In [None]:
# Most frequent "products" values (plus the count of missing ones) for one example paper.
common_value_counts(get_df_one_pmid(brenda_split6_df, 24853758.0)["products"])

In [None]:
# helper functions - return true or false if condition is met
def contains_chemical(df, iloc_index):
    """Tell whether the row at position `iloc_index` has a value in "chemical_names".

    df: DataFrame with a "chemical_names" column.
    iloc_index: positional row index into `df`.

    Returns True when the cell holds a string or any other non-missing value, and
    False when it is missing (NaN or None). The printed message always matches the
    returned value.
    """
    chem_name = df.iloc[iloc_index]["chemical_names"]
    # pd.notna handles NaN and None alike; np.isnan would raise TypeError on None
    # and other non-float values.
    has_chemical = isinstance(chem_name, str) or bool(pd.notna(chem_name))
    print("is a chemical" if has_chemical else "is not a chemical")
    return has_chemical
def nearby_enzyme(df, iloc_index, search_window):
    """Tell whether any sentence near the given one, in the same article, mentions an enzyme.

    df: DataFrame indexed by article ID, with an "indices" column (sentence number
        within the article) and an "enzymes" column (missing when no enzyme).
    iloc_index: positional row index of the sentence of interest.
    search_window: how many sentence numbers before and after to inspect (inclusive).

    Returns True when at least one row of the same article whose sentence number lies in
    [n - search_window, n + search_window] has a non-missing "enzymes" value.
    Sentence numbers that do not exist in the article are simply ignored.
    """
    #search for enzymes in nearby sentences
    pmid = df.index[iloc_index]
    sentence_index = int(df.iloc[iloc_index]["indices"])
    # List selector: always a DataFrame, even when the article has a single row.
    article_df = df.loc[[pmid]]
    # Select by value range instead of looking up explicit labels, so sentence numbers
    # missing from the article cannot raise KeyError.
    in_window = article_df["indices"].between(
        sentence_index - search_window, sentence_index + search_window
    ).to_numpy()
    nearby_enzymes = article_df.loc[in_window, "enzymes"]
    nearby_enzymes_list = list(nearby_enzymes)

    print(nearby_enzymes_list)

    enzyme_nearby_bool = bool(nearby_enzymes.notna().any())
    return enzyme_nearby_bool
def nearby_enzyme_v2(df, iloc_index, search_window):
    """Positional variant: is an enzyme mentioned in neighbouring rows of the same article?

    df: DataFrame indexed by article ID, with an "enzymes" column (missing when no enzyme).
    iloc_index: positional row index of the sentence of interest.
    search_window: number of rows before and after `iloc_index` to inspect (inclusive).

    Returns True when at least one row within the positional window that carries the
    same index label (article) as the row of interest has a non-missing "enzymes" value.
    The window is clipped to the bounds of `df`.
    """
    #search for enzymes in nearby sentences
    pmid = df.index[iloc_index]
    window_start = max(iloc_index - search_window, 0)
    window_stop = min(iloc_index + search_window + 1, len(df))
    window_df = df.iloc[window_start:window_stop]
    # Rows of neighbouring articles can fall inside the positional window; they are
    # not "nearby sentences" of this article and must not count.
    same_article = window_df.index == pmid
    nearby_enzymes = window_df.loc[same_article, "enzymes"]
    nearby_enzymes_list = list(nearby_enzymes)

    print(nearby_enzymes_list)

    enzyme_nearby_bool = bool(nearby_enzymes.notna().any())
    return enzyme_nearby_bool

In [None]:
# Inspect ten consecutive rows around position 123554 (the row used in the next cell).
brenda_split6_df.iloc[123550:123560, :]
#brenda_split6_df.iloc[1:4, :]

In [None]:
# Try the positional enzyme search on row 123554 with a window of 3 rows on each side.
nearby_enzyme_v2(brenda_split6_df, 123554, 3)

In [None]:
# Define the Label Mapping
# Each labeling function returns one of the three integer labels below.

# Return TRUE if the sentence contains positive structures,
# e.g. "A reacts with B." ('react' / 'reacts')
TRUE = 1

# Return FALSE if the sentence contains negative structures,
# e.g. "A cannot react with B." ('cannot')
# Must differ from TRUE, otherwise negative and positive votes are indistinguishable.
FALSE = 0

# Return ABSTAIN whenever the sentence does not meet the labeling function's condition.
# Example: a function find_verb(s) that looks for verbs such as react or oxidize;
# find_verb('A and B are good candidates.') returns ABSTAIN because nothing matches.
ABSTAIN = -1

In [None]:
# Spot-check one row: its sentence, its chemical names, and whether an enzyme is
# mentioned within `sw` sentences of it in the same article.
tempind = 12425
sw = 2
(
    brenda_split6_df.iloc[tempind]["sentence"],
    brenda_split6_df.iloc[tempind]["chemical_names"],
    nearby_enzyme(brenda_split6_df, tempind, sw),
)

In [None]:
# labeling functions
@labeling_function()
def has_chemical_and_nearby_enzyme(sentence):
    """Vote TRUE when a fixed row has a chemical and an enzyme is mentioned near it.

    Returns TRUE when contains_chemical and nearby_enzyme both hold for the inspected
    row, ABSTAIN otherwise. nearby_enzyme is only evaluated when a chemical is present.

    NOTE(review): `sentence` is not used. The function always inspects position 12423
    of brenda_split6_df with a search window of 100, so it returns the same vote for
    every input — confirm how the data point should be mapped to a row.
    """
    frame = brenda_split6_df
    row_position = 12423
    window = 100
    if not contains_chemical(frame, row_position):
        return ABSTAIN
    if not nearby_enzyme(frame, row_position, window):
        return ABSTAIN
    print("it's a hit!")
    return TRUE

In [None]:
# Try the labeling function on a single sentence.
df = brenda_split6_df
# iloc_index was previously undefined at notebook scope (it only exists as a local
# inside has_chemical_and_nearby_enzyme); use the same row position that function inspects.
iloc_index = 12423
sentence = df.iloc[iloc_index]["sentence"]
has_chemical_and_nearby_enzyme(sentence)

In [None]:
# Reproducible random sample (fixed random_state) of ten BRENDA rows, key columns only.
brenda_split6_df.sample(10, random_state=3)[["sentence", "sentence_pos", "substrates", "products", "chemical_names", "enzymes"]]