## iGEM labeling functions

## example

In [1]:
import pandas as pd
import numpy as np
import os
import nltk
import regex as re
import ast
from collections import defaultdict
from snorkel.labeling import labeling_function

### test data written by hand

In [2]:
test_data = [
    [["carbon", "oxygen"], "carbon was oxidized by the oxygen"],
    [["carbon", "oxygen", "amino acid", "cassie"], "the cassie amino acid is connected to the carbon which was oxidixed by the oxygen inside the dna"],
    
    [["L-GalDH"], "The active fractions from the gel filtration column were further separated by native PAGE, followed by staining for L-GalDH activity."],
    [["L-GalDH"], "Southern blot analysis revealed that the spinach L-GalDH gene occurs in a single copy."],
    [["L-GalL dehydrogenase"], "We reported that transcripts of L-GalL dehydrogenase (L-GalLDH), which functions at the last step of the biosynthetic pathway, were expressed in leaves, stems and roots at almost the same level (Yabuta et al. 2000)."],
    [["L-GalDH", "dehydro-AsA", "L-GalL"], "The L-GalDH activity was not affected by the addition of 1 mM dehydro-AsA (DHA) or 1 mM L-GalL."]
]

### setting up abstain

In [3]:
ABSTAIN = -1

### labeling functions + small tests written

In [4]:
# LF_solution_words
# If any of the words is in a list of solution terms, we label FALSE
# https://www.hach.com/chemGlossary   
solution_terms = ["buffer", "diluent", "solute", "solvent", "saturated", "unsaturated", "saturating"]
@labeling_function()
def solution_words(x):
    for word in solution_terms:
        if (word in x[0]):
            return (False)
    return ABSTAIN

In [5]:
# LF_mixture_words
# If any of the words is in a list of mixture terms, we label FALSE
# https://www.hach.com/chemGlossary   
mixture_terms = ["suspended", "mixture", "heterogenous", "homogeneous"]
@labeling_function()
def mixture_words(x):
    for word in mixture_terms:
        if (word in x[0]):
            return (False)
    return ABSTAIN

In [6]:
# LF_physical_words
# If there are common physical terms, we label FALSE
physical_terms = ["purified", "detection", "composed", "weight", "characteristic", "metal",
    "characterization", "color", "metalic", "characterized", "character", "consists", "pure", "compose", "assay",
    "permeable", "signal", "bind", "property"]
@labeling_function()
def physical_words(x):
    for word in physical_terms:
        if (word in x[0]):
            return (False)
    return ABSTAIN

In [7]:
# LF_gene_words
# If one of the chemicals is in a list of genetic terms such as reductase, dna, gene, we label FALSE
genetic_terms = ["reductase", "dna", "gene", "allele", "locus", "genotype", "phenotype", "dominant", "recessive", "additive", "phenoset",
    "diallelic", "multiallelic", "polyallelic", "monomorphic", "monoallelic", "polymorphism", "mutation", "complex", "trait", "multifactorial",
    "polygenic", "monogenic", "mixed model", "transmission probability", "transition probability", "epistasis", "interaction", "pleiotropy",
    "quantitative trait locus", "probit", "logit", "penetrance", "transformation", "scale of measurement", "identity by descent", "identity in state",
    "haplotype", "phase", "multilocus", "genotype", "allelic", "association", "linkage", "disequilibrium", "gametic", "rna"]
@labeling_function()
def gene_words(x):
    for word in genetic_terms:
        if (word in x[0]):
            return False
    return ABSTAIN
    

In [8]:
# LF_structural_words
# If there are common structural terms, we label FALSE
structural_terms = ["loop", "sequence", "encodes", "code", "codon", "dna", "rna", "pair", "group", "active site",
    "bond", "chain", "gene", "structure", "structural", "encoding", "cdna", "cluster"]
@labeling_function()
def structural_words(x):
    for word in structural_terms:
        if (word in x[0]):
            return (False)
    return ABSTAIN

In [9]:
# LF_general_chemical
# If any of the words is in a list of common chemical terms such as amino acid, sugar, adenosine, amide, adenine, etc..., we label FALSE
# https://www.hach.com/chemGlossary   
chemical_terms = ["amino acid", "sugar", "adenosine", "amide", "adenine", "algaecide", "amines", "base", "biocides",
    "clarifier", "hydrocarbon", "hydrogenation", "molecule", "nutrients", "polymer", "peptide", "polypeptide",
    "tag", "functional", "residue", "activity", "enzyme", "electron", "cofactor"]
@labeling_function()
def general_chemical(x):
    for word in chemical_terms:
        if (word in x[0]):
            return (False)
    return ABSTAIN

In [10]:
# LF_group
# If there is a close mention of a functional chemical group, we label FALSE (as it is more likely descriptive of a structure than of a reaction)
# https://www.masterorganicchemistry.com/2010/10/06/functional-groups-organic-chemistry/
common_functional_groups = ["alkane", "alkene", "alkyne", "benzene ring", "phenyl", "amine", "alcohol", "ether", "alkyl halide", "thiol",
    "aldehyde", "ketone", "ester", "carboxylic acid", "amide", "nitrile", "epoxide", "disulfide", "imine", "acid chloride", "anhydride", "nitro",
    "sulfide", "thioether", "group", "functional"]
@labeling_function()
def group(x):
    for group in common_functional_groups:
        if (group in x[0]):
            return False
    return ABSTAIN


In [11]:
# LF_includes_amino_acids
# If there is an amino acid mentioned, we label FALSE
amino_acids = ['val', 'ile', 'leu', 'glu', 'gln', \
    'asp', 'asn' 'his' 'trp', 'phe', 'tyr',    \
    'arg', 'lys', 'ser', 'thr', 'met', 'ala',    \
    'gly', 'pro', 'cys', "amino"]
@labeling_function()
def includes_amino_acids(x):
    for group in amino_acids:
        if (group in x[0]):
            return False
    return ABSTAIN


In [12]:
# LF_paper_artifacts
# If there are common words from paper headers/footers, we label FALSE
common_terms = ["university", "univ", "pharma", "avenue", "street", "road", "department", "usa", "reference", "ref",
    "keyword", "article", "http", "png", "jpg"]
@labeling_function()
def paper_artifacts(x):
    for word in common_terms:
        if (word in x[0]):
            return (False)
    return ABSTAIN

In [13]:
# LF_no_terms
# If there are words stopping or with no change, we label FALSE
common_no_terms = ["inactivated", "unaffected", "inactive", "inactivates", "stops", "prevent", "inhibit",
    "denature", "block"]
@labeling_function()
def no_terms(x):
    for word in common_no_terms:
        if (word in x[0]):
            return (False)
    return ABSTAIN

In [14]:
# LF_adjacent_mentions
# If the chemicals are adjacent, we label FALSE
@labeling_function()
def adjacent_mentions(x):
    for index in range(len(x[1])):
        for index_2 in range(len(x[1])):
            if ((x[1][index] + " " + x[1][index_2]) in x[0]):
                return False
    return ABSTAIN

In [15]:
# LF_sep_or
# If the chemicals are separated by or, we label FALSE
@labeling_function()
def sep_or(x):
    for index in range(len(x[1])):
        for index_2 in range(len(x[1])):
            if (x[1][index] + " or " + x[1][index_2] in x[0]):
                return False
    return ABSTAIN

In [16]:
# LF_sep_and
# If the chemicals are separated by and, we label FALSE
@labeling_function()
def sep_and(x):
    for index in range(len(x[1])):
        for index_2 in range(len(x[1])):
            if (x[1][index] + " and " + x[1][index_2] in x[0]):
                return False
    return ABSTAIN

In [17]:
# LF_sep_comma
# If the chemicals are separated by a comma, we label FALSE
@labeling_function()
def sep_comma(x):
    for index in range(len(x[1])):
        for index_2 in range(len(x[1])):
            if (x[1][index] + ", " + x[1][index_2] in x[0]):
                return False
    return ABSTAIN

In [18]:
# LF_sep_via
# If the chemicals are separated by via, in, etc, we label FALSE
common_via_words = ["via", "in"]
@labeling_function()
def sep_via(x):
    for index in range(len(x[1])):
        for index_2 in range(len(x[1])):
            for term in common_via_words:
                if (x[1][index] + " " + term + " " + x[1][index_2] in x[0]):
                    return False
    return ABSTAIN

In [19]:
# LF_sep_sym
# If the chemicals are separated by a single character, we label FALSE  
@labeling_function()
def sep_sym(x):
    for index in range(len(x[1])):
        chem_1_index = x[0].find(x[1][index])
        chem_1_len = len(x[1][index])
        while (0 <= chem_1_index < len(x[0])):
            for index_2 in range(len(x[1])):
                chem_2_index = x[0].find(x[1][index_2])
                while (0 <= chem_2_index < len(x[0])):
                    if (sep_sym_helper(chem_1_index, chem_1_len, chem_2_index)):
                        return False
                    chem_2_index = x[0].find(x[1][index_2], chem_2_index + 1)
            chem_1_index = x[0].find(x[1][index], chem_1_index + 1)
    return ABSTAIN

def sep_sym_helper(index_1, length, index_2):
    if (index_1 + length + 1== index_2):
        return True
    return False

In [20]:
# testing separated by a single character (ensuring it works with repeated chemicals)
sep_sym(["cassie is here and her name cassie cas is cas cassie", ["cassie", "cas"]])

False

In [21]:
# LF_followed_ase
# If one of the chemicals is followed by a word that ends with -ase, we label FALSE
@labeling_function()
def followed_ase(x):
    sentence = x[0].replace('.', '')
    sentence = sentence.replace(',', '')
    words = sentence.split(" ")
    for chem in x[1]:
        index = -1
        while (index < len(words)):
            if not(chem in words[index + 1:]):
                index = len(words)
                break
            if (" " in chem):
                chem_split = chem.split(" ")
                index = words.index(chem_split[-1], index + 1)
            else:
                index = words.index(chem, index + 1)
            if (0 < index + 1 < len(words) and words[index + 1][-3:] == "ase"):
                return False
    return ABSTAIN

In [22]:
# test followed_ase (making sure it works if the chem is repeated)
followed_ase(["carbon fiber and carbon lactase with oxygen", ["carbon", "oxygen"]])

False

In [23]:
# LF_followed_by_noun
# If one of the chemicals is followed by a noun, we label FALSE
@labeling_function()
def followed_by_noun(x):
    sentence = x[0].replace(',', '')
    words = sentence.split(" ")
    tagged = nltk.pos_tag(words)
    for chem in x[1]:
        index = -1
        while (index < len(words)):
            if not(chem in words[index + 1:]):
                index = len(words)
                break
            if (" " in chem):
                chem_split = chem.split(" ")
                index = words.index(chem_split[-1], index + 1)
            else:
                index = words.index(chem, index + 1)
            if (index + 1 < len(words)):
                if (tagged[index + 1][1] == "NN"):
                    return False
    return ABSTAIN

In [24]:
# testing followed_by_noun (ensuring it works with repeats)
followed_by_noun(["cassie eating apple, eating for cassie table", ["cassie", "apple"]])

False

In [25]:
# test cases
print(followed_by_noun(["The active fractions from the gel filtration column were further separated by native PAGE, followed by staining for L-GalDH activity.", ["L-GalDH"]]))
print(followed_by_noun(["Southern blot analysis revealed that the spinach L-GalDH gene occurs in a single copy.", ["L-GalDH"]]))
print(followed_by_noun(["We reported that transcripts of L-GalL dehydrogenase (L-GalLDH), which functions at the last step of the biosynthetic pathway, were expressed in leaves, stems and roots at almost the same level (Yabuta et al. 2000).", ["L-GalL dehydrogenase"]]))
print(followed_by_noun(["The L-GalDH activity was not affected by the addition of 1 mM dehydro-AsA (DHA) or 1 mM L-GalL.", ["L-GalDH", "dehydro-AsA", "L-GalL"]]))

False
False
-1
False


In [26]:
# LF_sep_verb
# If the chemicals are separated by a verb, we label TRUE
@labeling_function()
def sep_verb(x):
    sentence = x[0].replace(',', '')
    words = sentence.split(" ")
    indexes = []
    for chem in x[1]:
        index = -1
        while (index < len(words)):
            if (" " in chem):
                chem_split = chem.split(" ")
                if not(chem_split[-1] in words[index + 1:]):
                    index = len(words)
                    break
                index = words.index(chem_split[-1], index + 1)
            else:
                if not(chem in words[index + 1:]):
                    index = len(words)
                    break
                index = words.index(chem, index + 1)
            indexes.append(index)
    if not indexes:
        return ABSTAIN
    min_index = min(indexes)
    max_index = max(indexes)
    tagged = nltk.pos_tag(words)
    for index in range(min_index + 1, max_index):
        if (tagged[index][1][:2] == "VB"):
            return True
    return ABSTAIN

In [27]:
# testing sep_verb (works when there are several instances)
sep_verb(["cassie cas cassie hi there oxidized cas", ["cassie", "cas"]])
sep_verb(['univ sydney, dept pharm, sydney, nsw 2006, australia abstract two distinct mg2+-atpase activities were isolated from triton x-100-solubilized human erythrocyte membranes using a combination of calmodulin-agarose to remove ca2+-atpase and ion exchange chromatography to separate the mg2+-atpase activities',
 ['ca2+', 'mg2+']])

-1

In [28]:
# test cases
print(sep_verb(["The active fractions from the gel filtration column were further separated by native PAGE, followed by staining for L-GalDH activity.", ["L-GalDH"]]))
print(sep_verb(["Southern blot analysis revealed that the spinach L-GalDH gene occurs in a single copy.", ["L-GalDH"]]))
print(sep_verb(["We reported that transcripts of L-GalL dehydrogenase (L-GalLDH), which functions at the last step of the biosynthetic pathway, were expressed in leaves, stems and roots at almost the same level (Yabuta et al. 2000).", ["L-GalL dehydrogenase"]]))
print(sep_verb(["The L-GalDH activity was not affected by the addition of 1 mM dehydro-AsA (DHA) or 1 mM L-GalL.", ["L-GalDH", "dehydro-AsA", "L-GalL"]]))

-1
-1
-1
True


In [29]:
# LF_sep_adverb
# If the chemicals are separated by a adverb, we label FALSE
@labeling_function()
def sep_adverb(x):
    sentence = x[0].replace(',', '')
    words = sentence.split(" ")
    indexes = []
    for chem in x[1]:
        index = -1
        while (index < len(words)):
            if (" " in chem):
                chem_split = chem.split(" ")
                if not(chem_split[-1] in words[index + 1:]):
                    index = len(words)
                    break
                index = words.index(chem_split[-1], index + 1)
            else:
                if not(chem in words[index + 1:]):
                    index = len(words)
                    break
                index = words.index(chem, index + 1)
            indexes.append(index)
    if not indexes:
        return ABSTAIN
    min_index = min(indexes)
    max_index = max(indexes)
    tagged = nltk.pos_tag(words)
    for index in range(min_index + 1, max_index):
        if (tagged[index][1][:2] == "RB"):
            return False
    return ABSTAIN

In [30]:
# testing sep_adverb (for repeated chemicals)
sep_adverb(["cassie happily danced to the moon with cas", ["cassie", "cas"]])

False

In [31]:
# test cases
print(sep_adverb(["The active fractions from the gel filtration column were further separated by native PAGE, followed by staining for L-GalDH activity.", ["L-GalDH"]]))
print(sep_adverb(["Southern blot analysis revealed that the spinach L-GalDH gene occurs in a single copy.", ["L-GalDH"]]))
print(sep_adverb(["We reported that transcripts of L-GalL dehydrogenase (L-GalLDH), which functions at the last step of the biosynthetic pathway, were expressed in leaves, stems and roots at almost the same level (Yabuta et al. 2000).", ["L-GalL dehydrogenase"]]))
print(sep_adverb(["The L-GalDH activity was not affected by the addition of 1 mM dehydro-AsA (DHA) or 1 mM L-GalL.", ["L-GalDH", "dehydro-AsA", "L-GalL"]]))

-1
-1
-1
False


In [32]:
# LF_includes_oxidation_words
# If the sentence contains oxidation words, we label True
oxidation_words = ["oxidize", "reduce", "oxidization", "redox" "oxidizing", "oxidizes", "oxidize", "reduction",
    "reduces", "oxidise", "oxidising", "oxide", "rust", "corrode"]
@labeling_function()
def includes_oxidation_words(x):
    for word in oxidation_words:
        if(word in x[0]):
            return True
    return ABSTAIN

In [33]:
# LF_includes_combustion_words
# If the sentence contains combustion words, we label True
combustion_words = ["combusts", "combustion", "combust", "burn" "explode", "gas-forming"]
@labeling_function()
def includes_combustion_words(x):
    for word in combustion_words:
        if(word in x[0]):
            return True
    return ABSTAIN

In [34]:
# LF_includes_neutralization_words
# If the sentence contains neutralization words, we label True
neutralization_words = ["neutralize", "neutralization", "titrate", "titration", "buffer"]
@labeling_function()
def includes_neutralization_words(x):
    for word in neutralization_words:
        if(word in x[0]):
            return True
    return ABSTAIN

In [35]:
# LF_includes_catalyze_words
# If the sentence contains catalyze words, we label True
catalyze_words = ["catalyze", "catalyst", "enzyme", "catalyse", "catalysing", "catalyzing"]
@labeling_function()
def includes_catalyze_words(x):
    for word in catalyze_words:
        if(word in x[0]):
            return True
    return ABSTAIN

In [36]:
# LF_includes_combination_words
# If the sentence contains combination words, we label True
combination_words = ["combine", "combines", "combination"]
@labeling_function()
def includes_combination_words(x):
    for word in combination_words:
        if(word in x[0]):
            return True
    return ABSTAIN

In [37]:
# LF_includes_decomposition_words
# If the sentence contains decomposition words, we label True
decomposition_words = ["decompose", "decomposition"]
@labeling_function()
def includes_decomposition_words(x):
    for word in decomposition_words:
        if(word in x[0]):
            return True
    return ABSTAIN

In [38]:
# LF_includes_replace_words
# If the sentence contains decomposition words, we label True
replace_words = ["replaces", "replace", "replacement"]
@labeling_function()
def includes_replace_words(x):
    for word in replace_words:
        if(word in x[0]):
            return True
    return ABSTAIN

In [39]:
# LF_includes_reaction_words
# If the sentence contains reactions words, we label True
reaction_words = ["convert", "yield", "produce", "make", "create", "synthesize", "conversion",
    "transformation", "transform", "synthesise", "ferment", "agent", "breakdown", "breaks down",
    "methylaion", "displacement", "combination", "exchange", "break down", "synthesis",
    "precipitate", "precipitation", "transfer", "through", "production", "activates",
    "transamination", "reversible", "form", "demination", "acetylation", "degradation"]
@labeling_function()
def includes_reaction_words(x):
    for word in reaction_words:
        if(word in x[0]):
            return True
    return ABSTAIN

In [40]:
# LF_includes_react
# If the sentence contains react, we label True
@labeling_function()
def includes_react(x):
    if("react" in x[0]):
        return True
    return ABSTAIN

In [41]:
# LF_includes_reaction_component_words
# If the sentence contains reaction components words, we label True
reaction_component_words = ["substrate", "product", "reactant", "pathway", "step"]
@labeling_function()
def includes_reaction_component_words(x):
    for word in reaction_component_words:
        if(word in x[0]):
            return True
    return ABSTAIN

In [42]:
# LF_comparison
# If the sentence contains comparison words, we label False
comparison_words = ["similar", "more", "greater", "less", "increase", "decrease", "compare", "comparing",
    "difference", "differ"]
@labeling_function()
def comparison(x):
    for word in comparison_words:
        if(word in x[0]):
            return False
    return ABSTAIN

In [43]:
# LF_includes_concentration
# If the sentence contains react, we label True
@labeling_function()
def includes_concentration(x):
    if("concentration" in x[0]):
        return True
    return ABSTAIN

In [44]:
# LF_measure
# If the sentence contains measure words, we label False
measure_words = ["high", "low", "ph", "stability", "corelated", "more", "less", "level", "degree", "time"]
@labeling_function()
def measure(x):
    for word in measure_words:
        if(word in x[0]):
            return False
    return ABSTAIN

In [45]:
# LF_experiment
# If the sentence contains experiement words, we label False
experiment_words = ["mice", "cell", "mouse", "ovary", "male", "female", "animal", "study", "method",
    "test", "treat", "protection", "brain", "nerve", "human"]
@labeling_function()
def experiment(x):
    for word in experiment_words:
        if(word in x[0]):
            return False
    return ABSTAIN

In [46]:
# LF_sep_conversion_words
# If the sentence contains to, from, into, etc., we label True
conversion_words = ["to", "from", "into", "becomes", "became", "by", "conversion"]
@labeling_function()
def sep_converstion_words(x):
    for word in conversion_words:
        if (word in x[0]):
            return True
    return ABSTAIN
    # for word in conversion_words:
    #     for chem in x[1]:
    #         if((word + " " + chem) in x[0]):
    #             return True
    #         elif((chem + " " + word) in x[0]):
    #             return True
    # return ABSTAIN
    # sentence = x[0].replace(',', '')
    # words = sentence.split(" ")
    # indexes = []
    # for chem in x[1]:
    #     index = -1
    #     while (index < len(words)):
    #         if (" " in chem):
    #             chem_split = chem.split(" ")
    #             if not(chem_split[-1] in words[index + 1:]):
    #                 index = len(words)
    #                 break
    #             index = words.index(chem_split[-1], index + 1)
    #         else:
    #             if not(chem in words[index + 1:]):
    #                 index = len(words)
    #                 break
    #             index = words.index(chem, index + 1)
    #         indexes.append(index)
    # if not indexes:
    #     return ABSTAIN
    # min_index = min(indexes)
    # max_index = max(indexes)
    # for index in range(min_index + 1, max_index):
    #     if (words[index] in conversion_words):
    #         return True
    # return ABSTAIN

In [47]:
# LF_num_chemicals
# If the sentence contains one identified chemical, we label False
@labeling_function()
def num_chemicals(x):
    if(len(x[1]) == 1):
            return False
    return ABSTAIN

In [48]:
# test sep_conversion_words
sep_converstion_words(["cassie truns into cas", ["cassie", "cas"]])

True

### my todo + notes

In [34]:
# things to fix
    #  test on jacob data

### importing data in and cleaning it

In [49]:
sentence_df = pd.read_csv("sentence_data_cleaned_jacob.csv")
sentence_df.head()

Unnamed: 0,doc,sentence_index,sentence,entity_type,label,chemicals,genes,substrates,products,enzymes,label index dict,abstract_clean,abstract_expand,abstract_ordered
0,7832763,0,The enzyme cyclo-oxygenase catalyses the oxyge...,"null, null, GENE-N, null, null, null, null, nu...","O, O, B-enzyme, O, O, O, O, B-SUBSTRATE, I-SUB...",prostaglandins,cyclo-oxygenase,"arachidonic, acid",prostaglandins,cyclo-oxygenase,"{2: 'B-enzyme', 7: 'B-SUBSTRATE', 8: 'I-SUBSTR...",The enzyme cyclo-oxygenase catalyses the oxyge...,['The enzyme cyclo-oxygenase catalyses the oxy...,['The enzyme cyclo-oxygenase catalyses the oxy...
1,7832763,1,Recently two forms of cyclo-oxygenase have bee...,"null, null, null, null, GENE-N, null, null, nu...","O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O...",,"cyclo-oxygenase, COX-1, COX-2, cytokines",,,,,The enzyme cyclo-oxygenase catalyses the oxyge...,['The enzyme cyclo-oxygenase catalyses the oxy...,['The enzyme cyclo-oxygenase catalyses the oxy...
2,7832763,2,Constitutive and inducible forms of human cycl...,"null, null, null, null, null, null, GENE-N, nu...","O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O...",,"cyclo-oxygenase, hCOX-1, hCOX-2",,,,,The enzyme cyclo-oxygenase catalyses the oxyge...,['The enzyme cyclo-oxygenase catalyses the oxy...,['The enzyme cyclo-oxygenase catalyses the oxy...
3,7832763,3,hCOX-1 had a specific activity of 18.8 mumol o...,"GENE-Y, null, null, null, null, null, null, nu...","B-enzyme, O, O, O, O, O, O, O, O, O, O, O, O, ...",arachidonate,hCOX-1,arachidonate,,hCOX-1,"{0: 'B-enzyme', 17: 'B-SUBSTRATE'}",The enzyme cyclo-oxygenase catalyses the oxyge...,['The enzyme cyclo-oxygenase catalyses the oxy...,['The enzyme cyclo-oxygenase catalyses the oxy...
4,7832763,4,"of 1500 nmol of O2/nmol of enzyme, whereas hCO...","null, null, null, null, null, null, null, null...","O, O, O, O, O, O, O, O, O, B-enzyme, O, O, O, ...",arachidonate,hCOX-2,arachidonate,,hCOX-2,"{9: 'B-enzyme', 26: 'B-SUBSTRATE'}",The enzyme cyclo-oxygenase catalyses the oxyge...,['The enzyme cyclo-oxygenase catalyses the oxy...,['The enzyme cyclo-oxygenase catalyses the oxy...


In [50]:
# getting the truth values
def chem_reactions(row):
    if(isinstance(row['substrates'], str) and isinstance(row['products'], str)):
        return 1
    elif(isinstance(row['substrates'], str) or isinstance(row['products'], str)):
        return 0
    else:
        return 0

In [51]:
several_chem_df = sentence_df.dropna(subset=['chemicals'])
several_chem_df.head()

Unnamed: 0,doc,sentence_index,sentence,entity_type,label,chemicals,genes,substrates,products,enzymes,label index dict,abstract_clean,abstract_expand,abstract_ordered
0,7832763,0,The enzyme cyclo-oxygenase catalyses the oxyge...,"null, null, GENE-N, null, null, null, null, nu...","O, O, B-enzyme, O, O, O, O, B-SUBSTRATE, I-SUB...",prostaglandins,cyclo-oxygenase,"arachidonic, acid",prostaglandins,cyclo-oxygenase,"{2: 'B-enzyme', 7: 'B-SUBSTRATE', 8: 'I-SUBSTR...",The enzyme cyclo-oxygenase catalyses the oxyge...,['The enzyme cyclo-oxygenase catalyses the oxy...,['The enzyme cyclo-oxygenase catalyses the oxy...
3,7832763,3,hCOX-1 had a specific activity of 18.8 mumol o...,"GENE-Y, null, null, null, null, null, null, nu...","B-enzyme, O, O, O, O, O, O, O, O, O, O, O, O, ...",arachidonate,hCOX-1,arachidonate,,hCOX-1,"{0: 'B-enzyme', 17: 'B-SUBSTRATE'}",The enzyme cyclo-oxygenase catalyses the oxyge...,['The enzyme cyclo-oxygenase catalyses the oxy...,['The enzyme cyclo-oxygenase catalyses the oxy...
4,7832763,4,"of 1500 nmol of O2/nmol of enzyme, whereas hCO...","null, null, null, null, null, null, null, null...","O, O, O, O, O, O, O, O, O, B-enzyme, O, O, O, ...",arachidonate,hCOX-2,arachidonate,,hCOX-2,"{9: 'B-enzyme', 26: 'B-SUBSTRATE'}",The enzyme cyclo-oxygenase catalyses the oxyge...,['The enzyme cyclo-oxygenase catalyses the oxy...,['The enzyme cyclo-oxygenase catalyses the oxy...
6,7832763,6,"Indomethacin inhibited both hCOX-1 and hCOX-2,...","CHEMICAL, null, null, GENE-Y, null, GENE-Y, nu...","O, O, O, O, O, O, O, O, O, O, O, O, O, O, O","Indomethacin, NS-398, Dup-697","hCOX-1, hCOX-2, hCOX-2",,,,,The enzyme cyclo-oxygenase catalyses the oxyge...,['The enzyme cyclo-oxygenase catalyses the oxy...,['The enzyme cyclo-oxygenase catalyses the oxy...
7,7832763,7,Both NS-398 and Dup-697 exhibited time-depende...,"null, CHEMICAL, null, CHEMICAL, null, null, nu...","O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O","NS-398, Dup-697, indomethacin",hCOX-2,,,,,The enzyme cyclo-oxygenase catalyses the oxyge...,['The enzyme cyclo-oxygenase catalyses the oxy...,['The enzyme cyclo-oxygenase catalyses the oxy...


In [52]:
several_chem_df["truth"] = several_chem_df.apply(chem_reactions, axis = 1)
# excluding unsure sentences
# several_chem_df = several_chem_df[several_chem_df['truth'] != -1]
several_chem_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  several_chem_df["truth"] = several_chem_df.apply(chem_reactions, axis = 1)


Unnamed: 0,doc,sentence_index,sentence,entity_type,label,chemicals,genes,substrates,products,enzymes,label index dict,abstract_clean,abstract_expand,abstract_ordered,truth
0,7832763,0,The enzyme cyclo-oxygenase catalyses the oxyge...,"null, null, GENE-N, null, null, null, null, nu...","O, O, B-enzyme, O, O, O, O, B-SUBSTRATE, I-SUB...",prostaglandins,cyclo-oxygenase,"arachidonic, acid",prostaglandins,cyclo-oxygenase,"{2: 'B-enzyme', 7: 'B-SUBSTRATE', 8: 'I-SUBSTR...",The enzyme cyclo-oxygenase catalyses the oxyge...,['The enzyme cyclo-oxygenase catalyses the oxy...,['The enzyme cyclo-oxygenase catalyses the oxy...,1
3,7832763,3,hCOX-1 had a specific activity of 18.8 mumol o...,"GENE-Y, null, null, null, null, null, null, nu...","B-enzyme, O, O, O, O, O, O, O, O, O, O, O, O, ...",arachidonate,hCOX-1,arachidonate,,hCOX-1,"{0: 'B-enzyme', 17: 'B-SUBSTRATE'}",The enzyme cyclo-oxygenase catalyses the oxyge...,['The enzyme cyclo-oxygenase catalyses the oxy...,['The enzyme cyclo-oxygenase catalyses the oxy...,0
4,7832763,4,"of 1500 nmol of O2/nmol of enzyme, whereas hCO...","null, null, null, null, null, null, null, null...","O, O, O, O, O, O, O, O, O, B-enzyme, O, O, O, ...",arachidonate,hCOX-2,arachidonate,,hCOX-2,"{9: 'B-enzyme', 26: 'B-SUBSTRATE'}",The enzyme cyclo-oxygenase catalyses the oxyge...,['The enzyme cyclo-oxygenase catalyses the oxy...,['The enzyme cyclo-oxygenase catalyses the oxy...,0
6,7832763,6,"Indomethacin inhibited both hCOX-1 and hCOX-2,...","CHEMICAL, null, null, GENE-Y, null, GENE-Y, nu...","O, O, O, O, O, O, O, O, O, O, O, O, O, O, O","Indomethacin, NS-398, Dup-697","hCOX-1, hCOX-2, hCOX-2",,,,,The enzyme cyclo-oxygenase catalyses the oxyge...,['The enzyme cyclo-oxygenase catalyses the oxy...,['The enzyme cyclo-oxygenase catalyses the oxy...,0
7,7832763,7,Both NS-398 and Dup-697 exhibited time-depende...,"null, CHEMICAL, null, CHEMICAL, null, null, nu...","O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O","NS-398, Dup-697, indomethacin",hCOX-2,,,,,The enzyme cyclo-oxygenase catalyses the oxyge...,['The enzyme cyclo-oxygenase catalyses the oxy...,['The enzyme cyclo-oxygenase catalyses the oxy...,0


In [53]:
sentence_chem_df = several_chem_df[["sentence", "chemicals", "truth", "substrates", "products"]]

In [54]:
greek_alphabet = {
    u'\u0393': 'Gamma',
    u'\u0394': 'Delta',
    u'\u0398': 'Theta',
    u'\u039B': 'Lamda',
    u'\u039E': 'Xi',
    u'\u03A0': 'Pi',
    u'\u03A3': 'Sigma',
    u'\u03A6': 'Phi',
    u'\u03A7': 'Chi',
    u'\u03A8': 'Psi',
    u'\u03A9': 'Omega',
    u'\u03B1': 'alpha',
    u'\u03B2': 'beta',
    u'\u03B3': 'gamma',
    u'\u03B4': 'delta',
    u'\u03B5': 'epsilon',
    u'\u03B6': 'zeta',
    u'\u03B7': 'eta',
    u'\u03B8': 'theta',
    u'\u03B9': 'iota',
    u'\u03BA': 'kappa',
    u'\u03BB': 'lamda',
    u'\u03BC': 'mu',
    u'\u03BD': 'nu',
    u'\u03BE': 'xi',
    u'\u03BF': 'omicron',
    u'\u03C0': 'pi',
    u'\u03C1': 'rho',
    u'\u03C3': 'sigma',
    u'\u03C4': 'tau',
    u'\u03C5': 'upsilon',
    u'\u03C6': 'phi',
    u'\u03C7': 'chi',
    u'\u03C8': 'psi',
    u'\u03C9': 'omega'
}

In [55]:
'β' in greek_alphabet.keys()

True

In [56]:
def remove_greek(words):
    for letter in greek_alphabet.keys():
        words = words.replace(letter, greek_alphabet[letter])
    return words


In [57]:
# greek test
remove_greek("11β-hydroxysteroid")

'11beta-hydroxysteroid'

In [58]:
def chem_into_array(chemicals):
    chemicals = remove_greek(chemicals)
    chemicals = chemicals.lower()
    sample_chems = chemicals.replace("%20", " ")
    sample_chems_list = sample_chems.split(", ")
    for index in range(len(sample_chems_list)):
        while(sample_chems_list[index][-1] == ","):
            sample_chems_list[index] = sample_chems_list[index][:-1]
    sample_chems_list = list(set(sample_chems_list))
    return sample_chems_list

In [59]:
# removing excess commas from the end
chem_into_array("cassie,, cassie, cas")

['cas', 'cassie']

In [60]:
def sentence_cleaned(sentence):
    sentence = remove_greek(sentence)
    sentence = sentence.lower()
    sentence = sentence.replace(".", "")
    sentence = sentence.replace(")", "")
    sentence = sentence.replace("(", "")
    sentence = sentence.replace(";", "")
    sentence = sentence.replace(":", "")
    return sentence

In [61]:
sentence_chem_df["sentence"] = sentence_chem_df["sentence"].apply(sentence_cleaned)
sentence_chem_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentence_chem_df["sentence"] = sentence_chem_df["sentence"].apply(sentence_cleaned)


Unnamed: 0,sentence,chemicals,truth,substrates,products
0,the enzyme cyclo-oxygenase catalyses the oxyge...,prostaglandins,1,"arachidonic, acid",prostaglandins
3,hcox-1 had a specific activity of 188 mumol of...,arachidonate,0,arachidonate,
4,"of 1500 nmol of o2/nmol of enzyme, whereas hco...",arachidonate,0,arachidonate,
6,"indomethacin inhibited both hcox-1 and hcox-2,...","Indomethacin, NS-398, Dup-697",0,,
7,both ns-398 and dup-697 exhibited time-depende...,"NS-398, Dup-697, indomethacin",0,,
...,...,...,...,...,...
1406,cholesterol esterase ce induced surface erosio...,"Cholesterol, PEC, PEC",0,"poly, carbonate, ), PEC",
1408,"during the whole period of degradation, mw of ...",PEC,0,,
1409,water uptake of the polymer was only 28 and 02...,"PEC, PEC",0,,
1411,"by this mechanism, ce-responsive drug in vitro...",PEC,0,,


In [62]:
sentence_chem_df["chemicals"] = sentence_chem_df["chemicals"].apply(chem_into_array)
# sentence_chem_df = sentence_chem_df.loc[sentence_chem_df["chemicals"].str.len() > 1]
sentence_chem_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentence_chem_df["chemicals"] = sentence_chem_df["chemicals"].apply(chem_into_array)


Unnamed: 0,sentence,chemicals,truth,substrates,products
0,the enzyme cyclo-oxygenase catalyses the oxyge...,[prostaglandins],1,"arachidonic, acid",prostaglandins
3,hcox-1 had a specific activity of 188 mumol of...,[arachidonate],0,arachidonate,
4,"of 1500 nmol of o2/nmol of enzyme, whereas hco...",[arachidonate],0,arachidonate,
6,"indomethacin inhibited both hcox-1 and hcox-2,...","[indomethacin, ns-398, dup-697]",0,,
7,both ns-398 and dup-697 exhibited time-depende...,"[indomethacin, ns-398, dup-697]",0,,
...,...,...,...,...,...
1406,cholesterol esterase ce induced surface erosio...,"[cholesterol, pec]",0,"poly, carbonate, ), PEC",
1408,"during the whole period of degradation, mw of ...",[pec],0,,
1409,water uptake of the polymer was only 28 and 02...,[pec],0,,
1411,"by this mechanism, ce-responsive drug in vitro...",[pec],0,,


In [63]:
sentence_chem_df["text"] = sentence_chem_df[["sentence", "chemicals"]].values.tolist()
sentence_chem_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentence_chem_df["text"] = sentence_chem_df[["sentence", "chemicals"]].values.tolist()


Unnamed: 0,sentence,chemicals,truth,substrates,products,text
0,the enzyme cyclo-oxygenase catalyses the oxyge...,[prostaglandins],1,"arachidonic, acid",prostaglandins,[the enzyme cyclo-oxygenase catalyses the oxyg...
3,hcox-1 had a specific activity of 188 mumol of...,[arachidonate],0,arachidonate,,[hcox-1 had a specific activity of 188 mumol o...
4,"of 1500 nmol of o2/nmol of enzyme, whereas hco...",[arachidonate],0,arachidonate,,"[of 1500 nmol of o2/nmol of enzyme, whereas hc..."
6,"indomethacin inhibited both hcox-1 and hcox-2,...","[indomethacin, ns-398, dup-697]",0,,,[indomethacin inhibited both hcox-1 and hcox-2...
7,both ns-398 and dup-697 exhibited time-depende...,"[indomethacin, ns-398, dup-697]",0,,,[both ns-398 and dup-697 exhibited time-depend...
...,...,...,...,...,...,...
1406,cholesterol esterase ce induced surface erosio...,"[cholesterol, pec]",0,"poly, carbonate, ), PEC",,[cholesterol esterase ce induced surface erosi...
1408,"during the whole period of degradation, mw of ...",[pec],0,,,"[during the whole period of degradation, mw of..."
1409,water uptake of the polymer was only 28 and 02...,[pec],0,,,[water uptake of the polymer was only 28 and 0...
1411,"by this mechanism, ce-responsive drug in vitro...",[pec],0,,,"[by this mechanism, ce-responsive drug in vitr..."


### running test on the imported data

In [64]:
for index in range(len(sentence_chem_df["sentence"])):
    arguments = sentence_chem_df["text"].iloc[index]
    print(arguments)
    print("general chemicals: " + str(general_chemical(arguments)))
    print("adjacent mentions: " + str(adjacent_mentions(arguments)))
    print("sep or: " + str(sep_or(arguments)))
    print("sep and: " + str(sep_and(arguments)))
    print("sep comma: " + str(sep_comma(arguments)))
    print("sep via: " + str(sep_via(arguments)))
    print("sep sym: " + str(sep_sym(arguments)))
    print("followed ase: " + str(followed_ase(arguments)))
    print("group: " + str(group(arguments)))
    print("followed by noun: " + str(followed_by_noun(arguments)))
    print("sep verb: " + str(sep_verb(arguments)))
    print("sep adverb: " + str(sep_adverb(arguments)))
    print("includes reaction words: " + str(includes_reaction_words(arguments)))
    print("sep conversion words: " + str(sep_converstion_words(arguments)))
    print()

['the enzyme cyclo-oxygenase catalyses the oxygenation of arachidonic acid, leading to the formation of prostaglandins', ['prostaglandins']]
general chemicals: False
adjacent mentions: -1
sep or: -1
sep and: -1
sep comma: -1
sep via: -1
sep sym: -1
followed ase: -1
group: -1
followed by noun: -1
sep verb: -1
sep adverb: -1
includes reaction words: True
sep conversion words: True

['hcox-1 had a specific activity of 188 mumol of o2/mg with a km of 138 microm for arachidonate and vmax', ['arachidonate']]
general chemicals: False
adjacent mentions: -1
sep or: -1
sep and: -1
sep comma: -1
sep via: -1
sep sym: -1
followed ase: -1
group: -1
followed by noun: -1
sep verb: -1
sep adverb: -1
includes reaction words: -1
sep conversion words: -1

['of 1500 nmol of o2/nmol of enzyme, whereas hcox-2 had a specific activity of 122 mumol of o2/mg with a km of 87 microm for arachidonate and a vmax', ['arachidonate']]
general chemicals: False
adjacent mentions: -1
sep or: -1
sep and: -1
sep comma: -1
s

### new csv uncleaned data

In [322]:
uncleaned_df = pd.read_csv("../sentence_annotations_elsevier_pmid_split6.csv")
uncleaned_df

Unnamed: 0,lit_id,indices,start,end,sentence,sentence_pos,enzymes,enzyme_locations,chemical_entities_full,chemical_names,chemical_smiles,name_smile_tuples
0,10.1002/jps.20686,0,0,1297,serial JL 313843 291210 291727 291789 291928 3...,"[('serial', 'JJ'), ('JL', 'NN'), ('313843', 'C...",,[],[],,,[]
1,10.1002/jps.20686,1,1298,1324,Published by Elsevier Inc.,"[('Published', 'VBN'), ('by', 'IN'), ('Elsevie...",,[],[],,,[]
2,10.1002/jps.20686,2,1325,1345,All rights reserved.,"[('All', 'DT'), ('rights', 'NNS'), ('reserved'...",,[],[],,,[]
3,10.1002/jps.20686,3,1346,9469,KINETICANALYSESFORSPECIESDIFFERENCESINPGLYCOPR...,[('KINETICANALYSESFORSPECIESDIFFERENCESINPGLYC...,,[],"[{'text': 'Diltiazem', 'start': 1718, 'end': 1...","Diltiazem,, Cyclosporin%20A,, Dexamethasone",COc1ccc(cc1)[C@@H]2Sc3ccccc3N(CCN(C)C)C(=O)[C@...,"[('Diltiazem', 'COc1ccc(cc1)[C@@H]2Sc3ccccc3N(..."
4,10.1002/jps.20686,4,9470,9599,Immunoblot analyses of P-gp expressed in MDR1 ...,"[('Immunoblot', 'NN'), ('analyses', 'NNS'), ('...",,[],"[{'text': 'H241', 'start': 9594, 'end': 9598, ...",H241,C[C@]12C[C@](O)(C[C@@H]1CC[C@@H]3[C@@H]2CC[C@@...,"[('H241', 'C[C@]12C[C@](O)(C[C@@H]1CC[C@@H]3[C..."
...,...,...,...,...,...,...,...,...,...,...,...,...
952542,10.1263/jbb.99.623,154,29565,29683,A novel ATP regeneration system using polyphos...,"[('A', 'DT'), ('novel', 'JJ'), ('ATP', 'NN'), ...","phosphotransferase, kinase","[('phosphotransferase', 9, 9), ('kinase', 12, ...","[{'text': 'ATP', 'start': 29573, 'end': 29576,...",ATP,Nc1ncnc2n(cnc12)C3OC(CO[P](O)(=O)O[P](O)(=O)O[...,"[('ATP', 'Nc1ncnc2n(cnc12)C3OC(CO[P](O)(=O)O[P..."
952543,10.1263/jbb.99.623,155,29684,29723,91 2001 557 563 20 Fujio T. Maruyama A.,"[('91', 'CD'), ('2001', 'CD'), ('557', 'CD'), ...",,[],[],,,[]
952544,10.1263/jbb.99.623,156,29724,29731,Mori H.,"[('Mori', 'NNP'), ('H', 'NNP'), ('.', '.')]",,[],"[{'text': 'H', 'start': 29729, 'end': 29730, '...",H,[H],"[('H', '[H]')]"
952545,10.1263/jbb.99.623,157,29732,29882,Production of useful substances by the couplin...,"[('Production', 'NN'), ('of', 'IN'), ('useful'...",,[],"[{'text': 'ATP', 'start': 29797, 'end': 29800,...","ATP,, ATP",Nc1ncnc2n(cnc12)C3OC(CO[P](O)(=O)O[P](O)(=O)O[...,"[('ATP', 'Nc1ncnc2n(cnc12)C3OC(CO[P](O)(=O)O[P..."


In [323]:
uncleaned_no_na_df = uncleaned_df.dropna()
uncleaned_several_chem_df = uncleaned_no_na_df.loc[uncleaned_no_na_df["chemical_names"].str.contains(",")]
uncleaned_several_chem_df = uncleaned_several_chem_df[["sentence", "chemical_names"]]

In [324]:
uncleaned_several_chem_df["sentence"] = uncleaned_several_chem_df["sentence"].apply(sentence_cleaned)
uncleaned_several_chem_df["chemical_names"] = uncleaned_several_chem_df["chemical_names"].apply(chem_into_array)

In [325]:
uncleaned_several_chem_df["chemical_names"]

150       [calcein-am, cyclosporin a, dexamethasone, dil...
271       [sodium dodecyl sulfate, geranyl pyrophosphate...
280                                                    [h+]
281                                                    [h+]
286                                                    [h+]
                                ...                        
952414                    [vancomycin, d-alanine-d-alanine]
952450                         [tetrahydrofurane, methanol]
952517                                      [sucrose, kato]
952526                     [d-alanyl-d-alanine, vancomycin]
952541                                [adenylate, amp, atp]
Name: chemical_names, Length: 50212, dtype: object

In [326]:
re_check_several_chem_uncleaned_df = uncleaned_several_chem_df.loc[uncleaned_several_chem_df["chemical_names"].str.len() > 1]

In [327]:
re_check_several_chem_uncleaned_df

Unnamed: 0,sentence,chemical_names
150,"we selected diltiazem, cyclosporin a, and dexa...","[calcein-am, cyclosporin a, dexamethasone, dil..."
271,"croteau r washington state univ, inst biol che...","[sodium dodecyl sulfate, geranyl pyrophosphate..."
295,chemicalmodificationschickenliverpyruvatecarbo...,"[lysine, pyruvate, cysteine]"
296,"ash de temple univ, hlth sci ctr, sch med, dep...","[pyruvate, cysteine, lysine, o-phthalaldehyde,..."
298,at a one- to two-fold molar excess over active...,"[pyruvate, oxaloacetate, adp]"
...,...,...
952414,key words d-alanine-d-alanine ligase d-amino a...,"[vancomycin, d-alanine-d-alanine]"
952450,the mobile phase consisted of a linear gradien...,"[tetrahydrofurane, methanol]"
952517,weber v falkenhagen d subpol a novel sucrose-b...,"[sucrose, kato]"
952526,active-site mutants of the vanc2 d-alanyl-d-se...,"[d-alanyl-d-alanine, vancomycin]"


In [328]:
re_check_several_chem_uncleaned_df["text"] = re_check_several_chem_uncleaned_df[["sentence", "chemical_names"]].values.tolist()
re_check_several_chem_uncleaned_df.iloc[47]["text"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  re_check_several_chem_uncleaned_df["text"] = re_check_several_chem_uncleaned_df[["sentence", "chemical_names"]].values.tolist()


['univ sydney, dept pharm, sydney, nsw 2006, australia abstract two distinct mg2+-atpase activities were isolated from triton x-100-solubilized human erythrocyte membranes using a combination of calmodulin-agarose to remove ca2+-atpase and ion exchange chromatography to separate the mg2+-atpase activities',
 ['ca2+', 'mg2+']]

In [70]:
for index in range(500, 520):
    sentence = re_check_several_chem_uncleaned_df["text"].iloc[index]
    print(arguments)
    print("general chemicals: " + str(general_chemical(arguments)))
    print("adjacent mentions: " + str(adjacent_mentions(arguments)))
    print("sep or: " + str(sep_or(arguments)))
    print("sep and: " + str(sep_and(arguments)))
    print("sep comma: " + str(sep_comma(arguments)))
    print("sep via: " + str(sep_via(arguments)))
    print("sep sym: " + str(sep_sym(arguments)))
    print("followed ase: " + str(followed_ase(arguments)))
    print("group: " + str(group(arguments)))
    print("followed by noun: " + str(followed_by_noun(arguments)))
    print("sep verb: " + str(sep_verb(arguments)))
    print("sep adverb: " + str(sep_adverb(arguments)))
    print("includes reaction words: " + str(includes_reaction_words(arguments)))
    print("includes reaction component words: " + str(includes_reaction_component_words(arguments)))
    print("sep conversion words: " + str(sep_converstion_words(arguments)))
    print()

['6-dhsg was metabolised by gsh to form a gsh conjugate gs-6-dhsg in raw 2647 cells, via a potential mechanism involving the catalytic activity of glutathione-s-transferase gst', ['gsh', '6-dhsg', 'gs-6-dhsg']]
general chemicals: -1
adjacent mentions: -1
sep or: -1
sep and: -1
sep comma: -1
sep via: -1
sep sym: -1
followed ase: -1
group: -1
followed by noun: False
sep verb: True
sep adverb: -1
includes reaction words: True
includes reaction component words: -1
sep conversion words: True

['6-dhsg was metabolised by gsh to form a gsh conjugate gs-6-dhsg in raw 2647 cells, via a potential mechanism involving the catalytic activity of glutathione-s-transferase gst', ['gsh', '6-dhsg', 'gs-6-dhsg']]
general chemicals: -1
adjacent mentions: -1
sep or: -1
sep and: -1
sep comma: -1
sep via: -1
sep sym: -1
followed ase: -1
group: -1
followed by noun: False
sep verb: True
sep adverb: -1
includes reaction words: True
includes reaction component words: -1
sep conversion words: True

['6-dhsg was m

### snorkel code for model

In [68]:
from snorkel.labeling import PandasLFApplier

In [69]:
sentence_chem_df.head()

Unnamed: 0,sentence,chemicals,truth,substrates,products,text
0,the enzyme cyclo-oxygenase catalyses the oxyge...,[prostaglandins],1,"arachidonic, acid",prostaglandins,[the enzyme cyclo-oxygenase catalyses the oxyg...
3,hcox-1 had a specific activity of 188 mumol of...,[arachidonate],0,arachidonate,,[hcox-1 had a specific activity of 188 mumol o...
4,"of 1500 nmol of o2/nmol of enzyme, whereas hco...",[arachidonate],0,arachidonate,,"[of 1500 nmol of o2/nmol of enzyme, whereas hc..."
6,"indomethacin inhibited both hcox-1 and hcox-2,...","[indomethacin, ns-398, dup-697]",0,,,[indomethacin inhibited both hcox-1 and hcox-2...
7,both ns-398 and dup-697 exhibited time-depende...,"[indomethacin, ns-398, dup-697]",0,,,[both ns-398 and dup-697 exhibited time-depend...


In [70]:
# how to find the location
sentence_chem_df[sentence_chem_df["sentence"] == "even though the activities of mat and gnmt were elevated, the concentration of liver s-adenosylmethionine was decreased 24%, p<0001 and s-adenosylhomocysteine increased 113%, p<0001 in the dwarf mice"]

Unnamed: 0,sentence,chemicals,truth,substrates,products,text
318,even though the activities of mat and gnmt wer...,"[s-adenosylmethionine, s-adenosylhomocysteine]",1,S-adenosylmethionine,S-adenosylhomocysteine,[even though the activities of mat and gnmt we...


In [71]:
# Define the set of labeling functions (LFs)
# currently excluding sep_and
lfs = [solution_words, mixture_words, physical_words, gene_words, structural_words, general_chemical, group,
    paper_artifacts, no_terms, adjacent_mentions, sep_or, sep_comma, sep_via,
    sep_sym, followed_ase, followed_by_noun, sep_verb, sep_adverb, includes_oxidation_words,
    includes_combustion_words, includes_neutralization_words, includes_catalyze_words, includes_combination_words,
    includes_decomposition_words, includes_replace_words, includes_reaction_words,
    includes_reaction_component_words, comparison, sep_converstion_words, num_chemicals, includes_react,
    measure, experiment, includes_concentration]

# Apply the LFs to the unlabeled training data
df_train = sentence_chem_df
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df_train)

100%|██████████| 901/901 [00:06<00:00, 147.46it/s]


In [72]:
L_train

array([[-1, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1],
       ...,
       [-1, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1]])

In [73]:
from snorkel.labeling import LFAnalysis

LFAnalysis(L=L_train, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
solution_words,0,[0],0.013319,0.013319,0.013319
mixture_words,1,[0],0.00111,0.00111,0.00111
physical_words,2,[0],0.08657,0.08657,0.082131
gene_words,3,[0],0.197558,0.197558,0.173141
structural_words,4,[0],0.18535,0.18535,0.170921
general_chemical,5,[0],0.326304,0.326304,0.305216
group,6,[0],0.239734,0.239734,0.221976
paper_artifacts,7,[0],0.047725,0.047725,0.042175
no_terms,8,[0],0.240844,0.240844,0.217536
adjacent_mentions,9,[0],0.057714,0.057714,0.051054


In [74]:
from snorkel.labeling.model import MajorityLabelVoter

In [75]:
majority_model = MajorityLabelVoter()
df_train["label"] = majority_model.predict(L=L_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["label"] = majority_model.predict(L=L_train)


In [76]:
df_train["label"].value_counts()

 0    716
-1    116
 1     69
Name: label, dtype: int64

In [77]:
df_train.to_csv(r'../labeled.csv')

In [78]:
df_train["truth"].value_counts()

0    866
1     35
Name: truth, dtype: int64

In [79]:
import sklearn as sk
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

In [80]:
no_abstain_df = df_train[df_train["label"] != -1]
sk.metrics.f1_score(no_abstain_df["truth"], no_abstain_df["label"], average="weighted")

0.9327217168474462

In [82]:
recall_score(no_abstain_df["truth"], no_abstain_df["label"], average="weighted")

0.9146496815286624

In [83]:
precision_score(no_abstain_df["truth"], no_abstain_df["label"], average="weighted")

0.9573945120099097

In [81]:
sk.metrics.f1_score(df_train["truth"], df_train["label"], average="weighted")

0.8658979190787506

In [89]:
recall_score(df_train["truth"], df_train["label"], average="weighted")

  _warn_prf(average, modifier, msg_start, len(result))


0.7968923418423973

In [90]:
precision_score(df_train["truth"], df_train["label"], average="weighted")

0.952927283901127

In [84]:
df_train[df_train["label"] != -1]["truth"].value_counts()

0    759
1     26
Name: truth, dtype: int64

In [85]:
df_train[df_train["label"] != -1]["label"].value_counts()

0    716
1     69
Name: label, dtype: int64

In [86]:
matched_df = df_train[df_train["label"] == df_train["truth"]]
matched_df["label"].value_counts()

0    704
1     14
Name: label, dtype: int64

In [87]:
df_train[df_train["label"] == -1]["truth"].value_counts()

0    107
1      9
Name: truth, dtype: int64

In [88]:
df_train[df_train["label"] != -1]["truth"].value_counts()

0    759
1     26
Name: truth, dtype: int64