# CheRMiT Snorkel Pipeline

This notebook allows you to create a Snorkel classifier, run it on data, and analyze the results. Just run the cells in order!

Precondition: You need the input dataframes to follow a particular order (sentences in the 0th column and chemical lists in the 1st column), because the labeling functions use those indices to access the columns.

### Imports

In [162]:
import pandas as pd
import numpy as np
import os
import nltk
import regex as re
import ast
from collections import defaultdict
from snorkel.labeling import labeling_function

### Setup for Labeling Functions

In [3]:
ABSTAIN = -1

In [4]:
all_lfs = []

### Labeling Functions + small tests written

In [80]:
# includes_solution_words REGEX
# If any of the words is in a list of solution terms, we label FALSE
# https://www.hach.com/chemGlossary   

solution_terms = "buffer|dilute|diluent|dissolv|miscib|\\bsolute|solvent" + \
    "|solub|saturat|saturable|aqueous|gaseous|solid"
@labeling_function()
def includes_solution_words(x):
    structure = "(" + solution_terms + ")"
    if (re.search(structure, x[0])):
        return False
    return ABSTAIN

all_lfs.append(includes_solution_words)



In [83]:
#includes_experiment_words
#entropy enthalpy temperature
#"filter|filtr"

In [84]:
# includes_mixture_words
# If any of the words is in a list of mixture terms, we label FALSE
# https://www.hach.com/chemGlossary   
mixture_terms = ["suspended", "mixture", "heterogenous", "homogeneous"]
@labeling_function()
def includes_mixture_words(x):
    for word in mixture_terms:
        if (word in x[0]):
            return (False)
    return ABSTAIN

all_lfs.append(includes_mixture_words)

In [85]:
# includes_physical_words
# If there are common physical terms, we label FALSE
physical_terms = ["detection", "composed", "weight", "characteristic", "metal", "express",
    "characterization", "color", "metalic", "consists", "pure", "compose", "assay", "mm", "bound",
    "permeable", "signal", "bind", "property", "stored", "released", "capacity", "resistance", "mol"]
@labeling_function()
def includes_physical_words(x):
    for word in physical_terms:
        if (word in x[0]):
            return (False)
    return ABSTAIN

all_lfs.append(includes_physical_words)

In [86]:
# includes_genetic_words
# If one of the chemicals is in a list of genetic terms such as reductase, dna, gene, we label FALSE
genetic_terms = ["reductase", "dna", "gene", "allele", "locus", "genotype", "phenotype", "dominant", "recessive", "additive", "phenoset",
    "diallelic", "multiallelic", "polyallelic", "monomorphic", "monoallelic", "polymorphism", "complex", "trait", "multifactorial",
    "polygenic", "monogenic", "mixed model", "transmission probability", "transition probability", "epistasis", "interaction", "pleiotropy",
    "quantitative trait locus", "probit", "logit", "penetrance", "transformation", "scale of measurement", "identity by descent", "identity in state",
    "haplotype", "phase", "multilocus", "genotype", "allelic", "association", "linkage", "disequilibrium", "gametic", "rna"]
@labeling_function()
def includes_genetic_words(x):
    for word in genetic_terms:
        if (word in x[0]):
            return False
    return ABSTAIN

all_lfs.append(includes_genetic_words)
    

In [87]:
# includes_structural_words
# If there are common structural terms, we label FALSE
structural_terms = ["loop", "sequence", "encodes", "code", "codon", "dna", "rna", "pair", "group", "active site",
    "bond", "chain", "gene", "structure", "structural", "encoding", "cdna", "cluster"]
@labeling_function()
def includes_structural_words(x):
    for word in structural_terms:
        if (word in x[0]):
            return (False)
    return ABSTAIN

all_lfs.append(includes_structural_words)

In [88]:
# includes_general_chemical_words
# If any of the words is in a list of common chemical terms such as amino acid, sugar, adenosine, amide, adenine, etc..., we label FALSE
# https://www.hach.com/chemGlossary   
chemical_terms = ["amino acid", "sugar", "adenosine", "amide", "adenine", "algaecide", "amines", "base", "biocides",
    "clarifier", "hydrocarbon", "molecule", "nutrients", "polymer", "peptide", "polypeptide",
    "tag", "functional", "electron", "cofactor", "gas"]
@labeling_function()
def includes_general_chemical_words(x):
    for word in chemical_terms:
        if (word in x[0]):
            return (False)
    return ABSTAIN

all_lfs.append(includes_general_chemical_words)

In [89]:
# includes_functional_group
# If there is a close mention of a functional chemical group, we label FALSE (as it is more likely descriptive of a structure than of a reaction)
# https://www.masterorganicchemistry.com/2010/10/06/functional-groups-organic-chemistry/
common_functional_groups = ["alkane", "alkene", "alkyne", "benzene ring", "phenyl", "amine", "alcohol", "ether", "alkyl halide", "thiol",
    "aldehyde", "ketone", "ester", "carboxylic acid", "amide", "nitrile", "epoxide", "disulfide", "imine", "acid chloride", "anhydride", "nitro",
    "sulfide", "thioether", "group", "functional"]
@labeling_function()
def includes_functional_group(x):
    for group in common_functional_groups:
        if (group in x[0]):
            return False
    return ABSTAIN

all_lfs.append(includes_functional_group)


In [90]:
# includes_amino_acid
# If there is an amino acid mentioned, we label FALSE
amino_acids = ['val', 'ile', 'leu', 'glu', 'gln', \
    'asp', 'asn' 'his' 'trp', 'phe', 'tyr',    \
    'arg', 'lys', 'ser', 'thr', 'met', 'ala',    \
    'gly', 'pro', 'cys', "amino"]
@labeling_function()
def includes_amino_acid(x):
    for group in amino_acids:
        if (group in x[0]):
            return False
    return ABSTAIN

all_lfs.append(includes_amino_acid)


In [91]:
# includes_paper_artifacts
# If there are common words from paper headers/footers, we label FALSE
common_terms = ["university", "univ", "pharma", "avenue", "street", "road", "department", "usa", "reference", "ref",
    "keyword", "article", "http", "png", "jpg", "journal", "(20", "(19"]
@labeling_function()
def includes_paper_artifacts(x):
    for word in common_terms:
        if (word in x[0]):
            return (False)
    return ABSTAIN

all_lfs.append(includes_paper_artifacts)

In [92]:
# includes_no_terms
# If there are words stopping or with no change, we label FALSE
common_no_terms = ["inactivated", "unaffected", "inactive", "inactivates", "stops", "prevent", "inhibit",
    "denature", "block"]
@labeling_function()
def includes_no_terms(x):
    for word in common_no_terms:
        if (word in x[0]):
            return (False)
    return ABSTAIN

all_lfs.append(includes_no_terms)

In [93]:
# puts chemicals separated by or for regex structures
def helper_sep_chems_with_or(chemicals):
    final = ""
    for chem in chemicals:
        if (final == ""):
            final += re.escape(chem)
        else:
            final += "|" + re.escape(chem)
    return final

In [94]:
# structure_adjacent_mentions
# If the chemicals are adjacent, we label FALSE
@labeling_function()
def structure_adjacent_mentions(x):
    chemicals = helper_sep_chems_with_or(x[1])
    structure = "(" + chemicals + ") (" + chemicals + ")"
    if (re.search(structure, x[0])):
        return False
    return ABSTAIN

all_lfs.append(structure_adjacent_mentions)

In [95]:
# structure_sep_or
# If the chemicals are separated by or, we label FALSE
@labeling_function()
def structure_sep_or(x):
    chemicals = helper_sep_chems_with_or(x[1])
    structure = "(" + chemicals + ") or (" + chemicals + ")"
    if (re.search(structure, x[0])):
        return False
    return ABSTAIN

all_lfs.append(structure_sep_or)

In [96]:
# structure_sep_and
# If the chemicals are separated by and, we label FALSE
@labeling_function()
def structure_sep_and(x):
    chemicals = helper_sep_chems_with_or(x[1])
    structure = "(" + chemicals + ") and (" + chemicals + ")"
    if (re.search(structure, x[0])):
        return False
    return ABSTAIN

all_lfs.append(structure_sep_and)

In [97]:
# structure_sep_comma
# If the chemicals are separated by a comma, we label FALSE
@labeling_function()
def structure_sep_comma(x):
    chemicals = helper_sep_chems_with_or(x[1])
    structure = "(" + chemicals + "), (" + chemicals + ")"
    if (re.search(structure, x[0])):
        return False
    return ABSTAIN

all_lfs.append(structure_sep_comma)

In [98]:
# LF_sep_via
# If the chemicals are separated by via, in, etc, we label FALSE
via_terms = "via|in"
@labeling_function()
def structure_sep_via(x):
    chemicals = helper_sep_chems_with_or(x[1])
    structure = "(" + chemicals + ") (" + via_terms + ") (" + chemicals + ")"
    if (re.search(structure, x[0])):
        return False
    return ABSTAIN

all_lfs.append(structure_sep_via)

In [99]:
# structure_sep_sym
# If the chemicals are separated by a single character, we label FALSE  
@labeling_function()
def structure_sep_sym(x):
    chemicals = helper_sep_chems_with_or(x[1])
    structure = "(" + chemicals + ").(" + chemicals + ")"
    if (re.search(structure, x[0])):
        return False
    return ABSTAIN

all_lfs.append(structure_sep_via)

In [100]:
# structure_followed_by_ase
# If one of the chemicals is followed by a word that ends with -ase, we label FALSE
@labeling_function()
def structure_followed_by_ase(x):
    chemicals = helper_sep_chems_with_or(x[1])
    structure = "(" + chemicals + r") \w*ase\b"
    if (re.search(structure, x[0])):
        return False
    return ABSTAIN

all_lfs.append(structure_followed_by_ase)

In [101]:
# structure_followed_by_noun
# If the first chemical is followed by a noun, we label FALSE
@labeling_function()
def structure_followed_by_noun(x):
    sentence = x[0].replace(',', '')
    chemicals = x[1]
    chem_index = len(sentence)
    chem = ""
    for chemical in chemicals:
        curr_index = sentence.find(chemical)
        if (curr_index < chem_index and curr_index != -1):
            chem_index = curr_index
            chem = chemical
    if (chem == ""):
        return ABSTAIN
    structure = "(" + chem + r") \w+"
    for match in re.finditer(structure, sentence):
        if (nltk.pos_tag([match.group(0).split()[1]])[0][1] == "NN"):
            return False
    return ABSTAIN

all_lfs.append(structure_followed_by_noun)

In [102]:
structure_followed_by_noun(["cassie cs ate", ["cs", "sdfd"]])

False

In [103]:
# structure_sep_verb
# If the chemicals are separated by a verb, we label TRUE
@labeling_function()
def structure_sep_verb(x):
    sentence = x[0].replace(',', '')
    chemicals = helper_sep_chems_with_or(x[1])
    if (chemicals == ""):
        return ABSTAIN
    structure = "(" + chemicals + r") \w+ (" + chemicals + ")"
    for match in re.finditer(structure, sentence):
        if (re.match(r"(\bVB|NNS)", nltk.pos_tag([match.group(0).split()[1]])[0][1])):
            return True
    return ABSTAIN

all_lfs.append(structure_sep_verb)

In [104]:
# structure_sep_adverb
# If the chemicals are separated by a adverb, we label FALSE
@labeling_function()
def structure_sep_adverb(x):
    sentence = x[0].replace(',', '')
    chemicals = helper_sep_chems_with_or(x[1])
    if (chemicals == ""):
        return ABSTAIN
    structure = "(" + chemicals + r") \w+ (" + chemicals + ")"
    for match in re.finditer(structure, sentence):
        if (re.match(r"(\bRB)", nltk.pos_tag([match.group(0).split()[1]])[0][1])):
            return False
    return ABSTAIN

all_lfs.append(structure_sep_adverb)

In [105]:
structure_sep_adverb(["cassie crazily night away", ["cassie", "night"]])

False

In [106]:
# includes_oxidation_words
# If the sentence contains oxidation words, we label True
oxidation_terms = "oxidiz|oxidis|redox|reduc|rust|corrod|oxygen"
@labeling_function()
def includes_oxidation_words(x):
    structure = "(" + oxidation_terms + ")"
    if (re.search(structure, x[0])):
        return True
    return ABSTAIN

all_lfs.append(includes_oxidation_words)

In [107]:
# includes_combustion_words
# If the sentence contains combustion words, we label True
combustion_terms = "combust|burn|explod|gas-form"
@labeling_function()
def includes_combustion_words(x):
    structure = "(" + combustion_terms + ")"
    if (re.search(structure, x[0])):
        return True
    return ABSTAIN

all_lfs.append(includes_combustion_words)

In [108]:
# includes_neutralization_words
# If the sentence contains neutralization words, we label True
neutralization_terms = "neutraliz|titrat|buffer|gas-form"
@labeling_function()
def includes_neutralization_words(x):
    structure = "(" + neutralization_terms + ")"
    if (re.search(structure, x[0])):
        return True
    return ABSTAIN

all_lfs.append(includes_neutralization_words)

In [109]:
# LF_includes_catalyze_words
# If the sentence contains catalyze words, we label True
# catalyze_words = ["catalyze", "catalyst", "catalyse", "catalysing", "catalyzing"]
# @labeling_function()
# def includes_catalyze_words(x):
#     for word in catalyze_words:
#         if(word in x[0]):
#             return True
#     return ABSTAIN

In [110]:
# includes_combination_words
# If the sentence contains combination words, we label True
combination_terms = "combin"
@labeling_function()
def includes_combination_words(x):
    structure = "(" + combination_terms + ")"
    if (re.search(structure, x[0])):
        return True
    return ABSTAIN

all_lfs.append(includes_combination_words)

In [111]:
# includes_decomposition_words
# If the sentence contains decomposition words, we label True
decomposition_terms = "decompos"
@labeling_function()
def includes_decomposition_words(x):
    structure = "(" + decomposition_terms + ")"
    if (re.search(structure, x[0])):
        return True
    return ABSTAIN

all_lfs.append(includes_decomposition_words)

In [112]:
# includes_replacement_words
# If the sentence contains decomposition words, we label True
replacement_terms = "replac"
@labeling_function()
def includes_replacement_words(x):
    structure = "(" + replacement_terms + ")"
    if (re.search(structure, x[0])):
        return True
    return ABSTAIN

all_lfs.append(includes_replacement_words)

In [113]:
# includes_reaction_words
# If the sentence contains reactions words, we label True
reaction_terms_broad = "conver|yield|produc|mak|creat|synthesiz|synthesis|transform|ferment|break|displac|exchang" + \
                 "|precipit|transfer|through|produc|activat|revers|form|ation|metaboliz|metabolis|generat|hydroly" + \
                 "|methyl|result|modif|revers|from"
@labeling_function()
def includes_reaction_words(x):
    structure = "(" + reaction_terms_broad + ")"
    if (re.search(structure, x[0])):
        return True
    return ABSTAIN

all_lfs.append(includes_reaction_words)

In [114]:
# includes_creation_words
# If the sentence contains creation words, we label True
creation_terms = "produc|creat|synthesis|form"

@labeling_function()
def includes_creation_words(x):
    sentence = x[0].replace(',', '')
    chemicals = helper_sep_chems_with_or(x[1])
    structure = "(" + chemicals + ") (" + creation_terms + ")"
    if (re.search(structure, x[0])):
        return True
    return ABSTAIN

all_lfs.append(includes_creation_words)

In [115]:
# included_impact_terms
# If the sentence contains impact words, we label True
impact_terms = "caus|lead|result"

@labeling_function()
def includes_impact_words(x):
    structure = "(" + impact_terms + ")"
    if (re.search(structure, x[0])):
        return True
    return ABSTAIN

all_lfs.append(includes_impact_words)

In [116]:
# includes_react
# If the sentence contains react, we label True
@labeling_function()
def includes_react(x):
    if("react" in x[0]):
        return True
    return ABSTAIN

all_lfs.append(includes_react)

In [117]:
# includes_react_sym
# If the sentence contains react, we label True
@labeling_function()
def includes_react_sym(x):
    if("-->" in x[0]):
        return True
    return ABSTAIN

all_lfs.append(includes_react_sym)

In [118]:
# includes_reaction_component_words
# If the sentence contains reaction components words, we label True
reaction_component_terms = ["substrate", "product", "reactant", "step"]
@labeling_function()
def includes_reaction_component_words(x):
    for word in reaction_component_terms:
        if(word in x[0]):
            return True
    return ABSTAIN

all_lfs.append(includes_reaction_component_words)

In [119]:
# includes_comparison_words
# If the sentence contains comparison words, we label False
comparison_terms = "similar|more|greater|less|increas|decreas|compar|differ|relativ|better|time|than"
@labeling_function()
def includes_comparison_words(x):
    structure = "(" + comparison_terms + ")"
    if (re.search(structure, x[0])):
        return False
    return ABSTAIN

all_lfs.append(includes_comparison_words)

In [120]:
# includes_concentration
# If the sentence contains react, we label True
@labeling_function()
def includes_concentration(x):
    if("concentration" in x[0]):
        return True
    return ABSTAIN

all_lfs.append(includes_concentration)

In [121]:
# includes_measure_words
# If the sentence contains measure words, we label False
measure_terms = ["high", "low", "ph", "stability", "corelated", "more", "less", "level", "degree", "time",
                "measure"]
@labeling_function()
def includes_measure_words(x):
    for word in measure_terms:
        if(word in x[0]):
            return False
    return ABSTAIN

all_lfs.append(includes_measure_words)

In [122]:
# includes_experiment_words
# If the sentence contains experiement words, we label False
experiment_terms = ["mice", "cell", "mouse", "ovary", "male", "female", "animal", "study", "method",
    "test", "treat", "protection", "brain", "nerve", "human", "tissue", "fetal", "vitro", "studies",
    "membrane", "strain", "mutant", "regulate", "dependent", "drug", "therapy", "oral", "test", "autoantigen"]
@labeling_function()
def includes_experiment_words(x):
    for word in experiment_terms:
        if(word in x[0]):
            return False
    return ABSTAIN

all_lfs.append(includes_experiment_words)

In [123]:
# structure_next_to_conversion_words
# If the sentence contains to, from, into, etc., we label True
conversion_terms_next = "to|from|into|becom|became|by"
@labeling_function()
def structure_next_to_conversion_words(x):
    sentence = x[0].replace(',', '')
    chemicals = helper_sep_chems_with_or(x[1])
    structure = "((" + chemicals + r") (" + conversion_terms_next + r")\b|((" + conversion_terms_next + r")\b (" + chemicals + ")))"
    if (re.search(structure, sentence)):
            return True
    return ABSTAIN

all_lfs.append(structure_next_to_conversion_words)

In [124]:
# structure_sep_conversion_words
# If the sentence contains to, from, into, etc., we label True
conversion_terms_sep = "to|into"
@labeling_function()
def structure_sep_conversion_words(x):
    sentence = x[0].replace(',', '')
    chemicals = helper_sep_chems_with_or(x[1])
    structure = "(" + chemicals + r") (" + conversion_terms_sep + r") (" + chemicals + ")"
    if (re.search(structure, sentence)):
            return True
    return ABSTAIN

all_lfs.append(structure_sep_conversion_words)

In [125]:
# structure_conversion_by
# If the sentence contains to, from, into, etc., we label True
conversion_terms_by = "to|from"
@labeling_function()
def structure_conversion_by(x):
    sentence = x[0].replace(',', '')
    chemicals = helper_sep_chems_with_or(x[1])
    structure = "(" + conversion_terms_by + ") (" + chemicals + ") by"
    if (re.search(structure, sentence)):
            return True
    return ABSTAIN

all_lfs.append(structure_conversion_by)

In [126]:
# structure_conversion_of
# If the sentence contains to, from, into, etc., we label True
conversion_terms_of = "ation of|sion of|ism of"
@labeling_function()
def structure_conversion_of(x):
    sentence = x[0].replace(',', '')
    chemicals = helper_sep_chems_with_or(x[1])
    structure = "((" + conversion_terms_of + ") (" + chemicals + ")|of (" + chemicals + ") (" + conversion_terms_sep + "))"
    if (re.search(structure, sentence)):
            return True
    return ABSTAIN

all_lfs.append(structure_conversion_of)

In [127]:
includes_impact_words(["the enzyme cyclo-oxygenase catalyses the oxygenation of arachidonic acid, leading to the formation of prostaglandins", ['prostaglandins']])

True

In [128]:
# includes_one_chem
# If the sentence contains one identified chemical, we label False
@labeling_function()
def includes_one_chem(x):
    if(len(x[1]) == 1):
            return False
    return ABSTAIN

all_lfs.append(includes_one_chem)

In [129]:
# # includes_more_than_one_chem
# # If the sentence contains more than one identified chemical, we label True
# @labeling_function()
# def includes_more_than_one_chem(x):
#     if(len(x[1]) > 1):
#             return True
#     return ABSTAIN

# all_lfs.append(includes_more_than_one_chem)

In [130]:
# Phil's version cleaned
# structure_jtsui_pattern_1
# MODIFIED STRUCTURE AND CHANGED ADJ NUM TO 0-3
# If part of the sentence contains the specific structure
# [trigger1] <0,3> chemical [transition] <0,3> chemical, we label True

TRANS = "from|to|into|by|are|yield"
TRIG1 = "phosphoryl|condens|hydrolys|metabol|reduc|conver|produc|form|oxid|transform|bioconver|synthes|react|interconver"
TRANS_p = "(" + TRANS + ")"
TRIG1_p = "(" + TRIG1 + ")"

@labeling_function()
def structure_jtsui_pattern_1(x):
    chemicals = helper_sep_chems_with_or(x[1])
    chemicals_p = "(" + chemicals + ")"

    structure = r"\b" + r"{}".format(TRIG1_p) + r"\w*(\s\w*){0,3}\s" + r"{}".format(chemicals_p) + r"\s" + r"{}".format(TRANS_p) + r"(\s\w*){0,3}\s" + r"{}".format(chemicals_p)

    if (re.search(structure, x[0])):
        return True
    return ABSTAIN

In [131]:
# Phil's version cleaned
# structure_jtsui_pattern_1
# MODIFIED STRUCTURE AND CHANGED ADJ NUM TO 0-3
# If part of the sentence contains the specific structure
# [trigger1] <0,3> chemical [transition] <0,3> chemical, we label True

TRANS = "from|to|into|by|are|yield"
TRIG1 = "phosphoryl|condens|hydrolys|metabol|reduc|conver|produc|form|oxid|transform|bioconver|synthes|react|interconver"
TRANS_p = "(" + TRANS + ")"
TRIG1_p = "(" + TRIG1 + ")"

@labeling_function()
def structure_jtsui_pattern_1_copy(x):
    chemicals = helper_sep_chems_with_or(x[1])
    chemicals_p = "(" + chemicals + ")"

    structure = r"\b" + r"{}".format(TRIG1_p) + r"\w*(\s\w*){0,3}\s" + r"{}".format(chemicals_p) + r"\s" + r"{}".format(TRANS_p) + r"(\s\w*){0,3}\s" + r"{}".format(chemicals_p)

    if (re.search(structure, x[0])):
        return True
    return ABSTAIN

### final regex LFs ###

In [132]:
# PHIL'S VERSIONS
# structure_jtsui_pattern_2
# If part of the sentence contains the specific structure
# chemical <0,1> [trigger2] <0,1> [transition] <0,1> chemical, we label True
TRIG2 = 'conver|oxid|produc|interconver'
TRIG2_p = "(" + TRIG2 + ")"
@labeling_function()
def structure_jtsui_pattern_2(x):
    chemicals = helper_sep_chems_with_or(x[1])
    chemicals_p = "(" + chemicals + ")"

#                           chemical                   <0,1> space             [trigger2]         <0,1>  space               [transition]           <0,1>  space         chemical          
    structure = r"\b" + r"{}".format(chemicals_p) + r"(\s\w*){0,1}\s\w*" + r"{}".format(TRIG2_p) + r"\w*(\s\w*){0,1}\s" + r"{}".format(TRANS_p) + r"(\s\w*){0,1}\s" + r"{}".format(chemicals_p)

    if (re.search(structure, x[0])):
        return True
    return ABSTAIN


# structure_jtsui_pattern_3
# If part of the sentence contains the specific structure
# chemical [trigger3] <0,1> chemical, we label True
TRIG3 = 'yield|generat'
TRIG3_p = "(" + TRIG3 + ")"
@labeling_function()
def structure_jtsui_pattern_3(x):

    chemicals = helper_sep_chems_with_or(x[1])
    chemicals_p = "(" + chemicals + ")"
#                           chemical               space             [trigger3]         <0,1>  space             chemical          
    structure = r"\b" + r"{}".format(chemicals_p) + r"\s\w*" + r"{}".format(TRIG3_p) + r"\w*(\s\w*){0,1}\s" + r"{}".format(chemicals_p)

    if (re.search(structure, x[0])):
        return True
    return ABSTAIN



# structure_jtsui_pattern_4
# If part of the sentence contains the specific structure
# [trigger4] <0,1> chemical, we label True
TRIG4 = 'conver|interconver'
TRIG4_p = "(" + TRIG4 + ")"
@labeling_function()
def structure_jtsui_pattern_4(x):

    chemicals = helper_sep_chems_with_or(x[1])
    chemicals_p = "(" + chemicals + ")"

    structure = r"\b\w*" + r"{}".format(TRIG4_p) + r"\w*(\s\w*){0,1}\s" + r"{}".format(chemicals_p) + r"\b"

    if (re.search(structure, x[0])):
        return True
    return ABSTAIN


# structure_jtsui_pattern_5
# If part of the sentence contains the specific structure
# chemical [transition5] <0,1> [trigger5] <0,1> chemical, we label True
TRIG5 = 'produc|metaboli'
TRIG5_p = "(" + TRIG5 + ")"
TRANS5 = 'is|are'
TRANS5_p = "(" + TRANS5 + ")"
@labeling_function()
def structure_jtsui_pattern_5(x):
    chemicals = helper_sep_chems_with_or(x[1])
    chemicals_p = "(" + chemicals + ")"
#                           chemical               space             [transition5]         <0,1>  space             [trigger5]             <0,1> space            chemical      
    structure = r"\b" + r"{}".format(chemicals_p) + r"\s\w*" + r"{}".format(TRANS5_p) + r"\w*(\s\w*){0,1}\s\w*" + r"{}".format(TRIG5_p) + r"\w*(\s\w*){0,1}\s" + r"{}".format(chemicals_p)

    if (re.search(structure, x[0])):
        return True
    return ABSTAIN

In [133]:
# PHIL'S VERSIONS
# structure_jtsui_pattern_2
# If part of the sentence contains the specific structure
# chemical <0,1> [trigger2] <0,1> [transition] <0,1> chemical, we label True
TRIG2 = 'conver|oxid|produc|interconver'
TRIG2_p = "(" + TRIG2 + ")"
@labeling_function()
def structure_jtsui_pattern_2_copy(x):
    chemicals = helper_sep_chems_with_or(x[1])
    chemicals_p = "(" + chemicals + ")"

#                           chemical                   <0,1> space             [trigger2]         <0,1>  space               [transition]           <0,1>  space         chemical          
    structure = r"\b" + r"{}".format(chemicals_p) + r"(\s\w*){0,1}\s\w*" + r"{}".format(TRIG2_p) + r"\w*(\s\w*){0,1}\s" + r"{}".format(TRANS_p) + r"(\s\w*){0,1}\s" + r"{}".format(chemicals_p)

    if (re.search(structure, x[0])):
        return True
    return ABSTAIN


# structure_jtsui_pattern_3
# If part of the sentence contains the specific structure
# chemical [trigger3] <0,1> chemical, we label True
TRIG3 = 'yield|generat'
TRIG3_p = "(" + TRIG3 + ")"
@labeling_function()
def structure_jtsui_pattern_3_copy(x):

    chemicals = helper_sep_chems_with_or(x[1])
    chemicals_p = "(" + chemicals + ")"
#                           chemical               space             [trigger3]         <0,1>  space             chemical          
    structure = r"\b" + r"{}".format(chemicals_p) + r"\s\w*" + r"{}".format(TRIG3_p) + r"\w*(\s\w*){0,1}\s" + r"{}".format(chemicals_p)

    if (re.search(structure, x[0])):
        return True
    return ABSTAIN



# structure_jtsui_pattern_4
# If part of the sentence contains the specific structure
# [trigger4] <0,1> chemical, we label True
TRIG4 = 'conver|interconver'
TRIG4_p = "(" + TRIG4 + ")"
@labeling_function()
def structure_jtsui_pattern_4_copy(x):

    chemicals = helper_sep_chems_with_or(x[1])
    chemicals_p = "(" + chemicals + ")"

    structure = r"\b\w*" + r"{}".format(TRIG4_p) + r"\w*(\s\w*){0,1}\s" + r"{}".format(chemicals_p) + r"\b"

    if (re.search(structure, x[0])):
        return True
    return ABSTAIN


# structure_jtsui_pattern_5
# If part of the sentence contains the specific structure
# chemical [transition5] <0,1> [trigger5] <0,1> chemical, we label True
TRIG5 = 'produc|metaboli'
TRIG5_p = "(" + TRIG5 + ")"
TRANS5 = 'is|are'
TRANS5_p = "(" + TRANS5 + ")"
@labeling_function()
def structure_jtsui_pattern_5_copy(x):
    chemicals = helper_sep_chems_with_or(x[1])
    chemicals_p = "(" + chemicals + ")"
#                           chemical               space             [transition5]         <0,1>  space             [trigger5]             <0,1> space            chemical      
    structure = r"\b" + r"{}".format(chemicals_p) + r"\s\w*" + r"{}".format(TRANS5_p) + r"\w*(\s\w*){0,1}\s\w*" + r"{}".format(TRIG5_p) + r"\w*(\s\w*){0,1}\s" + r"{}".format(chemicals_p)

    if (re.search(structure, x[0])):
        return True
    return ABSTAIN

In [134]:
# includes_interconvert_sym
# If the sentence contains the interconvert symbol, we label TRUE
#
# POTENTIAL CHANGE: symbol must be inbetween two chemicals?
@labeling_function()
def includes_interconvert_sym(x):
    if("<->" in x[0]):
        return True
    if("<-->" in x[0]):
        return True
    return ABSTAIN

all_lfs.append(includes_interconvert_sym)

In [135]:
# LF_includes_emzyme_words
# If the sentence contains enzyme words, we label True
enzyme_words = ["enzyme", "enzymes"]
@labeling_function()
def includes_emzyze_words(x):
    for word in enzyme_words:
        if(word in x[0]):
            return True
    return ABSTAIN

# LF_includes_catalyze_words
# If the sentence contains catalyze words, we label True
catalyze_words = ["catalyze", "catalyst", "catalyse", "catalysing", "catalyzing"]
@labeling_function()
def includes_catalyze_words(x):
    for word in catalyze_words:
        if(word in x[0]):
            return True
    return ABSTAIN

# structure_followed_by_ase
# If one of the chemicals is followed by a word that ends with -ase, we label FALSE
@labeling_function()
def structure_followed_by_ase(x):
    chemicals = helper_sep_chems_with_or(x[1])
    structure = "(" + chemicals + r") \w*ase\b"
    if (re.search(structure, x[0])):
        return False
    return ABSTAIN

all_lfs.append(structure_followed_by_ase)

In [136]:
# includes_reaction_words
# If the sentence contains reactions words, we label True
reaction_terms_specific = "conver|yield|synthesiz|synthesis|oxid|reduc|phosphorylat" + \
                 "|form|metaboliz|metabolis|generat|hydroly" + \
                 "|methylat|brominat|aminat|dehydrat|condensat|degradat|decompos|carboxylat"
@labeling_function()
def includes_reaction_words2(x):
    structure = "(" + reaction_terms_specific + ")"
    if (re.search(structure, x[0])):
        return True
    return ABSTAIN

all_lfs.append(includes_reaction_words2)

# includes_reaction_words
# If the sentence contains reactions words, we label True

@labeling_function()
def includes_reaction_words3(x):
    structure = "(" + reaction_terms_specific + ")"
    if (re.search(structure, x[0])):
        return True
    return ABSTAIN

all_lfs.append(includes_reaction_words3)

# includes_reaction_words
# If the sentence contains reactions words, we label True

@labeling_function()
def includes_reaction_words4(x):
    structure = "(" + reaction_terms_specific + ")"
    if (re.search(structure, x[0])):
        return True
    return ABSTAIN

all_lfs.append(includes_reaction_words4)

# includes_reaction_words
# If the sentence contains reactions words, we label True

@labeling_function()
def includes_reaction_words5(x):
    structure = "(" + reaction_terms_specific + ")"
    if (re.search(structure, x[0])):
        return True
    return ABSTAIN

all_lfs.append(includes_reaction_words5)

### data cleaning stuff

In [56]:
greek_alphabet = {
    u'\u0393': 'Gamma',
    u'\u0394': 'Delta',
    u'\u0398': 'Theta',
    u'\u039B': 'Lamda',
    u'\u039E': 'Xi',
    u'\u03A0': 'Pi',
    u'\u03A3': 'Sigma',
    u'\u03A6': 'Phi',
    u'\u03A7': 'Chi',
    u'\u03A8': 'Psi',
    u'\u03A9': 'Omega',
    u'\u03B1': 'alpha',
    u'\u03B2': 'beta',
    u'\u03B3': 'gamma',
    u'\u03B4': 'delta',
    u'\u03B5': 'epsilon',
    u'\u03B6': 'zeta',
    u'\u03B7': 'eta',
    u'\u03B8': 'theta',
    u'\u03B9': 'iota',
    u'\u03BA': 'kappa',
    u'\u03BB': 'lamda',
    u'\u03BC': 'mu',
    u'\u03BD': 'nu',
    u'\u03BE': 'xi',
    u'\u03BF': 'omicron',
    u'\u03C0': 'pi',
    u'\u03C1': 'rho',
    u'\u03C3': 'sigma',
    u'\u03C4': 'tau',
    u'\u03C5': 'upsilon',
    u'\u03C6': 'phi',
    u'\u03C7': 'chi',
    u'\u03C8': 'psi',
    u'\u03C9': 'omega'
}

In [57]:
def remove_greek(words):
    for letter in greek_alphabet.keys():
        words = words.replace(letter, greek_alphabet[letter])
    return words

In [58]:
def chem_into_array(chemicals):
    if (chemicals == "0"):
        return []
    chemicals = remove_greek(chemicals)
    chemicals = chemicals.lower()
    sample_chems = chemicals.replace("%20", " ")
    sample_chems_list = sample_chems.split(", ")
    for index in range(len(sample_chems_list)):
        while(sample_chems_list[index][-1] == ","):
            sample_chems_list[index] = sample_chems_list[index][:-1]
    sample_chems_list = list(set(sample_chems_list))
    return sample_chems_list

In [59]:
def sentence_cleaned(sentence):
    sentence = remove_greek(sentence)
    sentence = sentence.lower()
    sentence = sentence.replace(".", "")
    sentence = sentence.replace(")", "")
    sentence = sentence.replace("(", "")
    sentence = sentence.replace(";", "")
    sentence = sentence.replace(":", "")
    return sentence

### importing data in and cleaning it (this data has truth values)

In [60]:
# have this csv file in the same folder
sentence_df = pd.read_csv("sentence_data_cleaned_csv_fixed.csv")

In [61]:
sentence_chem_df = sentence_df[["sentence", "chemicals", "truth", "substrates", "products"]]
sentence_chem_df.head()

Unnamed: 0,sentence,chemicals,truth,substrates,products
0,The enzyme cyclo-oxygenase catalyses the oxyge...,"arachidonic acid, prostaglandins",1,"arachidonic, acid",prostaglandins
1,Although the exact site of decarboxylation of ...,"levodopa, dopamine",1,,
2,The enzyme dimethylarginine dimethylaminohydro...,"dimethylarginine, arginine, citrulline, methyl...",1,arginine,
3,BACKGROUND AND AIMS: Glutamic acid decarboxyla...,"glutamate, GABA",1,glutamate,"gamma-aminobutyric, acid, GABA"
4,"Mazindane (26) was found to be a pro-drug, oxi...","Mazindane, 5-H, 5-OH, mazindol",1,,


In [62]:
import math
sentence_chem_df["chemicals"] = sentence_chem_df["chemicals"].replace(np.NaN, "0")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentence_chem_df["chemicals"] = sentence_chem_df["chemicals"].replace(np.NaN, "0")


In [63]:
sentence_chem_df["sentence"] = sentence_chem_df["sentence"].apply(sentence_cleaned)
sentence_chem_df["truth"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentence_chem_df["sentence"] = sentence_chem_df["sentence"].apply(sentence_cleaned)


0    1361
1      61
Name: truth, dtype: int64

In [64]:
sentence_chem_df["chemicals"] = sentence_chem_df["chemicals"].apply(chem_into_array)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentence_chem_df["chemicals"] = sentence_chem_df["chemicals"].apply(chem_into_array)


In [65]:
sentence_chem_df["text"] = sentence_chem_df[["sentence", "chemicals"]].values.tolist()
sentence_chem_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentence_chem_df["text"] = sentence_chem_df[["sentence", "chemicals"]].values.tolist()


Unnamed: 0,sentence,chemicals,truth,substrates,products,text
0,the enzyme cyclo-oxygenase catalyses the oxyge...,"[prostaglandins, arachidonic acid]",1,"arachidonic, acid",prostaglandins,[the enzyme cyclo-oxygenase catalyses the oxyg...
1,although the exact site of decarboxylation of ...,"[dopamine, levodopa]",1,,,[although the exact site of decarboxylation of...
2,the enzyme dimethylarginine dimethylaminohydro...,"[arginine, methylamines, dimethylarginine, cit...",1,arginine,,[the enzyme dimethylarginine dimethylaminohydr...
3,background and aims glutamic acid decarboxylas...,"[glutamate, gaba]",1,glutamate,"gamma-aminobutyric, acid, GABA",[background and aims glutamic acid decarboxyla...
4,"mazindane 26 was found to be a pro-drug, oxidi...","[mazindane, mazindol, 5-h, 5-oh]",1,,,"[mazindane 26 was found to be a pro-drug, oxid..."
...,...,...,...,...,...,...
1417,only two exceptions had been reported-the ente...,"[n-alpha-acetylornithine, ornithine]",1,ornithine,N-alpha-acetylornithine,[only two exceptions had been reported-the ent...
1418,the energetics is here reported for the action...,"[l-beta-lysine, l-lysine]",1,"l-lysine, l-beta-lysine","l-lysine, l-beta-lysine",[the energetics is here reported for the actio...
1419,estradiol 17 beta-hydroxysteroid dehydrogenase...,"[estradiol, estrone]",1,"estrone, estradiol","estrone, estradiol",[estradiol 17 beta-hydroxysteroid dehydrogenas...
1420,5-hydroxymethyltryptophan and 5-hydroxy-4-meth...,"[5-hydroxy-4-methyltryptophan, 5-methyltryptop...",1,5-methyltryptophan,"5-Hydroxymethyltryptophan, 5-hydroxy-4-methylt...",[5-hydroxymethyltryptophan and 5-hydroxy-4-met...


In [66]:
hand_anno_mult_chem_df = sentence_chem_df[sentence_chem_df['chemicals'].map(len) > 1]
hand_anno_mult_chem_df.head()

Unnamed: 0,sentence,chemicals,truth,substrates,products,text
0,the enzyme cyclo-oxygenase catalyses the oxyge...,"[prostaglandins, arachidonic acid]",1,"arachidonic, acid",prostaglandins,[the enzyme cyclo-oxygenase catalyses the oxyg...
1,although the exact site of decarboxylation of ...,"[dopamine, levodopa]",1,,,[although the exact site of decarboxylation of...
2,the enzyme dimethylarginine dimethylaminohydro...,"[arginine, methylamines, dimethylarginine, cit...",1,arginine,,[the enzyme dimethylarginine dimethylaminohydr...
3,background and aims glutamic acid decarboxylas...,"[glutamate, gaba]",1,glutamate,"gamma-aminobutyric, acid, GABA",[background and aims glutamic acid decarboxyla...
4,"mazindane 26 was found to be a pro-drug, oxidi...","[mazindane, mazindol, 5-h, 5-oh]",1,,,"[mazindane 26 was found to be a pro-drug, oxid..."


### data with no truth values

In [67]:
# have this data one folder up
uncleaned_df = pd.read_csv("sentence_annotations_elsevier_pmid_split6.csv")

In [68]:
uncleaned_df.head()

Unnamed: 0,lit_id,indices,start,end,sentence,sentence_pos,enzymes,enzyme_locations,chemical_entities_full,chemical_names,chemical_smiles,name_smile_tuples
0,10.1002/jps.20686,0,0,1297,serial JL 313843 291210 291727 291789 291928 3...,"[('serial', 'JJ'), ('JL', 'NN'), ('313843', 'C...",,[],[],,,[]
1,10.1002/jps.20686,1,1298,1324,Published by Elsevier Inc.,"[('Published', 'VBN'), ('by', 'IN'), ('Elsevie...",,[],[],,,[]
2,10.1002/jps.20686,2,1325,1345,All rights reserved.,"[('All', 'DT'), ('rights', 'NNS'), ('reserved'...",,[],[],,,[]
3,10.1002/jps.20686,3,1346,9469,KINETICANALYSESFORSPECIESDIFFERENCESINPGLYCOPR...,[('KINETICANALYSESFORSPECIESDIFFERENCESINPGLYC...,,[],"[{'text': 'Diltiazem', 'start': 1718, 'end': 1...","Diltiazem,, Cyclosporin%20A,, Dexamethasone",COc1ccc(cc1)[C@@H]2Sc3ccccc3N(CCN(C)C)C(=O)[C@...,"[('Diltiazem', 'COc1ccc(cc1)[C@@H]2Sc3ccccc3N(..."
4,10.1002/jps.20686,4,9470,9599,Immunoblot analyses of P-gp expressed in MDR1 ...,"[('Immunoblot', 'NN'), ('analyses', 'NNS'), ('...",,[],"[{'text': 'H241', 'start': 9594, 'end': 9598, ...",H241,C[C@]12C[C@](O)(C[C@@H]1CC[C@@H]3[C@@H]2CC[C@@...,"[('H241', 'C[C@]12C[C@](O)(C[C@@H]1CC[C@@H]3[C..."


In [69]:
for i in range(5):
    print(uncleaned_df.iloc[i]['sentence'])
    print("")
    print(uncleaned_df.iloc[i]['chemical_entities_full'])
    print('')

serial JL 313843 291210 291727 291789 291928 31 Journal of Pharmaceutical Sciences JOURNALPHARMACEUTICALSCIENCES 2016-01-28 2016-01-28 2016-01-28 2016-01-28 2016-01-28T13:28:29 1-s2.0-S0022354916321499 S0022-3549(16)32149-9 S0022354916321499 10.1002/jps.20686 S300 S300.1 FULL-TEXT 1-s2.0-S0022354906X9500X 2016-08-24T12:59:06.644747-04:00 0 0 20061201 20061231 2006 2016-01-28T14:54:25.366606Z articleinfo articletitlenorm authfirstinitialnorm authfirstsurnamenorm cid cids contenttype copyright crossmark dateloaded dateloadedtxt datesearch datesort dateupdated dco docsubtype doctype doi eid ewtransactionid hubeid indexeddate issfirst issn issnnorm issuelist itemstage itemtransactionid itemweight openaccess openarchive pg pgfirst pglast pii piinorm pubdateend pubdatestart pubdatetxt pubyr sectiontitle sortorder srctitle srctitlenorm srctype ssids alllist content subj subheadings tomb volfirst volissue volumelist webpdf webpdfpagecount yearnav figure table body mmlmath acknowledge affil art

In [150]:
uncleaned_no_na_df = uncleaned_df.dropna()
uncleaned_several_chem_df = uncleaned_no_na_df.loc[uncleaned_no_na_df["chemical_names"].str.contains(",")]
#uncleaned_several_chem_df = uncleaned_several_chem_df[["sentence", "chemical_names"]]
uncleaned_several_chem_df

Unnamed: 0,lit_id,indices,start,end,sentence,sentence_pos,enzymes,enzyme_locations,chemical_entities_full,chemical_names,chemical_smiles,name_smile_tuples
150,10.1002/jps.20686,150,33180,33589,"We selected diltiazem, cyclosporin A, and dexa...","[('We', 'PRP'), ('selected', 'VBD'), ('diltiaz...",ATPase,"[('ATPase', 22, 22)]","[{'text': 'diltiazem', 'start': 33192, 'end': ...","diltiazem,, cyclosporin%20A,, dexamethasone,, ...",COc1ccc(cc1)[C@@H]2Sc3ccccc3N(CCN(C)C)C(=O)[C@...,"[('diltiazem', 'COc1ccc(cc1)[C@@H]2Sc3ccccc3N(..."
271,10.1006/abbi.1993.1114,3,2464,2912,"Croteau R. Washington State Univ, Inst Biol Ch...","[('Croteau', 'NNP'), ('R.', 'NNP'), ('Washingt...","synthase, cyclase","[('synthase', 34, 34), ('cyclase', 38, 38)]","[{'text': 'Limonene', 'start': 2620, 'end': 26...","Limonene,, geranyl%20pyrophosphate,, sodium%20...","CC(=C)C1CCC(=CC1)C,, CC(C)=CCCC(/C)=C/CO[P]([O...","[('Limonene', 'CC(=C)C1CCC(=CC1)C'), ('geranyl..."
280,10.1006/abbi.1993.1129,3,2426,2514,Storage Tissue: H+/Substrate Stoichiometries f...,"[('Storage', 'NNP'), ('Tissue', 'NNP'), (':', ...","ATPase, PPase","[('ATPase', 11, 11), ('PPase', 15, 15)]","[{'text': 'H+', 'start': 2442, 'end': 2444, 's...","H+,, H+,, H+","[H+],, [H+],, [H+]","[('H+', '[H+]'), ('H+', '[H+]'), ('H+', '[H+]')]"
281,10.1006/abbi.1993.1129,4,2515,2716,"Briskin D.P. Univ Illinois, Dept Agron, 1201 W...","[('Briskin', 'NNP'), ('D.P.', 'NNP'), ('Univ',...","ATPase, PPase","[('ATPase', 30, 30), ('PPase', 34, 34)]","[{'text': 'H+', 'start': 2609, 'end': 2611, 's...","H+,, H+,, H+","[H+],, [H+],, [H+]","[('H+', '[H+]'), ('H+', '[H+]'), ('H+', '[H+]')]"
286,10.1006/abbi.1993.1129,9,3367,3717,From these results and the estimated level of ...,"[('From', 'IN'), ('these', 'DT'), ('results', ...","ATPase, PPase","[('ATPase', 27, 27), ('PPase', 31, 31)]","[{'text': 'H+', 'start': 3509, 'end': 3511, 's...","H+,, H+,, H+","[H+],, [H+],, [H+]","[('H+', '[H+]'), ('H+', '[H+]'), ('H+', '[H+]')]"
...,...,...,...,...,...,...,...,...,...,...,...,...
952414,10.1263/jbb.99.623,26,11239,11493,Key words D-alanine-D-alanine ligase D-amino a...,"[('Key', 'JJ'), ('words', 'NNS'), ('D-alanine-...",ligase,"[('ligase', 3, 3)]","[{'text': 'D-alanine-D-alanine', 'start': 1124...","D-alanine-D-alanine,, vancomycin","C[C@@H](N)C(O)=O.C[C@@H](N)C(O)=O,, CN[C@H](CC...","[('D-alanine-D-alanine', 'C[C@@H](N)C(O)=O.C[C..."
952450,10.1263/jbb.99.623,62,17119,17420,The mobile phase consisted of a linear gradien...,"[('The', 'DT'), ('mobile', 'JJ'), ('phase', 'N...",phase,"[('phase', 2, 2)]","[{'text': 'methanol', 'start': 17213, 'end': 1...","methanol,, methanol,, tetrahydrofurane","CO,, CO,, C1CCOC1","[('methanol', 'CO'), ('methanol', 'CO'), ('tet..."
952517,10.1263/jbb.99.623,129,27150,27373,Weber V. Falkenhagen D. SUBPOL: a novel sucros...,"[('Weber', 'NNP'), ('V.', 'NNP'), ('Falkenhage...",phase,"[('phase', 16, 16)]","[{'text': 'sucrose', 'start': 27190, 'end': 27...","sucrose,, Kato",OC[C@H]1O[C@H](O[C@]2(CO)O[C@H](CO)[C@@H](O)[C...,"[('sucrose', 'OC[C@H]1O[C@H](O[C@]2(CO)O[C@H](..."
952526,10.1263/jbb.99.623,138,28139,28364,Active-site mutants of the VanC2 D-alanyl-D-se...,"[('Active', 'JJ'), ('-', 'HYPH'), ('site', 'NN...","ligase, ligase","[('ligase', 8, 8), ('ligase', 25, 25)]","[{'text': 'vancomycin', 'start': 28220, 'end':...","vancomycin,, D-alanyl-D-alanine",CN[C@H](CC(C)C)C(=O)NC1[C@H](O)c2ccc(Oc3cc4cc(...,"[('vancomycin', 'CN[C@H](CC(C)C)C(=O)NC1[C@H](..."


In [151]:
uncleaned_several_chem_df["sentence"] = uncleaned_several_chem_df["sentence"].apply(sentence_cleaned)
uncleaned_several_chem_df["chemical_names"] = uncleaned_several_chem_df["chemical_names"].apply(chem_into_array)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uncleaned_several_chem_df["sentence"] = uncleaned_several_chem_df["sentence"].apply(sentence_cleaned)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uncleaned_several_chem_df["chemical_names"] = uncleaned_several_chem_df["chemical_names"].apply(chem_into_array)


In [152]:
# need to get rid of this line if this data should be used
re_check_several_chem_uncleaned_df = uncleaned_several_chem_df

In [153]:
re_check_several_chem_uncleaned_df["text"] = re_check_several_chem_uncleaned_df[["sentence", "chemical_names"]].values.tolist()
brenda_clean_df = re_check_several_chem_uncleaned_df
brenda_mult_chem_df = brenda_clean_df[brenda_clean_df['chemical_names'].map(len) > 1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  re_check_several_chem_uncleaned_df["text"] = re_check_several_chem_uncleaned_df[["sentence", "chemical_names"]].values.tolist()


In [154]:
brenda_mult_chem_df

Unnamed: 0,lit_id,indices,start,end,sentence,sentence_pos,enzymes,enzyme_locations,chemical_entities_full,chemical_names,chemical_smiles,name_smile_tuples,text
150,10.1002/jps.20686,150,33180,33589,"we selected diltiazem, cyclosporin a, and dexa...","[('We', 'PRP'), ('selected', 'VBD'), ('diltiaz...",ATPase,"[('ATPase', 22, 22)]","[{'text': 'diltiazem', 'start': 33192, 'end': ...","[cyclosporin a, diltiazem, dexamethasone, calc...",COc1ccc(cc1)[C@@H]2Sc3ccccc3N(CCN(C)C)C(=O)[C@...,"[('diltiazem', 'COc1ccc(cc1)[C@@H]2Sc3ccccc3N(...","[we selected diltiazem, cyclosporin a, and dex..."
271,10.1006/abbi.1993.1114,3,2464,2912,"croteau r washington state univ, inst biol che...","[('Croteau', 'NNP'), ('R.', 'NNP'), ('Washingt...","synthase, cyclase","[('synthase', 34, 34), ('cyclase', 38, 38)]","[{'text': 'Limonene', 'start': 2620, 'end': 26...","[limonene, sodium dodecyl sulfate, geranyl pyr...","CC(=C)C1CCC(=CC1)C,, CC(C)=CCCC(/C)=C/CO[P]([O...","[('Limonene', 'CC(=C)C1CCC(=CC1)C'), ('geranyl...","[croteau r washington state univ, inst biol ch..."
295,10.1006/abbi.1993.1275,2,1178,2551,chemicalmodificationschickenliverpyruvatecarbo...,[('CHEMICALMODIFICATIONSCHICKENLIVERPYRUVATECA...,Carboxylase,"[('Carboxylase', 143, 143)]","[{'text': 'Pyruvate', 'start': 2438, 'end': 24...","[lysine, pyruvate, cysteine]","CC(=O)C([O-])=O,, N[C@@H](CS)C(O)=O,, NCCCC[C@...","[('Pyruvate', 'CC(=O)C([O-])=O'), ('Cysteine',...",[chemicalmodificationschickenliverpyruvatecarb...
296,10.1006/abbi.1993.1275,3,2552,2895,"ash de temple univ, hlth sci ctr, sch med, dep...","[('Ash', 'NNP'), ('D.E.', 'NNP'), ('Temple', '...",carboxylase,"[('carboxylase', 32, 32)]","[{'text': 'pyruvate', 'start': 2696, 'end': 27...","[cysteine, n-(7-dimethylamino-4-methyl-3-couma...","CC(=O)C([O-])=O,, CN(C)c1ccc2C(=C(N3C(=O)C=CC3...","[('pyruvate', 'CC(=O)C([O-])=O'), ('N-(7-dimet...","[ash de temple univ, hlth sci ctr, sch med, de..."
298,10.1006/abbi.1993.1275,5,3008,3317,at a one- to two-fold molar excess over active...,"[('At', 'IN'), ('a', 'DT'), ('one-', 'CD'), ('...","carboxylase, decarboxylase","[('carboxylase', 20, 20), ('decarboxylase', 28...","[{'text': 'pyruvate', 'start': 3120, 'end': 31...","[pyruvate, oxaloacetate, adp]","CC(=O)C([O-])=O,, Nc1ncnc2n(cnc12)C3OC(CO[P](O...","[('pyruvate', 'CC(=O)C([O-])=O'), ('ADP', 'Nc1...",[at a one- to two-fold molar excess over activ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
952414,10.1263/jbb.99.623,26,11239,11493,key words d-alanine-d-alanine ligase d-amino a...,"[('Key', 'JJ'), ('words', 'NNS'), ('D-alanine-...",ligase,"[('ligase', 3, 3)]","[{'text': 'D-alanine-D-alanine', 'start': 1124...","[vancomycin, d-alanine-d-alanine]","C[C@@H](N)C(O)=O.C[C@@H](N)C(O)=O,, CN[C@H](CC...","[('D-alanine-D-alanine', 'C[C@@H](N)C(O)=O.C[C...",[key words d-alanine-d-alanine ligase d-amino ...
952450,10.1263/jbb.99.623,62,17119,17420,the mobile phase consisted of a linear gradien...,"[('The', 'DT'), ('mobile', 'JJ'), ('phase', 'N...",phase,"[('phase', 2, 2)]","[{'text': 'methanol', 'start': 17213, 'end': 1...","[tetrahydrofurane, methanol]","CO,, CO,, C1CCOC1","[('methanol', 'CO'), ('methanol', 'CO'), ('tet...",[the mobile phase consisted of a linear gradie...
952517,10.1263/jbb.99.623,129,27150,27373,weber v falkenhagen d subpol a novel sucrose-b...,"[('Weber', 'NNP'), ('V.', 'NNP'), ('Falkenhage...",phase,"[('phase', 16, 16)]","[{'text': 'sucrose', 'start': 27190, 'end': 27...","[kato, sucrose]",OC[C@H]1O[C@H](O[C@]2(CO)O[C@H](CO)[C@@H](O)[C...,"[('sucrose', 'OC[C@H]1O[C@H](O[C@]2(CO)O[C@H](...",[weber v falkenhagen d subpol a novel sucrose-...
952526,10.1263/jbb.99.623,138,28139,28364,active-site mutants of the vanc2 d-alanyl-d-se...,"[('Active', 'JJ'), ('-', 'HYPH'), ('site', 'NN...","ligase, ligase","[('ligase', 8, 8), ('ligase', 25, 25)]","[{'text': 'vancomycin', 'start': 28220, 'end':...","[vancomycin, d-alanyl-d-alanine]",CN[C@H](CC(C)C)C(=O)NC1[C@H](O)c2ccc(Oc3cc4cc(...,"[('vancomycin', 'CN[C@H](CC(C)C)C(=O)NC1[C@H](...",[active-site mutants of the vanc2 d-alanyl-d-s...


### snorkel code for model (running on data with truth values)

In [75]:
from snorkel.labeling import PandasLFApplier

In [76]:
# how to find the location
# sentence_chem_df[sentence_chem_df["sentence"] == "even though the activities of mat and gnmt were elevated, the concentration of liver s-adenosylmethionine was decreased 24%, p<0001 and s-adenosylhomocysteine increased 113%, p<0001 in the dwarf mice"]

In [77]:
sentence_chem_df.head()

Unnamed: 0,sentence,chemicals,truth,substrates,products,text
0,the enzyme cyclo-oxygenase catalyses the oxyge...,"[prostaglandins, arachidonic acid]",1,"arachidonic, acid",prostaglandins,[the enzyme cyclo-oxygenase catalyses the oxyg...
1,although the exact site of decarboxylation of ...,"[dopamine, levodopa]",1,,,[although the exact site of decarboxylation of...
2,the enzyme dimethylarginine dimethylaminohydro...,"[arginine, methylamines, dimethylarginine, cit...",1,arginine,,[the enzyme dimethylarginine dimethylaminohydr...
3,background and aims glutamic acid decarboxylas...,"[glutamate, gaba]",1,glutamate,"gamma-aminobutyric, acid, GABA",[background and aims glutamic acid decarboxyla...
4,"mazindane 26 was found to be a pro-drug, oxidi...","[mazindane, mazindol, 5-h, 5-oh]",1,,,"[mazindane 26 was found to be a pro-drug, oxid..."


In [155]:
brenda_mult_chem_df = brenda_mult_chem_df.reset_index(drop=True)
brenda_mult_chem_df.head()

Unnamed: 0,lit_id,indices,start,end,sentence,sentence_pos,enzymes,enzyme_locations,chemical_entities_full,chemical_names,chemical_smiles,name_smile_tuples,text
0,10.1002/jps.20686,150,33180,33589,"we selected diltiazem, cyclosporin a, and dexa...","[('We', 'PRP'), ('selected', 'VBD'), ('diltiaz...",ATPase,"[('ATPase', 22, 22)]","[{'text': 'diltiazem', 'start': 33192, 'end': ...","[cyclosporin a, diltiazem, dexamethasone, calc...",COc1ccc(cc1)[C@@H]2Sc3ccccc3N(CCN(C)C)C(=O)[C@...,"[('diltiazem', 'COc1ccc(cc1)[C@@H]2Sc3ccccc3N(...","[we selected diltiazem, cyclosporin a, and dex..."
1,10.1006/abbi.1993.1114,3,2464,2912,"croteau r washington state univ, inst biol che...","[('Croteau', 'NNP'), ('R.', 'NNP'), ('Washingt...","synthase, cyclase","[('synthase', 34, 34), ('cyclase', 38, 38)]","[{'text': 'Limonene', 'start': 2620, 'end': 26...","[limonene, sodium dodecyl sulfate, geranyl pyr...","CC(=C)C1CCC(=CC1)C,, CC(C)=CCCC(/C)=C/CO[P]([O...","[('Limonene', 'CC(=C)C1CCC(=CC1)C'), ('geranyl...","[croteau r washington state univ, inst biol ch..."
2,10.1006/abbi.1993.1275,2,1178,2551,chemicalmodificationschickenliverpyruvatecarbo...,[('CHEMICALMODIFICATIONSCHICKENLIVERPYRUVATECA...,Carboxylase,"[('Carboxylase', 143, 143)]","[{'text': 'Pyruvate', 'start': 2438, 'end': 24...","[lysine, pyruvate, cysteine]","CC(=O)C([O-])=O,, N[C@@H](CS)C(O)=O,, NCCCC[C@...","[('Pyruvate', 'CC(=O)C([O-])=O'), ('Cysteine',...",[chemicalmodificationschickenliverpyruvatecarb...
3,10.1006/abbi.1993.1275,3,2552,2895,"ash de temple univ, hlth sci ctr, sch med, dep...","[('Ash', 'NNP'), ('D.E.', 'NNP'), ('Temple', '...",carboxylase,"[('carboxylase', 32, 32)]","[{'text': 'pyruvate', 'start': 2696, 'end': 27...","[cysteine, n-(7-dimethylamino-4-methyl-3-couma...","CC(=O)C([O-])=O,, CN(C)c1ccc2C(=C(N3C(=O)C=CC3...","[('pyruvate', 'CC(=O)C([O-])=O'), ('N-(7-dimet...","[ash de temple univ, hlth sci ctr, sch med, de..."
4,10.1006/abbi.1993.1275,5,3008,3317,at a one- to two-fold molar excess over active...,"[('At', 'IN'), ('a', 'DT'), ('one-', 'CD'), ('...","carboxylase, decarboxylase","[('carboxylase', 20, 20), ('decarboxylase', 28...","[{'text': 'pyruvate', 'start': 3120, 'end': 31...","[pyruvate, oxaloacetate, adp]","CC(=O)C([O-])=O,, Nc1ncnc2n(cnc12)C3OC(CO[P](O...","[('pyruvate', 'CC(=O)C([O-])=O'), ('ADP', 'Nc1...",[at a one- to two-fold molar excess over activ...


In [81]:
from snorkel.labeling import PandasLFApplier
from snorkel.labeling.model import MajorityLabelVoter

#lfs = [add your list of labeling functions]
#df = df of interest

applier = PandasLFApplier(lfs=all_lfs)
L_train = applier.apply(df=df)

majority_model = MajorityLabelVoter()
df["label_voter"] = majority_model.predict(L=L_train)

ValueError: Operator names not unique: 2 operators with name structure_sep_via

In [137]:
# Define the set of labeling functions (LFs)
# currently excluding amino_acid and followed_ase and followed_by_noun
lfs = [includes_solution_words, includes_mixture_words, includes_physical_words, includes_genetic_words, includes_structural_words,
      includes_general_chemical_words, includes_functional_group, includes_paper_artifacts, includes_no_terms, structure_adjacent_mentions,
      structure_sep_or, structure_sep_comma, structure_sep_via, structure_sep_sym, structure_sep_adverb, includes_oxidation_words,
      structure_sep_verb, structure_sep_conversion_words, includes_combustion_words, includes_neutralization_words, includes_combination_words,
      includes_decomposition_words, includes_replacement_words, includes_reaction_words, includes_reaction_component_words, includes_comparison_words,
      includes_one_chem, includes_react, includes_measure_words, includes_experiment_words, includes_concentration, structure_sep_and,
      structure_next_to_conversion_words, structure_conversion_by, structure_conversion_of, includes_react_sym,
      includes_interconvert_sym, structure_jtsui_pattern_1, structure_jtsui_pattern_2,
      structure_jtsui_pattern_3, structure_jtsui_pattern_4, structure_jtsui_pattern_5, structure_jtsui_pattern_1_copy, structure_jtsui_pattern_2_copy,
      structure_jtsui_pattern_3_copy, structure_jtsui_pattern_4_copy, structure_jtsui_pattern_5_copy,
      includes_impact_words, includes_creation_words, includes_catalyze_words, includes_emzyze_words, includes_reaction_words2]
      #  , ]
      #includes_reaction_words5, includes_reaction_words3, includes_reaction_words4, structure_followed_by_ase,
df_train = brenda_mult_chem_df
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df_train)

  from pandas import Panel
100%|██████████| 44724/44724 [15:45<00:00, 47.32it/s] 


In [138]:
L_train[1]

array([-1, -1, -1,  0,  0, -1, -1,  0,  0, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1,  0, -1, -1, -1,  1, -1,
        1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1,  1, -1, -1, -1,  1, -1,
        1])

In [139]:
from snorkel.labeling import LFAnalysis

LFAnalysis(L=L_train, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
includes_solution_words,0,[0],0.078616,0.078504,0.07515
includes_mixture_words,1,[0],0.028419,0.028396,0.026541
includes_physical_words,2,[0],0.457607,0.454834,0.41025
includes_genetic_words,3,[0],0.277457,0.277189,0.252392
includes_structural_words,4,[0],0.309141,0.307866,0.275065
includes_general_chemical_words,5,[0],0.238574,0.237635,0.213778
includes_functional_group,6,[0],0.262141,0.26147,0.236964
includes_paper_artifacts,7,[0],0.133396,0.133105,0.124989
includes_no_terms,8,[0],0.135475,0.134022,0.119354
structure_adjacent_mentions,9,[0],0.103859,0.103859,0.093976


In [140]:
LFAnalysis(L_train).label_coverage()

0.9940747696985959

In [141]:
# gets the average amount of labels (goal is to get this to 15)
sum = 0
total = 0
for item in L_train:
    sum += 41 - np.count_nonzero(item == -1)
    total += 1
sum/total


-4.902110723548877

In [157]:
majority_model = MajorityLabelVoter()
brenda_mult_chem_df["label_voter"] = majority_model.predict(L=L_train)

In [158]:
brenda_mult_chem_df.loc[brenda_mult_chem_df["label_voter"] == -1, "label_voter"] = 1

brenda_mult_chem_df

Unnamed: 0,lit_id,indices,start,end,sentence,sentence_pos,enzymes,enzyme_locations,chemical_entities_full,chemical_names,chemical_smiles,name_smile_tuples,text,label_voter
0,10.1002/jps.20686,150,33180,33589,"we selected diltiazem, cyclosporin a, and dexa...","[('We', 'PRP'), ('selected', 'VBD'), ('diltiaz...",ATPase,"[('ATPase', 22, 22)]","[{'text': 'diltiazem', 'start': 33192, 'end': ...","[cyclosporin a, diltiazem, dexamethasone, calc...",COc1ccc(cc1)[C@@H]2Sc3ccccc3N(CCN(C)C)C(=O)[C@...,"[('diltiazem', 'COc1ccc(cc1)[C@@H]2Sc3ccccc3N(...","[we selected diltiazem, cyclosporin a, and dex...",0
1,10.1006/abbi.1993.1114,3,2464,2912,"croteau r washington state univ, inst biol che...","[('Croteau', 'NNP'), ('R.', 'NNP'), ('Washingt...","synthase, cyclase","[('synthase', 34, 34), ('cyclase', 38, 38)]","[{'text': 'Limonene', 'start': 2620, 'end': 26...","[limonene, sodium dodecyl sulfate, geranyl pyr...","CC(=C)C1CCC(=CC1)C,, CC(C)=CCCC(/C)=C/CO[P]([O...","[('Limonene', 'CC(=C)C1CCC(=CC1)C'), ('geranyl...","[croteau r washington state univ, inst biol ch...",1
2,10.1006/abbi.1993.1275,2,1178,2551,chemicalmodificationschickenliverpyruvatecarbo...,[('CHEMICALMODIFICATIONSCHICKENLIVERPYRUVATECA...,Carboxylase,"[('Carboxylase', 143, 143)]","[{'text': 'Pyruvate', 'start': 2438, 'end': 24...","[lysine, pyruvate, cysteine]","CC(=O)C([O-])=O,, N[C@@H](CS)C(O)=O,, NCCCC[C@...","[('Pyruvate', 'CC(=O)C([O-])=O'), ('Cysteine',...",[chemicalmodificationschickenliverpyruvatecarb...,0
3,10.1006/abbi.1993.1275,3,2552,2895,"ash de temple univ, hlth sci ctr, sch med, dep...","[('Ash', 'NNP'), ('D.E.', 'NNP'), ('Temple', '...",carboxylase,"[('carboxylase', 32, 32)]","[{'text': 'pyruvate', 'start': 2696, 'end': 27...","[cysteine, n-(7-dimethylamino-4-methyl-3-couma...","CC(=O)C([O-])=O,, CN(C)c1ccc2C(=C(N3C(=O)C=CC3...","[('pyruvate', 'CC(=O)C([O-])=O'), ('N-(7-dimet...","[ash de temple univ, hlth sci ctr, sch med, de...",0
4,10.1006/abbi.1993.1275,5,3008,3317,at a one- to two-fold molar excess over active...,"[('At', 'IN'), ('a', 'DT'), ('one-', 'CD'), ('...","carboxylase, decarboxylase","[('carboxylase', 20, 20), ('decarboxylase', 28...","[{'text': 'pyruvate', 'start': 3120, 'end': 31...","[pyruvate, oxaloacetate, adp]","CC(=O)C([O-])=O,, Nc1ncnc2n(cnc12)C3OC(CO[P](O...","[('pyruvate', 'CC(=O)C([O-])=O'), ('ADP', 'Nc1...",[at a one- to two-fold molar excess over activ...,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44719,10.1263/jbb.99.623,26,11239,11493,key words d-alanine-d-alanine ligase d-amino a...,"[('Key', 'JJ'), ('words', 'NNS'), ('D-alanine-...",ligase,"[('ligase', 3, 3)]","[{'text': 'D-alanine-D-alanine', 'start': 1124...","[vancomycin, d-alanine-d-alanine]","C[C@@H](N)C(O)=O.C[C@@H](N)C(O)=O,, CN[C@H](CC...","[('D-alanine-D-alanine', 'C[C@@H](N)C(O)=O.C[C...",[key words d-alanine-d-alanine ligase d-amino ...,0
44720,10.1263/jbb.99.623,62,17119,17420,the mobile phase consisted of a linear gradien...,"[('The', 'DT'), ('mobile', 'JJ'), ('phase', 'N...",phase,"[('phase', 2, 2)]","[{'text': 'methanol', 'start': 17213, 'end': 1...","[tetrahydrofurane, methanol]","CO,, CO,, C1CCOC1","[('methanol', 'CO'), ('methanol', 'CO'), ('tet...",[the mobile phase consisted of a linear gradie...,0
44721,10.1263/jbb.99.623,129,27150,27373,weber v falkenhagen d subpol a novel sucrose-b...,"[('Weber', 'NNP'), ('V.', 'NNP'), ('Falkenhage...",phase,"[('phase', 16, 16)]","[{'text': 'sucrose', 'start': 27190, 'end': 27...","[kato, sucrose]",OC[C@H]1O[C@H](O[C@]2(CO)O[C@H](CO)[C@@H](O)[C...,"[('sucrose', 'OC[C@H]1O[C@H](O[C@]2(CO)O[C@H](...",[weber v falkenhagen d subpol a novel sucrose-...,0
44722,10.1263/jbb.99.623,138,28139,28364,active-site mutants of the vanc2 d-alanyl-d-se...,"[('Active', 'JJ'), ('-', 'HYPH'), ('site', 'NN...","ligase, ligase","[('ligase', 8, 8), ('ligase', 25, 25)]","[{'text': 'vancomycin', 'start': 28220, 'end':...","[vancomycin, d-alanyl-d-alanine]",CN[C@H](CC(C)C)C(=O)NC1[C@H](O)c2ccc(Oc3cc4cc(...,"[('vancomycin', 'CN[C@H](CC(C)C)C(=O)NC1[C@H](...",[active-site mutants of the vanc2 d-alanyl-d-s...,0


In [143]:
#oof

df_brenda_p = df_train[df_train["label_voter"] == 1]

len(df_brenda_p)

11278

In [159]:
brenda_mult_chem_df.to_csv('voter_elsevier_split_6.csv')

In [144]:
display(df_brenda_p)

Unnamed: 0,sentence,chemical_names,text,label_voter
1,"croteau r washington state univ, inst biol che...","[limonene, sodium dodecyl sulfate, geranyl pyr...","[croteau r washington state univ, inst biol ch...",1
7,biosynthesisvicinaldihydroxyfattyacidsinredalg...,"[hydroperoxide, sodium]",[biosynthesisvicinaldihydroxyfattyacidsinredal...,1
8,the transformation consisted of initial 12-lip...,"[arachidonic acid, hydroperoxide]",[the transformation consisted of initial 12-li...,1
9,the 12-lipoxygenase-catalyzed reaction consist...,"[hydrogen, hydroperoxide, oxygen, carbon]",[the 12-lipoxygenase-catalyzed reaction consis...,1
10,subsequent conversion of the product to car-3-...,"[(+)-3-carene, car-3-en-5-one]",[subsequent conversion of the product to car-3...,1
...,...,...,...,...
44699,"nhase exhibited an absorption peak at 710 nm, ...","[sulfur, fe]","[nhase exhibited an absorption peak at 710 nm,...",1
44702,concluding remarks the results indicate that f...,"[fe, isonitrile]",[concluding remarks the results indicate that ...,1
44703,nitrogenase is reported to have an activity th...,"[methylisonitrile, methylamine]",[nitrogenase is reported to have an activity t...,1
44704,others have reported that two enzymes are invo...,"[formamide, n, isonitrile]",[others have reported that two enzymes are inv...,1


In [174]:
n_examples = 2

goal = np.arange(120).reshape(2,3,4,5)

print(goal)

print(goal.transpose(3,2,0,1))
      
print(goal.transpose(3,2,0,1).reshape(5, 24))

forward = np.arange(120)

print(forward)

forward = np.arange(120).reshape(3,-1)

print(forward)

forward_batch = forward.reshape(-1, n_examples).T

forward_batch


[[[[  0   1   2   3   4]
   [  5   6   7   8   9]
   [ 10  11  12  13  14]
   [ 15  16  17  18  19]]

  [[ 20  21  22  23  24]
   [ 25  26  27  28  29]
   [ 30  31  32  33  34]
   [ 35  36  37  38  39]]

  [[ 40  41  42  43  44]
   [ 45  46  47  48  49]
   [ 50  51  52  53  54]
   [ 55  56  57  58  59]]]


 [[[ 60  61  62  63  64]
   [ 65  66  67  68  69]
   [ 70  71  72  73  74]
   [ 75  76  77  78  79]]

  [[ 80  81  82  83  84]
   [ 85  86  87  88  89]
   [ 90  91  92  93  94]
   [ 95  96  97  98  99]]

  [[100 101 102 103 104]
   [105 106 107 108 109]
   [110 111 112 113 114]
   [115 116 117 118 119]]]]
[[[[  0  20  40]
   [ 60  80 100]]

  [[  5  25  45]
   [ 65  85 105]]

  [[ 10  30  50]
   [ 70  90 110]]

  [[ 15  35  55]
   [ 75  95 115]]]


 [[[  1  21  41]
   [ 61  81 101]]

  [[  6  26  46]
   [ 66  86 106]]

  [[ 11  31  51]
   [ 71  91 111]]

  [[ 16  36  56]
   [ 76  96 116]]]


 [[[  2  22  42]
   [ 62  82 102]]

  [[  7  27  47]
   [ 67  87 107]]

  [[ 12  32  52]
   [

array([[  0,   2,   4,   6,   8,  10,  12,  14,  16,  18,  20,  22,  24,
         26,  28,  30,  32,  34,  36,  38,  40,  42,  44,  46,  48,  50,
         52,  54,  56,  58,  60,  62,  64,  66,  68,  70,  72,  74,  76,
         78,  80,  82,  84,  86,  88,  90,  92,  94,  96,  98, 100, 102,
        104, 106, 108, 110, 112, 114, 116, 118],
       [  1,   3,   5,   7,   9,  11,  13,  15,  17,  19,  21,  23,  25,
         27,  29,  31,  33,  35,  37,  39,  41,  43,  45,  47,  49,  51,
         53,  55,  57,  59,  61,  63,  65,  67,  69,  71,  73,  75,  77,
         79,  81,  83,  85,  87,  89,  91,  93,  95,  97,  99, 101, 103,
        105, 107, 109, 111, 113, 115, 117, 119]])

In [170]:
for i in range(10000):
    if L_train[df_train.index[i]][0] == 0:
        print(df_train.iloc[i]['sentence'])
        print('')

both the p furiosus and yeast enzymes required a metal ion for activity, but whereas the yeast enzyme has an absolute requirement for mg2+, the p furiosus enolase was equally active in the presence of mn2+

enzymeactivityenzymaticactivityonhalogenatedcompoundswasmeasuredbychloridereleasedetectionasdescribed 13afterincubationat30â°cfor30minintabuffertris-acetate01mph85supplementedwith2mmmcaactivity of deae-bound dehalogenase in standard conditions was detected after removal of the dehalogenase-deae-sephacel complex by centrifugation, and quantitation of chloride in the supernatant

the activity of the 2-haloacid dehalogenase at different phs was inspected using 01 m acetate ph 3â€“7, 01 m tris-acetate ph 7â€“10 and 01 m sodium bicarbonate ph 10â€“12 buffers to attain acidic, neutral and basic buffered environment in the assay, respectively

inductionhemeoxygenase1modulatescisaconitaseactivityinlensepithelialcells rzymkiewicz d keyse 1989 99 103 s cruse 1988 3348 3353 i keyse 1987 14821 


inhibition by the specific sulphydryl group reagent 5,5'-dithiobis2-nitrobenzoic acid was tested by incubating epimerase with an excess of the reagent for 60 min at 30 in the 40 mm triethanolamine/chloride/2 mm edta buffer, ph 80

a linear gradient of 50 ml of 05 m kcl in the above buffer running into 50 ml of buffer was begun at tube 26 and followed by 05 m kci in buffer , absorbance at 280 nn in a 1 cm cell a28p 0------o, ribulose-5-phosphate 3-epimerase activity - - - - - kcl concentration

a linear gradient of 40 ml of 01 m kcl in buffer running into 40 ml of buffer was begun at tube 10 and followed by 05 m kcl in buffer at tube 30 , absorbance at 280 nm in a 1 cm cell az gp 0 - - _ _ _ _o, ribulose-5-phosphate 3-epixnerase activity - - - - kcl concentration

except where stated otherwise, lyase activity was determined by isotopic assay using ring-labelled l-phenylalanine and a 20 h incubation 25 as described by attridge et al [9] except that 100 mm tris-hcl ph 86 replaced the bor

In [165]:
oof = 101

for i in range(100):
    negs = []
    neg_count = 0
    pos = []
    pos_count = 0
    
    print(df_brenda_p.index[oof])
    print('')
    
    for j in range(len(L_train[i])):
        if L_train[df_brenda_p.index[oof]][j] == 0:
            negs.append(str(lfs[j].name))
            neg_count = neg_count + 1
        if L_train[df_brenda_p.index[oof]][j] == 1:
            pos.append(str(lfs[j].name))
            pos_count = pos_count + 1
    print("SENTENCE:")
    print(df_brenda_p.iloc[oof]['sentence'])
    print('')
    print("CHEMICALS:")
    print(df_brenda_p.iloc[oof]['chemical_names'])
    print("")
    oof = oof + 1
    print("LABELS:")
    print(L_train[df_brenda_p.index[oof]])
    print('')
    print("NEGS: " + str(neg_count) + " POS: " + str(pos_count))
    print('')
    print(negs)
    print('')
    print(pos)
    print('')

476

SENTENCE:
the catalytic mechanism of ligases proceeds via a number of steps beginning with the hydrolysis of atp or nad+ to yield amp covalently attached to the active site lysine residue i, with the release of pyrophosphate or nicotinamide mononucleotide

CHEMICALS:
['nad+', 'nicotinamide mononucleotide', 'lysine', 'amp', 'atp', 'pyrophosphate']

LABELS:
[-1 -1 -1 -1 -1  0 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1
 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1  1]

NEGS: 5 POS: 6

['includes_structural_words', 'includes_general_chemical_words', 'includes_functional_group', 'structure_sep_or', 'includes_measure_words']

['includes_reaction_words', 'includes_reaction_component_words', 'structure_next_to_conversion_words', 'structure_jtsui_pattern_1', 'structure_jtsui_pattern_1_copy', 'includes_reaction_words2']

481

SENTENCE:
this gmp-adduct is equivalent to the amp-adduct formed upon reaction of ligase with atp

CHEMICALS:
['amp', '


868

SENTENCE:
ornithine decarboxylase odc, ec 41117 performs the first step in polyamine biosynthesis, the decarboxylation of ornithine to putrescine

CHEMICALS:
['putrescine', 'ornithine']

LABELS:
[-1 -1 -1 -1 -1 -1  0 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1  1
  1 -1 -1 -1 -1 -1 -1 -1  1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1  1 -1  1]

NEGS: 1 POS: 6

['includes_functional_group']

['structure_sep_conversion_words', 'includes_reaction_words', 'includes_reaction_component_words', 'structure_next_to_conversion_words', 'structure_conversion_of', 'includes_reaction_words2']

870

SENTENCE:
ornithine decarboxylase odc catalyzes the first and rate-limiting step in polyamine biosynthesis, the decarboxylation of ornithine to putrescine

CHEMICALS:
['putrescine', 'ornithine']

LABELS:
[-1 -1  0 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1
 -1 -1 -1 -1 -1  0 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1  1]

NEGS: 1 POS: 7

['includes_f

In [1]:
df_train

NameError: name 'df_train' is not defined

# DEBUGGING MAJORITY LABEL VOTER

## The following code should print out the fps and fns and other useful info

In [71]:
from snorkel.labeling.model import MajorityLabelVoter
from snorkel.labeling.model import LabelModel

In [96]:
# Reset indices so that majority label voter's predict fn aligns in the df

df_test = hand_anno_mult_chem_df
df_test = df_test.reset_index(drop=True)
display(df_test)

Unnamed: 0,sentence,chemicals,truth,substrates,products,text,label_voter
0,the enzyme cyclo-oxygenase catalyses the oxyge...,"[prostaglandins, arachidonic acid]",1,"arachidonic, acid",prostaglandins,[the enzyme cyclo-oxygenase catalyses the oxyg...,1
1,although the exact site of decarboxylation of ...,"[levodopa, dopamine]",1,,,[although the exact site of decarboxylation of...,-1
2,the enzyme dimethylarginine dimethylaminohydro...,"[citrulline, arginine, methylamines, dimethyla...",1,arginine,,[the enzyme dimethylarginine dimethylaminohydr...,-1
3,background and aims glutamic acid decarboxylas...,"[gaba, glutamate]",1,glutamate,"gamma-aminobutyric, acid, GABA",[background and aims glutamic acid decarboxyla...,1
4,"mazindane 26 was found to be a pro-drug, oxidi...","[5-h, mazindol, mazindane, 5-oh]",1,,,"[mazindane 26 was found to be a pro-drug, oxid...",1
...,...,...,...,...,...,...,...
418,only two exceptions had been reported-the ente...,"[n-alpha-acetylornithine, ornithine]",1,ornithine,N-alpha-acetylornithine,[only two exceptions had been reported-the ent...,1
419,the energetics is here reported for the action...,"[l-beta-lysine, l-lysine]",1,"l-lysine, l-beta-lysine","l-lysine, l-beta-lysine",[the energetics is here reported for the actio...,1
420,estradiol 17 beta-hydroxysteroid dehydrogenase...,"[estrone, estradiol]",1,"estrone, estradiol","estrone, estradiol",[estradiol 17 beta-hydroxysteroid dehydrogenas...,1
421,5-hydroxymethyltryptophan and 5-hydroxy-4-meth...,"[5-hydroxy-4-methyltryptophan, 5-methyltryptop...",1,5-methyltryptophan,"5-Hydroxymethyltryptophan, 5-hydroxy-4-methylt...",[5-hydroxymethyltryptophan and 5-hydroxy-4-met...,1


In [97]:
# Define the set of labeling functions (LFs)
# currently excluding amino_acid and followed_ase and followed_by_noun

# removing physical_words increases recall but causes large drop in precision
# sep_conversion_word and sep_verb removal increase precision to 0.71 with recall at 0.38
# Apply the LFs to the unlabeled training data
lfs = [includes_solution_words, includes_mixture_words, includes_physical_words, includes_genetic_words, includes_structural_words,
      includes_general_chemical_words, includes_functional_group, includes_paper_artifacts, includes_no_terms, structure_adjacent_mentions,
      structure_sep_or, structure_sep_comma, structure_sep_via, structure_sep_sym, structure_sep_adverb, includes_oxidation_words,
      structure_sep_verb, structure_sep_conversion_words, includes_combustion_words, includes_neutralization_words, includes_combination_words,
      includes_decomposition_words, includes_replacement_words, includes_reaction_words, includes_reaction_component_words, includes_comparison_words,
      includes_one_chem, includes_react, includes_measure_words, includes_experiment_words, includes_concentration, structure_sep_and,
      structure_next_to_conversion_words, structure_conversion_by, structure_conversion_of, includes_react_sym,
      includes_interconvert_sym, structure_jtsui_pattern_1, structure_jtsui_pattern_2,
      structure_jtsui_pattern_3, structure_jtsui_pattern_4, structure_jtsui_pattern_5, structure_jtsui_pattern_1_copy, structure_jtsui_pattern_2_copy,
      structure_jtsui_pattern_3_copy, structure_jtsui_pattern_4_copy, structure_jtsui_pattern_5_copy,
      includes_impact_words, includes_creation_words, includes_catalyze_words, includes_emzyze_words, includes_reaction_words2]
      #  , ]
      #includes_reaction_words5, includes_reaction_words3, includes_reaction_words4, structure_followed_by_ase,

applier_test = PandasLFApplier(lfs=lfs)
L_test = applier_test.apply(df=df_test)

  from pandas import Panel
100%|██████████| 423/423 [00:08<00:00, 49.07it/s]


In [99]:
from snorkel.labeling import LFAnalysis

LFAnalysis(L=L_test, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
includes_solution_words,0,[0],0.018913,0.018913,0.018913
includes_mixture_words,1,[],0.0,0.0,0.0
includes_physical_words,2,[0],0.283688,0.283688,0.198582
includes_genetic_words,3,[0],0.179669,0.179669,0.134752
includes_structural_words,4,[0],0.179669,0.177305,0.134752
includes_general_chemical_words,5,[0],0.186761,0.184397,0.125296
includes_functional_group,6,[0],0.307329,0.307329,0.229314
includes_paper_artifacts,7,[0],0.042553,0.042553,0.030733
includes_no_terms,8,[0],0.295508,0.286052,0.193853
structure_adjacent_mentions,9,[0],0.120567,0.120567,0.066194


In [100]:
LFAnalysis(L_test).label_coverage()

0.9952718676122931

In [101]:
majority_model = MajorityLabelVoter()
df_test["label_voter"] = majority_model.predict(L=L_test)

In [102]:
display(majority_model.predict(L=L_test))
df_test.index

array([ 1, -1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  0,  1,  0,  1,  1,  1,  1,  1,
        1,  0,  1,  1,  1,  1,  1,  1,  1,  1,  0,  1,  1,  1,  1,  1,  0,
        1,  0,  1,  0,  0,  0,  0,  0,  0, -1,  1, -1,  0,  0, -1, -1,  0,
       -1,  0,  1,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,
        0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0, -1,  0,
        0, -1,  0,  0,  0,  0,  0, -1,  0,  0,  1,  1, -1,  0,  0,  0,  0,
        0,  1,  0,  1,  0,  1,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,
        0,  0,  0,  0, -1,  0,  0, -1,  1,  0,  1, -1,  0, -1,  0,  1,  0,
        1,  1,  0,  0,  0,  0,  0, -1,  1,  1,  0, -1,  0,  0,  0,  0,  0,
       -1,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  1, -1,  1,  1,  0,  0,
        0, -1,  0, -1,  0,  0,  0,  1,  1,  0,  0, -1,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  1,  0,  1,  0,  0,  1,  0,  0,  0,  0,  1,  0,  1,
        0,  0,  0,  0,  0

RangeIndex(start=0, stop=423, step=1)

In [103]:
df_test

Unnamed: 0,sentence,chemicals,truth,substrates,products,text,label_voter
0,the enzyme cyclo-oxygenase catalyses the oxyge...,"[prostaglandins, arachidonic acid]",1,"arachidonic, acid",prostaglandins,[the enzyme cyclo-oxygenase catalyses the oxyg...,1
1,although the exact site of decarboxylation of ...,"[levodopa, dopamine]",1,,,[although the exact site of decarboxylation of...,-1
2,the enzyme dimethylarginine dimethylaminohydro...,"[citrulline, arginine, methylamines, dimethyla...",1,arginine,,[the enzyme dimethylarginine dimethylaminohydr...,-1
3,background and aims glutamic acid decarboxylas...,"[gaba, glutamate]",1,glutamate,"gamma-aminobutyric, acid, GABA",[background and aims glutamic acid decarboxyla...,1
4,"mazindane 26 was found to be a pro-drug, oxidi...","[5-h, mazindol, mazindane, 5-oh]",1,,,"[mazindane 26 was found to be a pro-drug, oxid...",1
...,...,...,...,...,...,...,...
418,only two exceptions had been reported-the ente...,"[n-alpha-acetylornithine, ornithine]",1,ornithine,N-alpha-acetylornithine,[only two exceptions had been reported-the ent...,1
419,the energetics is here reported for the action...,"[l-beta-lysine, l-lysine]",1,"l-lysine, l-beta-lysine","l-lysine, l-beta-lysine",[the energetics is here reported for the actio...,1
420,estradiol 17 beta-hydroxysteroid dehydrogenase...,"[estrone, estradiol]",1,"estrone, estradiol","estrone, estradiol",[estradiol 17 beta-hydroxysteroid dehydrogenas...,1
421,5-hydroxymethyltryptophan and 5-hydroxy-4-meth...,"[5-hydroxy-4-methyltryptophan, 5-methyltryptop...",1,5-methyltryptophan,"5-Hydroxymethyltryptophan, 5-hydroxy-4-methylt...",[5-hydroxymethyltryptophan and 5-hydroxy-4-met...,1


In [104]:
#oof

len(df_test[df_test["truth"] == 1])

60

In [105]:
df_test[df_test["label_voter"] == 1]

Unnamed: 0,sentence,chemicals,truth,substrates,products,text,label_voter
0,the enzyme cyclo-oxygenase catalyses the oxyge...,"[prostaglandins, arachidonic acid]",1,"arachidonic, acid",prostaglandins,[the enzyme cyclo-oxygenase catalyses the oxyg...,1
3,background and aims glutamic acid decarboxylas...,"[gaba, glutamate]",1,glutamate,"gamma-aminobutyric, acid, GABA",[background and aims glutamic acid decarboxyla...,1
4,"mazindane 26 was found to be a pro-drug, oxidi...","[5-h, mazindol, mazindane, 5-oh]",1,,,"[mazindane 26 was found to be a pro-drug, oxid...",1
5,"interestingly, the allele of pro1 was shown to...","[gamma-glutamyl, l-proline, l-glutamate]",1,L-glutamate,L-proline,"[interestingly, the allele of pro1 was shown t...",1
6,in addition to the racemization it also cataly...,"[l-ser, pyruvate]",1,,,[in addition to the racemization it also catal...,1
...,...,...,...,...,...,...,...
418,only two exceptions had been reported-the ente...,"[n-alpha-acetylornithine, ornithine]",1,ornithine,N-alpha-acetylornithine,[only two exceptions had been reported-the ent...,1
419,the energetics is here reported for the action...,"[l-beta-lysine, l-lysine]",1,"l-lysine, l-beta-lysine","l-lysine, l-beta-lysine",[the energetics is here reported for the actio...,1
420,estradiol 17 beta-hydroxysteroid dehydrogenase...,"[estrone, estradiol]",1,"estrone, estradiol","estrone, estradiol",[estradiol 17 beta-hydroxysteroid dehydrogenas...,1
421,5-hydroxymethyltryptophan and 5-hydroxy-4-meth...,"[5-hydroxy-4-methyltryptophan, 5-methyltryptop...",1,5-methyltryptophan,"5-Hydroxymethyltryptophan, 5-hydroxy-4-methylt...",[5-hydroxymethyltryptophan and 5-hydroxy-4-met...,1


In [106]:
df_false = df_test[df_test["truth"] == 0]
df_fp = df_false[df_false["label_voter"] == 1]

df_fp

Unnamed: 0,sentence,chemicals,truth,substrates,products,text,label_voter
53,omega class gsts have dehydroascorbate reducta...,"[monomethylarsonate, arsenic]",0,monomethylarsonate,,[omega class gsts have dehydroascorbate reduct...,1
61,n-acetylglucosamine-1-phosphodiester alpha-n-a...,"[n-acetylglucosamine-1-phosphodiester, phospho...",0,,"mannose, 6-phosphate",[n-acetylglucosamine-1-phosphodiester alpha-n-...,1
70,"cgmp inhibited hydrolysis of camp, and camp in...","[cgmp, camp]",0,,,"[cgmp inhibited hydrolysis of camp, and camp i...",1
84,on phosphorylation of ser40 by protein kinase ...,"[h4biopterin, ser40]",0,Ser40,,[on phosphorylation of ser40 by protein kinase...,1
88,"thus, our studies provide new insight into the...","[catecholamine, h4biopterin]",0,,,"[thus, our studies provide new insight into th...",1
112,only in the last couple of years a consensus h...,"[bh4, no]",0,,,[only in the last couple of years a consensus ...,1
113,the nos-catalyzed oxidation of nadph in the ab...,"[nadph, no]",0,,,[the nos-catalyzed oxidation of nadph in the a...,1
120,"conclusions in the maximum registered dosage, ...","[nabumetone, meloxicam, thromboxane]",0,,,"[conclusions in the maximum registered dosage,...",1
122,methods studies were performed on enzymatic ac...,"[3h-gaba, 3h-glutamate, plp]",0,,,[methods studies were performed on enzymatic a...,1
124,the results reinforce previous assumptions tha...,"[eicosanoid, dopamine]",0,,,[the results reinforce previous assumptions th...,1


In [107]:
L_test[51]

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1,  0, -1, -1,  1, -1,
       -1, -1, -1,  1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1,
        1])

In [108]:
df_true = df_test[df_test["truth"] == 1]
df_fn = df_true[df_true["label_voter"] == 0]

df_fn

Unnamed: 0,sentence,chemicals,truth,substrates,products,text,label_voter
26,"metabolism of vitamin a, all-trans-retinol, le...","[11-cis-retinaldehyde, all-trans-retinol]",1,,,"[metabolism of vitamin a, all-trans-retinol, l...",0
28,"unlike mouse retsat mretsat, zretsat a had an ...","[all-trans-13,14-dihydroretinol, all-trans-7,8...",1,all-trans-retinol,"all-trans-13,14-dihydroretinol, all-trans-7,8-...","[unlike mouse retsat mretsat, zretsat a had an...",0
35,tas-102 is a novel drug containing trifluoroth...,"[tas-102, trifluorothymidine]",1,"TAS-102, trifluorothymidine",,[tas-102 is a novel drug containing trifluorot...,0
44,"these findings, derived from a variety of anal...","[no, peroxynitrite, superoxide, androgen, xant...",1,,"peroxynitrite, superoxide, NO","[these findings, derived from a variety of ana...",0
50,pip5k1b encodes phosphatidylinositol 4-phospha...,"[phosphatidylinositol 4-phosphate, phosphatidy...",1,"phosphatidylinositol, 4-phosphate, PI","phosphatidylinositol-4,5-bisphosphate, PI",[pip5k1b encodes phosphatidylinositol 4-phosph...,0


In [109]:
df_tp = df_true[df_true['label_voter'] == 1]

In [110]:
oof = 0
for i in df_tp.index:
    negs = []
    neg_count = 0
    pos = []
    pos_count = 0
    
    for j in range(len(L_test[i])):
        if L_test[i][j] == 0:
            negs.append(str(lfs[j].name))
            neg_count = neg_count + 1
        if L_test[i][j] == 1:
            pos.append(str(lfs[j].name))
            pos_count = pos_count + 1
    print("SENTENCE:")
    print(df_tp.iloc[oof]['sentence'])
    print('')
    print("CHEMICALS:")
    print(df_tp.iloc[oof]['chemicals'])
    print("")
    oof = oof + 1
    print("LABELS:")
    print(L_test[i])
    print('')
    print("NEGS: " + str(neg_count) + " POS: " + str(pos_count))
    print('')
    print(negs)
    print('')
    print(pos)
    print('')

SENTENCE:
the enzyme cyclo-oxygenase catalyses the oxygenation of arachidonic acid, leading to the formation of prostaglandins

CHEMICALS:
['prostaglandins', 'arachidonic acid']

LABELS:
[-1 -1 -1 -1 -1  0 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1  1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1
 -1  1  1  1]

NEGS: 1 POS: 7

['includes_general_chemical_words']

['includes_oxidation_words', 'includes_reaction_words', 'structure_conversion_of', 'includes_impact_words', 'includes_catalyze_words', 'includes_emzyze_words', 'includes_reaction_words2']

SENTENCE:
background and aims glutamic acid decarboxylase gad, ec 41115 catalyses the conversion of glutamate to gamma-aminobutyric acid gaba

CHEMICALS:
['gaba', 'glutamate']

LABELS:
[-1 -1  0 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1
 -1 -1 -1 -1 -1 -1 -1 -1  1 -1  1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1  1 -1 -1
 -1  1 -1  1]

NEGS: 1 POS: 7

['includes_physical_words']

['includes_reaction

In [111]:
for sentence in df_true["sentence"]:
    print(sentence)
    print("")

the enzyme cyclo-oxygenase catalyses the oxygenation of arachidonic acid, leading to the formation of prostaglandins

although the exact site of decarboxylation of exogenous levodopa to dopamine in the brain is unknown, most striatal aadc is located in nigrostriatal dopaminergic nerve terminals

the enzyme dimethylarginine dimethylaminohydrolase ddah specifically hydrolyzes these asymmetrically methylated arginine residues to citrulline and methylamines

background and aims glutamic acid decarboxylase gad, ec 41115 catalyses the conversion of glutamate to gamma-aminobutyric acid gaba

mazindane 26 was found to be a pro-drug, oxidizing 5-h --> 5-oh to mazindol on rat striatal membranes and hek-hdat cells

interestingly, the allele of pro1 was shown to enhance the activities of gamma-glutamyl kinase and gamma-glutamyl phosphate reductase, both of which catalyze the first two steps of l-proline synthesis from l-glutamate and which together may form a complex in vivo

in addition to the ra

In [112]:
for sentence in df_fn["sentence"]:
    print(sentence)
    print("")
    
#to_csv

metabolism of vitamin a, all-trans-retinol, leads to the formation of 11-cis-retinaldehyde, the visual chromophore, and all-trans-retinoic acid, which is involved in the regulation of gene expression through the retinoic acid receptor

unlike mouse retsat mretsat, zretsat a had an altered bond specificity saturating either the 13-14 or 7-8 double bonds of all-trans-retinol to produce either all-trans-13,14-dihydroretinol or all-trans-7,8-dihydroretinol, respectively

tas-102 is a novel drug containing trifluorothymidine, which is phosphorylated by tk-1 to its active monophosphated form, that in turn can inhibit ts

these findings, derived from a variety of analytical and functional approaches, provide evidence for a novel nongenomic signaling mechanism for androgen action in the microvasculature tes-stimulated vasodilation mediated primarily by peroxynitrite formed from xanthine oxidase-generated superoxide and no

pip5k1b encodes phosphatidylinositol 4-phosphate 5-kinase beta type i p

In [113]:
oof = 0
for i in df_fn.index:
    negs = []
    pos = []
    
    for j in range(len(L_test[i])):
        if L_test[i][j] == 0:
            negs.append(str(lfs[j].name))
        if L_test[i][j] == 1:
            pos.append(str(lfs[j].name))
    
    print(df_fn.iloc[oof]['sentence'])
    print('')
    print(df_fn.iloc[oof]['chemicals'])
    print("")
    oof = oof + 1
    print(L_test[i])
    print('')
    print(negs)
    print('')
    print(pos)
    print('')

#for each sentence, find indices for all 0 labels and 1 labels, find corresponding LFs,
#make df? 

metabolism of vitamin a, all-trans-retinol, leads to the formation of 11-cis-retinaldehyde, the visual chromophore, and all-trans-retinoic acid, which is involved in the regulation of gene expression through the retinoic acid receptor

['11-cis-retinaldehyde', 'all-trans-retinol']

[-1 -1  0  0  0 -1  0 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1
 -1 -1 -1 -1  0 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1
 -1 -1 -1  1]

['includes_physical_words', 'includes_genetic_words', 'includes_structural_words', 'includes_functional_group', 'includes_measure_words']

['includes_reaction_words', 'structure_conversion_of', 'includes_impact_words', 'includes_reaction_words2']

unlike mouse retsat mretsat, zretsat a had an altered bond specificity saturating either the 13-14 or 7-8 double bonds of all-trans-retinol to produce either all-trans-13,14-dihydroretinol or all-trans-7,8-dihydroretinol, respectively

['all-trans-13,14-dihydroretinol', 'all-trans-7,8-dihydroretinol', 'all-

In [114]:
oof = 0
for i in df_fp.index:
    negs = []
    pos = []
    
    for j in range(len(L_test[i])):
        if L_test[i][j] == 0:
            negs.append(str(lfs[j].name))
        if L_test[i][j] == 1:
            pos.append(str(lfs[j].name))
    
    print(df_fp.iloc[oof]['sentence'])
    print('')
    print(df_fp.iloc[oof]['chemicals'])
    print('')
    oof = oof + 1
    print(L_test[i])
    print('')
    print(negs)
    print('')
    print(pos)
    print('')

omega class gsts have dehydroascorbate reductase and thioltransferase activities and also catalyze the reduction of monomethylarsonate, an intermediate in the pathway of arsenic biotransformation

['monomethylarsonate', 'arsenic']

[-1 -1 -1  0 -1 -1  0 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1  1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1  1 -1  1]

['includes_genetic_words', 'includes_functional_group']

['includes_oxidation_words', 'includes_reaction_words', 'includes_catalyze_words', 'includes_reaction_words2']

n-acetylglucosamine-1-phosphodiester alpha-n-acetylglucosaminidase ec 31445 phosphodiester alpha-glcnacase catalyzes the second step in the synthesis of the mannose 6-phosphate determinant required for efficient intracellular targeting of newly synthesized lysosomal hydrolases to the lysosome

['n-acetylglucosamine-1-phosphodiester', 'phosphodiester']

[-1 -1 -1 -1 -1 -1  0 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1
  1 -1 -1 

In [207]:

df_true[df_true["label_voter"] == -1]

df_test['label_voter'] = df_test['label_voter'].replace(-1, 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['label_voter'] = df_true['label_voter'].replace(-1, 1)


In [213]:
df_test[df_test["label_voter"] == 1]

Unnamed: 0,sentence,chemicals,truth,substrates,products,text,label_voter
0,the enzyme cyclo-oxygenase catalyses the oxyge...,"[arachidonic acid, prostaglandins]",1,"arachidonic, acid",prostaglandins,[the enzyme cyclo-oxygenase catalyses the oxyg...,1.0
1,although the exact site of decarboxylation of ...,"[levodopa, dopamine]",1,,,[although the exact site of decarboxylation of...,1.0
2,the enzyme dimethylarginine dimethylaminohydro...,"[citrulline, arginine, methylamines, dimethyla...",1,arginine,,[the enzyme dimethylarginine dimethylaminohydr...,1.0
3,background and aims glutamic acid decarboxylas...,"[gaba, glutamate]",1,glutamate,"gamma-aminobutyric, acid, GABA",[background and aims glutamic acid decarboxyla...,1.0
4,"mazindane 26 was found to be a pro-drug, oxidi...","[mazindol, mazindane, 5-oh, 5-h]",1,,,"[mazindane 26 was found to be a pro-drug, oxid...",1.0
5,"interestingly, the allele of pro1 was shown to...","[l-glutamate, l-proline, gamma-glutamyl]",1,L-glutamate,L-proline,"[interestingly, the allele of pro1 was shown t...",1.0
6,in addition to the racemization it also cataly...,"[pyruvate, l-ser]",1,,,[in addition to the racemization it also catal...,1.0
7,we concluded that fdh has no direct role in th...,"[ser, formate, co(2)]",1,formate,CO,[we concluded that fdh has no direct role in t...,1.0
8,the ratio between the gdc/shmt and c1-thf synt...,"[ser, formate, gly]",1,"[, [",Ser,[the ratio between the gdc/shmt and c1-thf syn...,1.0
9,one of the enzymes responsible for the product...,"[ka, kynurenine, oxoglutaramic acid, glutamine]",1,glutamine,"KA, oxoglutaramic, acid",[one of the enzymes responsible for the produc...,1.0


In [198]:
from sklearn.metrics import accuracy_score

print("accuracy:", accuracy_score(df_test["truth"], df_test["label_voter"]))


df_test[df_test["truth"] == 0, df_test["label_voter"] == 1]





accuracy: 0.7581329561527581


TypeError: '(0       False
1       False
2       False
3       False
4       False
        ...  
1409     True
1410     True
1411     True
1412     True
1413     True
Name: truth, Length: 1414, dtype: bool, 0        True
1       False
2       False
3        True
4        True
        ...  
1409    False
1410    False
1411     True
1412    False
1413    False
Name: label_voter, Length: 1414, dtype: bool)' is an invalid key

In [109]:
from snorkel.labeling.model import LabelModel

label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123, class_balance=[0.95, 0.05])
df_test["label_model"] = label_model.predict(L=L_test)
df_test["label_model_precent"] = label_model.predict_proba(L=L_test)[:,1]

NameError: name 'L_train' is not defined

In [110]:
def grab_second(probs):
    return (probs[1])

In [111]:
# df_train["label_probs"] = np.apply_along_axis(grab_second, 1, majority_model.predict_proba(L=L_train))
# df_train["label_probs"].value_counts()

In [112]:
df_test.to_csv(r'../labeled.csv')

In [210]:
df_test["truth"].value_counts()

0    1361
1      53
Name: truth, dtype: int64

In [211]:
only_truth_df = df_test.dropna(subset=['truth'])
only_truth_df

Unnamed: 0,sentence,chemicals,truth,substrates,products,text,label_voter
0,the enzyme cyclo-oxygenase catalyses the oxyge...,"[arachidonic acid, prostaglandins]",1,"arachidonic, acid",prostaglandins,[the enzyme cyclo-oxygenase catalyses the oxyg...,1.0
1,although the exact site of decarboxylation of ...,"[levodopa, dopamine]",1,,,[although the exact site of decarboxylation of...,1.0
2,the enzyme dimethylarginine dimethylaminohydro...,"[citrulline, arginine, methylamines, dimethyla...",1,arginine,,[the enzyme dimethylarginine dimethylaminohydr...,1.0
3,background and aims glutamic acid decarboxylas...,"[gaba, glutamate]",1,glutamate,"gamma-aminobutyric, acid, GABA",[background and aims glutamic acid decarboxyla...,1.0
4,"mazindane 26 was found to be a pro-drug, oxidi...","[mazindol, mazindane, 5-oh, 5-h]",1,,,"[mazindane 26 was found to be a pro-drug, oxid...",1.0
...,...,...,...,...,...,...,...
1409,water uptake of the polymer was only 28 and 02...,[pec],0,,,[water uptake of the polymer was only 28 and 0...,
1410,degradation of less hydrophilic pec41 with hig...,[],0,,,[degradation of less hydrophilic pec41 with hi...,
1411,"by this mechanism, ce-responsive drug in vitro...",[pec],0,,,"[by this mechanism, ce-responsive drug in vitr...",
1412,"as expected, less bovine serum albumin bsa was...",[],0,,,"[as expected, less bovine serum albumin bsa wa...",


In [182]:
import sklearn as sk
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score


In [212]:
no_abstain_df = only_truth_df[only_truth_df["label_voter"] != -1]
print("f1:", sk.metrics.f1_score(no_abstain_df["truth"], no_abstain_df["label_voter"]))
print("recall:", recall_score(no_abstain_df["truth"], no_abstain_df["label_voter"]))
print("precision:", precision_score(no_abstain_df["truth"], no_abstain_df["label_voter"]))
print("accuracy:", accuracy_score(no_abstain_df["truth"], no_abstain_df["label_voter"]))
print("balanced accuracy:", balanced_accuracy_score(no_abstain_df["truth"], no_abstain_df["label_voter"]))
tn, fp, fn, tp = confusion_matrix(no_abstain_df["truth"], no_abstain_df["label_voter"]).ravel()
print("matrix:", (tn, fp, fn, tp))

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [117]:
'''
f1: 0.38317757009345793
recall: 0.7454545454545455
precision: 0.2578616352201258
accuracy: 0.8980694980694981
balanced accuracy: 0.8251466275659824
matrix: (1122, 118, 14, 41)
*with out js lfs*
f1: 0.4833333333333333
recall: 0.5576923076923077
precision: 0.4264705882352941
accuracy: 0.9533483822422875
balanced accuracy: 0.763575989398229
matrix: (1238, 39, 23, 29)
'''

'\nf1: 0.38317757009345793\nrecall: 0.7454545454545455\nprecision: 0.2578616352201258\naccuracy: 0.8980694980694981\nbalanced accuracy: 0.8251466275659824\nmatrix: (1122, 118, 14, 41)\nwith out js\nf1: 0.4833333333333333\nrecall: 0.5576923076923077\nprecision: 0.4264705882352941\naccuracy: 0.9533483822422875\nbalanced accuracy: 0.763575989398229\nmatrix: (1238, 39, 23, 29)\n'

In [118]:
no_abstain_df = only_truth_df[only_truth_df["label_model"] != -1]
print("f1:", sk.metrics.f1_score(no_abstain_df["truth"], no_abstain_df["label_model"]))
print("recall:", recall_score(no_abstain_df["truth"], no_abstain_df["label_model"]))
print("precision:", precision_score(no_abstain_df["truth"], no_abstain_df["label_model"]))
print("accuracy:", accuracy_score(no_abstain_df["truth"], no_abstain_df["label_model"]))
print("balanced accuracy:", balanced_accuracy_score(no_abstain_df["truth"], no_abstain_df["label_model"]))
tn, fp, fn, tp = confusion_matrix(no_abstain_df["truth"], no_abstain_df["label_model"]).ravel()
print("matrix:", (tn, fp, fn, tp))

f1: 0.1836734693877551
recall: 0.15
precision: 0.23684210526315788
accuracy: 0.9434229137199435
balanced accuracy: 0.5642909896602658
matrix: (1325, 29, 51, 9)


In [141]:
'''
f1: 0.21951219512195122
recall: 0.15
precision: 0.4090909090909091
accuracy: 0.9547383309759547
balanced accuracy: 0.5701994091580502
matrix: (1341, 13, 51, 9)
*without js lfs*
f1: 0.14925373134328357
recall: 0.08333333333333333
precision: 0.7142857142857143
accuracy: 0.9596888260254597
balanced accuracy: 0.5409281142294436
matrix: (1352, 2, 55, 5)
'''

'\nf1: 0.14925373134328357\nrecall: 0.08333333333333333\nprecision: 0.7142857142857143\naccuracy: 0.9596888260254597\nbalanced accuracy: 0.5409281142294436\nmatrix: (1352, 2, 55, 5)\n'