## iGEM labeling functions

## imports

In [2]:
import pandas as pd
import numpy as np
import os
import nltk
import regex as re
import ast
from collections import defaultdict
from snorkel.labeling import labeling_function

### setting up abstain

In [159]:
ABSTAIN = -1

In [160]:
all_lfs = []

### labeling functions + small tests written

In [161]:
# includes_solution_words
# If any of the words is in a list of solution terms, we label FALSE
# https://www.hach.com/chemGlossary   
solution_terms = ["buffer", "diluent", "solute", "solvent", "saturated", "unsaturated", "saturating",
                    "saturable"]
@labeling_function()
def includes_solution_words(x):
    for word in solution_terms:
        if (word in x[0]):
            return (False)
    return ABSTAIN

all_lfs.append(includes_solution_words)

In [162]:
# includes_mixture_words
# If any of the words is in a list of mixture terms, we label FALSE
# https://www.hach.com/chemGlossary   
mixture_terms = ["suspended", "mixture", "heterogenous", "homogeneous"]
@labeling_function()
def includes_mixture_words(x):
    for word in mixture_terms:
        if (word in x[0]):
            return (False)
    return ABSTAIN

all_lfs.append(includes_mixture_words)

In [163]:
# includes_physical_words
# If there are common physical terms, we label FALSE
physical_terms = ["detection", "composed", "weight", "characteristic", "metal", "express",
    "characterization", "color", "metalic", "consists", "pure", "compose", "assay", "mm", "bound",
    "permeable", "signal", "bind", "property", "stored", "released", "capacity", "resistance", "mol"]
@labeling_function()
def includes_physical_words(x):
    for word in physical_terms:
        if (word in x[0]):
            return (False)
    return ABSTAIN

all_lfs.append(includes_physical_words)

In [164]:
# includes_genetic_words
# If one of the chemicals is in a list of genetic terms such as reductase, dna, gene, we label FALSE
genetic_terms = ["reductase", "dna", "gene", "allele", "locus", "genotype", "phenotype", "dominant", "recessive", "additive", "phenoset",
    "diallelic", "multiallelic", "polyallelic", "monomorphic", "monoallelic", "polymorphism", "mutation", "complex", "trait", "multifactorial",
    "polygenic", "monogenic", "mixed model", "transmission probability", "transition probability", "epistasis", "interaction", "pleiotropy",
    "quantitative trait locus", "probit", "logit", "penetrance", "transformation", "scale of measurement", "identity by descent", "identity in state",
    "haplotype", "phase", "multilocus", "genotype", "allelic", "association", "linkage", "disequilibrium", "gametic", "rna"]
@labeling_function()
def includes_genetic_words(x):
    for word in genetic_terms:
        if (word in x[0]):
            return False
    return ABSTAIN

all_lfs.append(includes_genetic_words)
    

In [165]:
# includes_structural_words
# If there are common structural terms, we label FALSE
structural_terms = ["loop", "sequence", "encodes", "code", "codon", "dna", "rna", "pair", "group", "active site",
    "bond", "chain", "gene", "structure", "structural", "encoding", "cdna", "cluster"]
@labeling_function()
def includes_structural_words(x):
    for word in structural_terms:
        if (word in x[0]):
            return (False)
    return ABSTAIN

all_lfs.append(includes_structural_words)

In [166]:
# includes_general_chemical_words
# If any of the words is in a list of common chemical terms such as amino acid, sugar, adenosine, amide, adenine, etc..., we label FALSE
# https://www.hach.com/chemGlossary   
chemical_terms = ["amino acid", "sugar", "adenosine", "amide", "adenine", "algaecide", "amines", "base", "biocides",
    "clarifier", "hydrocarbon", "hydrogenation", "molecule", "nutrients", "polymer", "peptide", "polypeptide",
    "tag", "functional", "activity", "electron", "cofactor", "gas"]
@labeling_function()
def includes_general_chemical_words(x):
    for word in chemical_terms:
        if (word in x[0]):
            return (False)
    return ABSTAIN

all_lfs.append(includes_general_chemical_words)

In [167]:
# includes_functional_group
# If there is a close mention of a functional chemical group, we label FALSE (as it is more likely descriptive of a structure than of a reaction)
# https://www.masterorganicchemistry.com/2010/10/06/functional-groups-organic-chemistry/
common_functional_groups = ["alkane", "alkene", "alkyne", "benzene ring", "phenyl", "amine", "alcohol", "ether", "alkyl halide", "thiol",
    "aldehyde", "ketone", "ester", "carboxylic acid", "amide", "nitrile", "epoxide", "disulfide", "imine", "acid chloride", "anhydride", "nitro",
    "sulfide", "thioether", "group", "functional"]
@labeling_function()
def includes_functional_group(x):
    for group in common_functional_groups:
        if (group in x[0]):
            return False
    return ABSTAIN

all_lfs.append(includes_functional_group)


In [168]:
# includes_amino_acid
# If there is an amino acid mentioned, we label FALSE
amino_acids = ['val', 'ile', 'leu', 'glu', 'gln', \
    'asp', 'asn' 'his' 'trp', 'phe', 'tyr',    \
    'arg', 'lys', 'ser', 'thr', 'met', 'ala',    \
    'gly', 'pro', 'cys', "amino"]
@labeling_function()
def includes_amino_acid(x):
    for group in amino_acids:
        if (group in x[0]):
            return False
    return ABSTAIN

all_lfs.append(includes_amino_acid)


In [169]:
# includes_paper_artifacts
# If there are common words from paper headers/footers, we label FALSE
common_terms = ["university", "univ", "pharma", "avenue", "street", "road", "department", "usa", "reference", "ref",
    "keyword", "article", "http", "png", "jpg"]
@labeling_function()
def includes_paper_artifacts(x):
    for word in common_terms:
        if (word in x[0]):
            return (False)
    return ABSTAIN

all_lfs.append(includes_paper_artifacts)

In [170]:
# includes_no_terms
# If there are words stopping or with no change, we label FALSE
common_no_terms = ["inactivated", "unaffected", "inactive", "inactivates", "stops", "prevent", "inhibit",
    "denature", "block"]
@labeling_function()
def includes_no_terms(x):
    for word in common_no_terms:
        if (word in x[0]):
            return (False)
    return ABSTAIN

all_lfs.append(includes_no_terms)

In [171]:
def helper_sep_chems_with_or(chemicals):
    final = ""
    for chem in chemicals:
        if (final == ""):
            final += chem
        else:
            final += "|" + chem
    return final

In [172]:
# structure_adjacent_mentions
# If the chemicals are adjacent, we label FALSE
@labeling_function()
def structure_adjacent_mentions(x):
    chemicals = helper_sep_chems_with_or(x[1])
    structure = "(" + chemicals + ") (" + chemicals + ")"
    if (re.search(structure, x[0])):
        return False
    return ABSTAIN

all_lfs.append(structure_adjacent_mentions)

In [173]:
# structure_sep_or
# If the chemicals are separated by or, we label FALSE
@labeling_function()
def structure_sep_or(x):
    chemicals = helper_sep_chems_with_or(x[1])
    structure = "(" + chemicals + ") or (" + chemicals + ")"
    if (re.search(structure, x[0])):
        return False
    return ABSTAIN

all_lfs.append(structure_sep_or)

In [174]:
# structure_sep_and
# If the chemicals are separated by and, we label FALSE
@labeling_function()
def structure_sep_and(x):
    chemicals = helper_sep_chems_with_or(x[1])
    structure = "(" + chemicals + ") and (" + chemicals + ")"
    if (re.search(structure, x[0])):
        return False
    return ABSTAIN

all_lfs.append(structure_sep_and)

In [175]:
# structure_sep_comma
# If the chemicals are separated by a comma, we label FALSE
@labeling_function()
def structure_sep_comma(x):
    chemicals = helper_sep_chems_with_or(x[1])
    structure = "(" + chemicals + "), (" + chemicals + ")"
    if (re.search(structure, x[0])):
        return False
    return ABSTAIN

all_lfs.append(structure_sep_comma)

In [176]:
# LF_sep_via
# If the chemicals are separated by via, in, etc, we label FALSE
via_terms = "via|by|in"
@labeling_function()
def structure_sep_via(x):
    chemicals = helper_sep_chems_with_or(x[1])
    structure = "(" + chemicals + ") (" + via_terms + ") (" + chemicals + ")"
    if (re.search(structure, x[0])):
        return False
    return ABSTAIN

all_lfs.append(structure_sep_via)

In [177]:
# structure_sep_sym
# If the chemicals are separated by a single character, we label FALSE  
@labeling_function()
def structure_sep_sym(x):
    chemicals = helper_sep_chems_with_or(x[1])
    structure = "(" + chemicals + ").(" + chemicals + ")"
    if (re.search(structure, x[0])):
        return False
    return ABSTAIN

all_lfs.append(structure_sep_via)

In [178]:
# structure_followed_by_ase
# If one of the chemicals is followed by a word that ends with -ase, we label FALSE
@labeling_function()
def structure_followed_by_ase(x):
    chemicals = helper_sep_chems_with_or(x[1])
    structure = "(" + chemicals + r") \w*ase\b"
    if (re.search(structure, x[0])):
        return False
    return ABSTAIN

all_lfs.append(structure_followed_by_ase)

In [236]:
# structure_followed_by_noun
# If one of the chemicals is followed by a noun, we label FALSE
@labeling_function()
def structure_followed_by_noun(x):
    sentence = x[0].replace(',', '')
    chemicals = helper_sep_chems_with_or(x[1])
    if (chemicals == ""):
        return ABSTAIN
    structure = "(" + chemicals + r") \w+"
    for match in re.finditer(structure, sentence):
        if (nltk.pos_tag([match.group(0).split()[1]])[0][1] == "NN"):
            return False
    return ABSTAIN

all_lfs.append(structure_followed_by_noun)

In [232]:
# structure_sep_verb
# If the chemicals are separated by a verb, we label TRUE
@labeling_function()
def structure_sep_verb(x):
    sentence = x[0].replace(',', '')
    chemicals = helper_sep_chems_with_or(x[1])
    if (chemicals == ""):
        return ABSTAIN
    structure = "(" + chemicals + r") \w+ (" + chemicals + ")"
    for match in re.finditer(structure, sentence):
        if (re.match(r"(\bVB|NNS)", nltk.pos_tag([match.group(0).split()[1]])[0][1])):
            return True
    return ABSTAIN

all_lfs.append(structure_sep_verb)

In [233]:
# structure_sep_adverb
# If the chemicals are separated by a adverb, we label FALSE
@labeling_function()
def structure_sep_adverb(x):
    sentence = x[0].replace(',', '')
    chemicals = helper_sep_chems_with_or(x[1])
    if (chemicals == ""):
        return ABSTAIN
    structure = "(" + chemicals + r") \w+ (" + chemicals + ")"
    for match in re.finditer(structure, sentence):
        if (re.match(r"(\bRB)", nltk.pos_tag([match.group(0).split()[1]])[0][1])):
            return False
    return ABSTAIN

all_lfs.append(structure_sep_adverb)

In [234]:
structure_sep_adverb(["cassie crazily night away", ["cassie", "night"]])

False

In [99]:
# includes_oxidation_words
# If the sentence contains oxidation words, we label True
oxidation_terms = "oxidiz|oxidis|redox|reduc|rust|corrod"
@labeling_function()
def includes_oxidation_words(x):
    structure = "(" + oxidation_terms + ")"
    if (re.search(structure, x[0])):
        return True
    return ABSTAIN

all_lfs.append(includes_oxidation_words)

In [100]:
# includes_combustion_words
# If the sentence contains combustion words, we label True
combustion_terms = "combust|burn|explod|gas-form"
@labeling_function()
def includes_combustion_words(x):
    structure = "(" + combustion_terms + ")"
    if (re.search(structure, x[0])):
        return True
    return ABSTAIN

all_lfs.append(includes_combustion_words)

In [101]:
# includes_neutralization_words
# If the sentence contains neutralization words, we label True
neutralization_terms = "neutraliz|titrat|buffer|gas-form"
@labeling_function()
def includes_neutralization_words(x):
    structure = "(" + neutralization_terms + ")"
    if (re.search(structure, x[0])):
        return True
    return ABSTAIN

all_lfs.append(includes_neutralization_words)

In [35]:
# LF_includes_catalyze_words
# If the sentence contains catalyze words, we label True
# catalyze_words = ["catalyze", "catalyst", "catalyse", "catalysing", "catalyzing"]
# @labeling_function()
# def includes_catalyze_words(x):
#     for word in catalyze_words:
#         if(word in x[0]):
#             return True
#     return ABSTAIN

In [102]:
# includes_combination_words
# If the sentence contains combination words, we label True
combination_terms = "combin"
@labeling_function()
def includes_combination_words(x):
    structure = "(" + combination_terms + ")"
    if (re.search(structure, x[0])):
        return True
    return ABSTAIN

all_lfs.append(includes_combination_words)

In [103]:
# includes_decomposition_words
# If the sentence contains decomposition words, we label True
decomposition_terms = "decompos"
@labeling_function()
def includes_decomposition_words(x):
    structure = "(" + decomposition_terms + ")"
    if (re.search(structure, x[0])):
        return True
    return ABSTAIN

all_lfs.append(includes_decomposition_words)

In [104]:
# includes_replacement_words
# If the sentence contains decomposition words, we label True
replacement_terms = "replac"
@labeling_function()
def includes_replacement_words(x):
    structure = "(" + replacement_terms + ")"
    if (re.search(structure, x[0])):
        return True
    return ABSTAIN

all_lfs.append(includes_replacement_words)

In [105]:
# includes_reaction_words
# If the sentence contains reactions words, we label True
reaction_terms = "conver|yield|produc|mak|creat|synthesiz|synthesis|transform|ferment|break|displac|exchang" + \
                 "precipit|transfer|through|produc|activat|revers|form|ation|metaboliz|metabolis|generat|hydroly"
@labeling_function()
def includes_reaction_words(x):
    structure = "(" + reaction_terms + ")"
    if (re.search(structure, x[0])):
        return True
    return ABSTAIN

all_lfs.append(includes_reaction_words)

In [106]:
# includes_react
# If the sentence contains react, we label True
@labeling_function()
def includes_react(x):
    if("react" in x[0]):
        return True
    return ABSTAIN

all_lfs.append(includes_react)

In [107]:
# includes_reaction_component_words
# If the sentence contains reaction components words, we label True
reaction_component_terms = ["substrate", "product", "reactant", "step"]
@labeling_function()
def includes_reaction_component_words(x):
    for word in reaction_component_terms:
        if(word in x[0]):
            return True
    return ABSTAIN

all_lfs.append(includes_reaction_component_words)

In [108]:
# includes_comparison_words
# If the sentence contains comparison words, we label False
comparison_terms = ["similar", "more", "greater", "less", "increase", "decrease", "compare", "comparing",
    "difference", "differ", "relative", "better", "times", "than"]
@labeling_function()
def includes_comparison_words(x):
    for word in comparison_terms:
        if(word in x[0]):
            return False
    return ABSTAIN

all_lfs.append(includes_comparison_words)

In [109]:
# includes_concentration
# If the sentence contains react, we label True
@labeling_function()
def includes_concentration(x):
    if("concentration" in x[0]):
        return True
    return ABSTAIN

all_lfs.append(includes_concentration)

In [110]:
# includes_measure_words
# If the sentence contains measure words, we label False
measure_terms = ["high", "low", "ph", "stability", "corelated", "more", "less", "level", "degree", "time",
                "measure"]
@labeling_function()
def includes_measure_words(x):
    for word in measure_terms:
        if(word in x[0]):
            return False
    return ABSTAIN

all_lfs.append(includes_measure_words)

In [111]:
# includes_experiment_words
# If the sentence contains experiement words, we label False
experiment_terms = ["mice", "cell", "mouse", "ovary", "male", "female", "animal", "study", "method",
    "test", "treat", "protection", "brain", "nerve", "human", "tissue", "fetal", "vitro", "studies",
    "membrane", "strain", "mutant", "regulate", "dependent", "drug", "therapy", "oral", "test", "autoantigen"]
@labeling_function()
def includes_experiment_words(x):
    for word in experiment_terms:
        if(word in x[0]):
            return False
    return ABSTAIN

all_lfs.append(includes_experiment_words)

In [114]:
# structure_sep_converstion_words
# If the sentence contains to, from, into, etc., we label True
conversion_terms = "to|from|into|becom|became"
@labeling_function()
def structure_sep_converstion_words(x):
    sentence = x[0].replace(',', '')
    chemicals = helper_sep_chems_with_or(x[1])
    structure = "((" + chemicals + r") (" + conversion_terms + r")\b|((" + conversion_terms + r")\b (" + chemicals + ")))"
    if (re.search(structure, sentence)):
            return True
    return ABSTAIN

all_lfs.append(structure_sep_converstion_words)

In [118]:
# includes_one_chem
# If the sentence contains one identified chemical, we label False
@labeling_function()
def includes_one_chem(x):
    if(len(x[1]) == 1):
            return False
    return ABSTAIN

all_lfs.append(includes_one_chem)

### importing data in and cleaning it (this data has truth values)

In [129]:
# have this csv file in the same folder
sentence_df = pd.read_csv("sentence_data_cleaned_csv_fixed.csv")

In [130]:
sentence_chem_df = sentence_df[["sentence", "chemicals", "truth", "substrates", "products"]]
sentence_chem_df.head()

Unnamed: 0,sentence,chemicals,truth,substrates,products
0,The enzyme cyclo-oxygenase catalyses the oxyge...,prostaglandins,1.0,"arachidonic, acid",prostaglandins
1,Recently two forms of cyclo-oxygenase have bee...,,0.0,,
2,Constitutive and inducible forms of human cycl...,,0.0,,
3,hCOX-1 had a specific activity of 18.8 mumol o...,arachidonate,0.0,arachidonate,
4,"of 1500 nmol of O2/nmol of enzyme, whereas hCO...",arachidonate,0.0,arachidonate,


In [131]:
greek_alphabet = {
    u'\u0393': 'Gamma',
    u'\u0394': 'Delta',
    u'\u0398': 'Theta',
    u'\u039B': 'Lamda',
    u'\u039E': 'Xi',
    u'\u03A0': 'Pi',
    u'\u03A3': 'Sigma',
    u'\u03A6': 'Phi',
    u'\u03A7': 'Chi',
    u'\u03A8': 'Psi',
    u'\u03A9': 'Omega',
    u'\u03B1': 'alpha',
    u'\u03B2': 'beta',
    u'\u03B3': 'gamma',
    u'\u03B4': 'delta',
    u'\u03B5': 'epsilon',
    u'\u03B6': 'zeta',
    u'\u03B7': 'eta',
    u'\u03B8': 'theta',
    u'\u03B9': 'iota',
    u'\u03BA': 'kappa',
    u'\u03BB': 'lamda',
    u'\u03BC': 'mu',
    u'\u03BD': 'nu',
    u'\u03BE': 'xi',
    u'\u03BF': 'omicron',
    u'\u03C0': 'pi',
    u'\u03C1': 'rho',
    u'\u03C3': 'sigma',
    u'\u03C4': 'tau',
    u'\u03C5': 'upsilon',
    u'\u03C6': 'phi',
    u'\u03C7': 'chi',
    u'\u03C8': 'psi',
    u'\u03C9': 'omega'
}

In [132]:
def remove_greek(words):
    for letter in greek_alphabet.keys():
        words = words.replace(letter, greek_alphabet[letter])
    return words

In [133]:
import math
sentence_chem_df["chemicals"] = sentence_chem_df["chemicals"].replace(np.NaN, "0")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentence_chem_df["chemicals"] = sentence_chem_df["chemicals"].replace(np.NaN, "0")


In [134]:
def chem_into_array(chemicals):
    if (chemicals == "0"):
        return []
    chemicals = remove_greek(chemicals)
    chemicals = chemicals.lower()
    sample_chems = chemicals.replace("%20", " ")
    sample_chems_list = sample_chems.split(", ")
    for index in range(len(sample_chems_list)):
        while(sample_chems_list[index][-1] == ","):
            sample_chems_list[index] = sample_chems_list[index][:-1]
    sample_chems_list = list(set(sample_chems_list))
    return sample_chems_list

In [135]:
def sentence_cleaned(sentence):
    sentence = remove_greek(sentence)
    sentence = sentence.lower()
    sentence = sentence.replace(".", "")
    sentence = sentence.replace(")", "")
    sentence = sentence.replace("(", "")
    sentence = sentence.replace(";", "")
    sentence = sentence.replace(":", "")
    return sentence

In [136]:
sentence_chem_df["sentence"] = sentence_chem_df["sentence"].apply(sentence_cleaned)
sentence_chem_df["truth"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentence_chem_df["sentence"] = sentence_chem_df["sentence"].apply(sentence_cleaned)


0.0    342
1.0     18
Name: truth, dtype: int64

In [137]:
sentence_chem_df["chemicals"] = sentence_chem_df["chemicals"].apply(chem_into_array)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentence_chem_df["chemicals"] = sentence_chem_df["chemicals"].apply(chem_into_array)


In [138]:
sentence_chem_df["text"] = sentence_chem_df[["sentence", "chemicals"]].values.tolist()
sentence_chem_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentence_chem_df["text"] = sentence_chem_df[["sentence", "chemicals"]].values.tolist()


Unnamed: 0,sentence,chemicals,truth,substrates,products,text
0,the enzyme cyclo-oxygenase catalyses the oxyge...,[prostaglandins],1.0,"arachidonic, acid",prostaglandins,[the enzyme cyclo-oxygenase catalyses the oxyg...
1,recently two forms of cyclo-oxygenase have bee...,[],0.0,,,[recently two forms of cyclo-oxygenase have be...
2,constitutive and inducible forms of human cycl...,[],0.0,,,[constitutive and inducible forms of human cyc...
3,hcox-1 had a specific activity of 188 mumol of...,[arachidonate],0.0,arachidonate,,[hcox-1 had a specific activity of 188 mumol o...
4,"of 1500 nmol of o2/nmol of enzyme, whereas hco...",[arachidonate],0.0,arachidonate,,"[of 1500 nmol of o2/nmol of enzyme, whereas hc..."
...,...,...,...,...,...,...
1409,water uptake of the polymer was only 28 and 02...,[pec],0.0,,,[water uptake of the polymer was only 28 and 0...
1410,degradation of less hydrophilic pec41 with hig...,[],0.0,,,[degradation of less hydrophilic pec41 with hi...
1411,"by this mechanism, ce-responsive drug in vitro...",[pec],0.0,,,"[by this mechanism, ce-responsive drug in vitr..."
1412,"as expected, less bovine serum albumin bsa was...",[],0.0,,,"[as expected, less bovine serum albumin bsa wa..."


### data with no truth values

In [139]:
# have this data one folder up
uncleaned_df = pd.read_csv("../sentence_annotations_elsevier_pmid_split6.csv")

In [68]:
uncleaned_no_na_df = uncleaned_df.dropna()
uncleaned_several_chem_df = uncleaned_no_na_df.loc[uncleaned_no_na_df["chemical_names"].str.contains(",")]
uncleaned_several_chem_df = uncleaned_several_chem_df[["sentence", "chemical_names"]]

In [69]:
uncleaned_several_chem_df["sentence"] = uncleaned_several_chem_df["sentence"].apply(sentence_cleaned)
uncleaned_several_chem_df["chemical_names"] = uncleaned_several_chem_df["chemical_names"].apply(chem_into_array)

In [71]:
# need to get rid of this line if this data should be used
re_check_several_chem_uncleaned_df = uncleaned_several_chem_df.loc[uncleaned_several_chem_df["chemical_names"].str.len() > 1]

In [73]:
re_check_several_chem_uncleaned_df["text"] = re_check_several_chem_uncleaned_df[["sentence", "chemical_names"]].values.tolist()
re_check_several_chem_uncleaned_df.iloc[47]["text"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  re_check_several_chem_uncleaned_df["text"] = re_check_several_chem_uncleaned_df[["sentence", "chemical_names"]].values.tolist()


['univ sydney, dept pharm, sydney, nsw 2006, australia abstract two distinct mg2+-atpase activities were isolated from triton x-100-solubilized human erythrocyte membranes using a combination of calmodulin-agarose to remove ca2+-atpase and ion exchange chromatography to separate the mg2+-atpase activities',
 ['ca2+', 'mg2+']]

### snorkel code for model (running on data with truth values)

In [141]:
from snorkel.labeling import PandasLFApplier

In [142]:
sentence_chem_df

Unnamed: 0,sentence,chemicals,truth,substrates,products,text
0,the enzyme cyclo-oxygenase catalyses the oxyge...,[prostaglandins],1.0,"arachidonic, acid",prostaglandins,[the enzyme cyclo-oxygenase catalyses the oxyg...
1,recently two forms of cyclo-oxygenase have bee...,[],0.0,,,[recently two forms of cyclo-oxygenase have be...
2,constitutive and inducible forms of human cycl...,[],0.0,,,[constitutive and inducible forms of human cyc...
3,hcox-1 had a specific activity of 188 mumol of...,[arachidonate],0.0,arachidonate,,[hcox-1 had a specific activity of 188 mumol o...
4,"of 1500 nmol of o2/nmol of enzyme, whereas hco...",[arachidonate],0.0,arachidonate,,"[of 1500 nmol of o2/nmol of enzyme, whereas hc..."
...,...,...,...,...,...,...
1409,water uptake of the polymer was only 28 and 02...,[pec],0.0,,,[water uptake of the polymer was only 28 and 0...
1410,degradation of less hydrophilic pec41 with hig...,[],0.0,,,[degradation of less hydrophilic pec41 with hi...
1411,"by this mechanism, ce-responsive drug in vitro...",[pec],0.0,,,"[by this mechanism, ce-responsive drug in vitr..."
1412,"as expected, less bovine serum albumin bsa was...",[],0.0,,,"[as expected, less bovine serum albumin bsa wa..."


In [143]:
# how to find the location
sentence_chem_df[sentence_chem_df["sentence"] == "even though the activities of mat and gnmt were elevated, the concentration of liver s-adenosylmethionine was decreased 24%, p<0001 and s-adenosylhomocysteine increased 113%, p<0001 in the dwarf mice"]

Unnamed: 0,sentence,chemicals,truth,substrates,products,text
318,even though the activities of mat and gnmt wer...,"[s-adenosylhomocysteine, s-adenosylmethionine]",,S-adenosylmethionine,S-adenosylhomocysteine,[even though the activities of mat and gnmt we...


In [258]:
# Define the set of labeling functions (LFs)
# currently excluding amino_acid and followed_ase and followed_by_noun
lfs = [includes_solution_words, includes_mixture_words, includes_physical_words, includes_genetic_words, includes_structural_words,
      includes_general_chemical_words, includes_functional_group, includes_paper_artifacts, includes_no_terms, structure_adjacent_mentions,
      structure_sep_or, structure_sep_comma, structure_sep_via, structure_sep_sym, structure_sep_adverb, includes_oxidation_words,
      structure_sep_verb, structure_sep_converstion_words, includes_combustion_words, includes_neutralization_words, includes_combination_words,
      includes_decomposition_words, includes_replacement_words, includes_reaction_words, includes_reaction_component_words, includes_comparison_words,
      includes_one_chem, includes_react, includes_measure_words, includes_experiment_words, includes_concentration, structure_sep_and]
      # includes_amino_acid, structure_followed_by_ase, structure_followed_by_noun]

# removing physical_words increases recall but causes large drop in precision
# sep_conversion_word and sep_verb removal increase precision to 0.71 with recall at 0.38
# Apply the LFs to the unlabeled training data
df_train = sentence_chem_df
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df_train)

100%|██████████| 1414/1414 [00:02<00:00, 617.11it/s]


In [259]:
L_train[1]

array([-1, -1,  0, -1, -1, -1, -1, -1, -1,  0, -1,  0,  0,  0, -1, -1, -1,
        1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1,  0, -1,  0])

In [260]:
from snorkel.labeling import LFAnalysis

LFAnalysis(L=L_train, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
includes_solution_words,0,[0],0.012023,0.012023,0.010608
includes_mixture_words,1,[0],0.001414,0.001414,0.001414
includes_physical_words,2,[0],0.304809,0.304809,0.193777
includes_genetic_words,3,[0],0.212871,0.212871,0.153465
includes_structural_words,4,[0],0.194484,0.194484,0.137907
includes_general_chemical_words,5,[0],0.251061,0.250354,0.166902
includes_functional_group,6,[0],0.173975,0.173975,0.127298
includes_paper_artifacts,7,[0],0.041726,0.041726,0.024045
includes_no_terms,8,[0],0.192362,0.191655,0.125884
structure_adjacent_mentions,9,[0],0.39604,0.39604,0.235502


In [261]:
LFAnalysis(L_train).label_coverage()

0.9985855728429985

In [262]:
from snorkel.labeling.model import MajorityLabelVoter
from snorkel.labeling.model import LabelModel

In [263]:
majority_model = MajorityLabelVoter()
df_train["label"] = majority_model.predict(L=L_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["label"] = majority_model.predict(L=L_train)


In [264]:
df_train["label"].value_counts()

 0    1295
-1      76
 1      43
Name: label, dtype: int64

In [265]:
def grab_second(probs):
    return (probs[1])

In [277]:
df_train["label_probs"] = np.apply_along_axis(grab_second, 1, majority_model.predict_proba(L=L_train))
df_train["label_probs"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["label_probs"] = np.apply_along_axis(grab_second, 1, majority_model.predict_proba(L=L_train))


0.0    1295
0.5      76
1.0      43
Name: label_probs, dtype: int64

In [267]:
df_train.to_csv(r'../labeled.csv')

In [268]:
df_train["truth"].value_counts()

0.0    342
1.0     18
Name: truth, dtype: int64

In [269]:
only_truth_df = df_train.dropna(subset=['truth'])
only_truth_df

Unnamed: 0,sentence,chemicals,truth,substrates,products,text,label,label_probs
0,the enzyme cyclo-oxygenase catalyses the oxyge...,[prostaglandins],1.0,"arachidonic, acid",prostaglandins,[the enzyme cyclo-oxygenase catalyses the oxyg...,0,0.0
1,recently two forms of cyclo-oxygenase have bee...,[],0.0,,,[recently two forms of cyclo-oxygenase have be...,0,0.0
2,constitutive and inducible forms of human cycl...,[],0.0,,,[constitutive and inducible forms of human cyc...,0,0.0
3,hcox-1 had a specific activity of 188 mumol of...,[arachidonate],0.0,arachidonate,,[hcox-1 had a specific activity of 188 mumol o...,0,0.0
4,"of 1500 nmol of o2/nmol of enzyme, whereas hco...",[arachidonate],0.0,arachidonate,,"[of 1500 nmol of o2/nmol of enzyme, whereas hc...",0,0.0
...,...,...,...,...,...,...,...,...
1409,water uptake of the polymer was only 28 and 02...,[pec],0.0,,,[water uptake of the polymer was only 28 and 0...,0,0.0
1410,degradation of less hydrophilic pec41 with hig...,[],0.0,,,[degradation of less hydrophilic pec41 with hi...,0,0.0
1411,"by this mechanism, ce-responsive drug in vitro...",[pec],0.0,,,"[by this mechanism, ce-responsive drug in vitr...",-1,0.5
1412,"as expected, less bovine serum albumin bsa was...",[],0.0,,,"[as expected, less bovine serum albumin bsa wa...",0,0.0


In [270]:
import sklearn as sk
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score


In [271]:
no_abstain_df = only_truth_df[only_truth_df["label"] != -1]
sk.metrics.f1_score(no_abstain_df["truth"], no_abstain_df["label"])

0.7000000000000001

In [272]:
recall_score(no_abstain_df["truth"], no_abstain_df["label"])

0.6363636363636364

In [273]:
precision_score(no_abstain_df["truth"], no_abstain_df["label"])

0.7777777777777778

In [274]:
accuracy_score(no_abstain_df["truth"], no_abstain_df["label"])

0.9821428571428571

In [275]:
balanced_accuracy_score(no_abstain_df["truth"], no_abstain_df["label"])

0.8151048951048951

In [276]:
tn, fp, fn, tp = confusion_matrix(no_abstain_df["truth"], no_abstain_df["label"]).ravel()
(tn, fp, fn, tp)

(323, 2, 4, 7)

In [257]:
df_train[df_train["label"] != -1]["truth"].value_counts()

0.0    338
1.0     12
Name: truth, dtype: int64

In [140]:
df_train[df_train["label"] != -1]["label"].value_counts()

0    1063
1     122
Name: label, dtype: int64

In [141]:
matched_df = df_train[df_train["label"] == df_train["truth"]]
matched_df["label"].value_counts()

0    262
1     14
Name: label, dtype: int64