## iGEM labeling functions

## example

In [3]:
import pandas as pd
import numpy as np
import os
import nltk
import regex as re
import ast
from collections import defaultdict
from snorkel.labeling import labeling_function

### test data written by hand

In [23]:
test_data = [
    [["carbon", "oxygen"], "carbon was oxidized by the oxygen"],
    [["carbon", "oxygen", "amino acid", "cassie"], "the cassie amino acid is connected to the carbon which was oxidixed by the oxygen inside the dna"],
    
    [["L-GalDH"], "The active fractions from the gel filtration column were further separated by native PAGE, followed by staining for L-GalDH activity."],
    [["L-GalDH"], "Southern blot analysis revealed that the spinach L-GalDH gene occurs in a single copy."],
    [["L-GalL dehydrogenase"], "We reported that transcripts of L-GalL dehydrogenase (L-GalLDH), which functions at the last step of the biosynthetic pathway, were expressed in leaves, stems and roots at almost the same level (Yabuta et al. 2000)."],
    [["L-GalDH", "dehydro-AsA", "L-GalL"], "The L-GalDH activity was not affected by the addition of 1 mM dehydro-AsA (DHA) or 1 mM L-GalL."]
]

### setting up abstain

In [24]:
ABSTAIN = -1

### labeling functions + small tests written

In [154]:
# LF_solution_words
# If any of the words is in a list of solution terms, we label FALSE
# https://www.hach.com/chemGlossary   
solution_terms = ["buffer", "diluent", "solute", "solvent", "saturated", "unsaturated"]
@labeling_function()
def solution_words(x):
    for word in solution_terms:
        if (word in x[0]):
            return (False)
    return ABSTAIN

In [155]:
# LF_mixture_words
# If any of the words is in a list of mixture terms, we label FALSE
# https://www.hach.com/chemGlossary   
mixture_terms = ["suspended", "mixture", "heterogenous", "homogeneous"]
@labeling_function()
def mixture_words(x):
    for word in mixture_terms:
        if (word in x[0]):
            return (False)
    return ABSTAIN

In [195]:
# LF_physical_words
# If there are common physical terms, we label FALSE
physical_terms = ["purified", "concentration", "detection", "composed", "weight", "characteristic", "metal",
    "characterization", "color", "metalic", "characterized", "character", "consists", "pure", "compose", "assay"]
@labeling_function()
def physical_words(x):
    for word in physical_terms:
        if (word in x[0]):
            return (False)
    return ABSTAIN

In [157]:
# LF_gene_words
# If one of the chemicals is in a list of genetic terms such as reductase, dna, gene, we label FALSE
# 
genetic_terms = ["reductase", "dna", "gene", "allele", "locus", "genotype", "phenotype", "dominant", "recessive", "additive", "phenoset",
    "diallelic", "multiallelic", "polyallelic", "monomorphic", "monoallelic", "polymorphism", "mutation", "complex", "trait", "multifactorial",
    "polygenic", "monogenic", "mixed model", "transmission probability", "transition probability", "epistasis", "interaction", "pleiotropy",
    "quantitative trait locus", "probit", "logit", "penetrance", "transformation", "scale of measurement", "identity by descent", "identity in state",
    "haplotype", "phase", "multilocus", "genotype", "allelic", "association", "linkage", "disequilibrium", "gametic", "rna"]
@labeling_function()
def gene_words(x):
    for word in genetic_terms:
        if (word in x[0]):
            return False
    return ABSTAIN
    

In [206]:
# LF_structural_words
# If there are common structural terms, we label FALSE
structural_terms = ["loop", "sequence", "encodes", "code", "codon", "dna", "rna", "pair", "group", "active site",
    "bond", "chain", "gene", "structure", "structural", "encoding", "cdna"]
@labeling_function()
def structural_words(x):
    for word in structural_terms:
        if (word in x[0]):
            return (False)
    return ABSTAIN

In [159]:
# LF_general_chemical
# If any of the words is in a list of common chemical terms such as amino acid, sugar, adenosine, amide, adenine, etc..., we label FALSE
# https://www.hach.com/chemGlossary   
chemical_terms = ["amino acid", "sugar", "adenosine", "amide", "adenine", "algaecide", "amines", "base", "biocides",
    "clarifier", "hydrocarbon", "hydrogenation", "molecule", "nutrients", "polymer", "peptide", "polypeptide",
    "tag", "functional", "residue", "activity", "enzyme"]
@labeling_function()
def general_chemical(x):
    for word in chemical_terms:
        if (word in x[0]):
            return (False)
    return ABSTAIN

In [160]:
# LF_group
# If there is a close mention of a functional chemical group, we label FALSE (as it is more likely descriptive of a structure than of a reaction)
# https://www.masterorganicchemistry.com/2010/10/06/functional-groups-organic-chemistry/
common_functional_groups = ["alkane", "alkene", "alkyne", "benzene ring", "phenyl", "amine", "alcohol", "ether", "alkyl halide", "thiol",
    "aldehyde", "ketone", "ester", "carboxylic acid", "amide", "nitrile", "epoxide", "disulfide", "imine", "acid chloride", "anhydride", "nitro",
    "sulfide", "thioether", "group", "functional"]
@labeling_function()
def group(x):
    for group in common_functional_groups:
        if (group in x[0]):
            return False
    return ABSTAIN


In [161]:
# LF_includes_amino_acids
# If there is a amino acid mentioned, we label FALSE
amino_acids = ['VAL', 'ILE', 'LEU', 'GLU', 'GLN', \
    'ASP', 'ASN' 'HIS' 'TRP', 'PHE', 'TYR',    \
    'ARG', 'LYS', 'SER', 'THR', 'MET', 'ALA',    \
    'GLY', 'PRO', 'CYS', "amino"]
@labeling_function()
def includes_amino_acids(x):
    for group in amino_acids:
        if (group in x[0]):
            return False
    return ABSTAIN


In [162]:
# LF_paper_artifacts
# If there are common words from paper headers/footers, we label FALSE
common_terms = ["university", "univ", "pharma", "avenue", "street", "road", "department", "usa", "reference", "ref",
    "keyword", "article", "http", "png", "jpg"]
@labeling_function()
def paper_artifacts(x):
    for word in common_terms:
        if (word in x[0]):
            return (False)
    return ABSTAIN

In [163]:
# LF_no_terms
# If there are words stopping or with no change, we label FALSE
common_no_terms = ["inactivated", "unaffected", "inactive", "inactivates", "stops", "prevent", "inhibit",
    "denature"]
@labeling_function()
def no_terms(x):
    for word in common_no_terms:
        if (word in x[0]):
            return (False)
    return ABSTAIN

In [164]:
# LF_adjacent_mentions
# If the chemicals are adjacent, we label FALSE
@labeling_function()
def adjacent_mentions(x):
    for index in range(len(x[1])):
        for index_2 in range(len(x[1])):
            if ((x[1][index] + " " + x[1][index_2]) in x[0]):
                return False
    return ABSTAIN

In [165]:
# LF_sep_or
# If the chemicals are separated by or, we label FALSE
@labeling_function()
def sep_or(x):
    for index in range(len(x[1])):
        for index_2 in range(len(x[1])):
            if (x[1][index] + " or " + x[1][index_2] in x[0]):
                return False
    return ABSTAIN

In [166]:
# LF_sep_and
# If the chemicals are separated by and, we label FALSE
@labeling_function()
def sep_and(x):
    for index in range(len(x[1])):
        for index_2 in range(len(x[1])):
            if (x[1][index] + " and " + x[1][index_2] in x[0]):
                return False
    return ABSTAIN

In [167]:
# LF_sep_comma
# If the chemicals are separated by a comma, we label FALSE
@labeling_function()
def sep_comma(x):
    for index in range(len(x[1])):
        for index_2 in range(len(x[1])):
            if (x[1][index] + ", " + x[1][index_2] in x[0]):
                return False
    return ABSTAIN

In [168]:
# LF_sep_via
# If the chemicals are separated by via, in, etc, we label FALSE
common_via_words = ["via", "in"]
@labeling_function()
def sep_via(x):
    for index in range(len(x[1])):
        for index_2 in range(len(x[1])):
            for term in common_via_words:
                if (x[1][index] + " " + term + " " + x[1][index_2] in x[0]):
                    return False
    return ABSTAIN

In [169]:
# LF_sep_sym
# If the chemicals are separated by a single character, we label FALSE  
@labeling_function()
def sep_sym(x):
    for index in range(len(x[1])):
        chem_1_index = x[0].find(x[1][index])
        chem_1_len = len(x[1][index])
        while (0 <= chem_1_index < len(x[0])):
            for index_2 in range(len(x[1])):
                chem_2_index = x[0].find(x[1][index_2])
                while (0 <= chem_2_index < len(x[0])):
                    if (sep_sym_helper(chem_1_index, chem_1_len, chem_2_index)):
                        return False
                    chem_2_index = x[0].find(x[1][index_2], chem_2_index + 1)
            chem_1_index = x[0].find(x[1][index], chem_1_index + 1)
    return ABSTAIN

def sep_sym_helper(index_1, length, index_2):
    if (index_1 + length + 1== index_2):
        return True
    return False

In [170]:
# testing separated by a single character (ensuring it works with repeated chemicals)
sep_sym(["cassie is here and her name cassie cas is cas cassie", ["cassie", "cas"]])

False

In [171]:
# LF_followed_ase
# If one of the chemicals is followed by a word that ends with -ase, we label FALSE
@labeling_function()
def followed_ase(x):
    sentence = x[0].replace('.', '')
    sentence = sentence.replace(',', '')
    words = sentence.split(" ")
    for chem in x[1]:
        index = -1
        while (index < len(words)):
            if not(chem in words[index + 1:]):
                index = len(words)
                break
            if (" " in chem):
                chem_split = chem.split(" ")
                index = words.index(chem_split[-1], index + 1)
            else:
                index = words.index(chem, index + 1)
            if (0 < index + 1 < len(words) and words[index + 1][-3:] == "ase"):
                return False
    return ABSTAIN

In [172]:
# test followed_ase (making sure it works if the chem is repeated)
followed_ase(["carbon fiber and carbon lactase with oxygen", ["carbon", "oxygen"]])

False

In [173]:
# LF_followed_by_noun
# If one of the chemicals is followed by a noun, we label FALSE
@labeling_function()
def followed_by_noun(x):
    sentence = x[0].replace(',', '')
    words = sentence.split(" ")
    tagged = nltk.pos_tag(words)
    for chem in x[1]:
        index = -1
        while (index < len(words)):
            if not(chem in words[index + 1:]):
                index = len(words)
                break
            if (" " in chem):
                chem_split = chem.split(" ")
                index = words.index(chem_split[-1], index + 1)
            else:
                index = words.index(chem, index + 1)
            if (index + 1 < len(words)):
                if (tagged[index + 1][1] == "NN"):
                    return False
    return ABSTAIN

In [174]:
# testing followed_by_noun (ensuring it works with repeats)
followed_by_noun(["cassie eating apple, eating for cassie table", ["cassie", "apple"]])

False

In [175]:
# test cases
print(followed_by_noun(["The active fractions from the gel filtration column were further separated by native PAGE, followed by staining for L-GalDH activity.", ["L-GalDH"]]))
print(followed_by_noun(["Southern blot analysis revealed that the spinach L-GalDH gene occurs in a single copy.", ["L-GalDH"]]))
print(followed_by_noun(["We reported that transcripts of L-GalL dehydrogenase (L-GalLDH), which functions at the last step of the biosynthetic pathway, were expressed in leaves, stems and roots at almost the same level (Yabuta et al. 2000).", ["L-GalL dehydrogenase"]]))
print(followed_by_noun(["The L-GalDH activity was not affected by the addition of 1 mM dehydro-AsA (DHA) or 1 mM L-GalL.", ["L-GalDH", "dehydro-AsA", "L-GalL"]]))

False
False
-1
False


In [176]:
# LF_sep_verb
# If the chemicals are separated by a verb, we label TRUE
@labeling_function()
def sep_verb(x):
    sentence = x[0].replace(',', '')
    words = sentence.split(" ")
    indexes = []
    for chem in x[1]:
        index = -1
        while (index < len(words)):
            if (" " in chem):
                chem_split = chem.split(" ")
                if not(chem_split[-1] in words[index + 1:]):
                    index = len(words)
                    break
                index = words.index(chem_split[-1], index + 1)
            else:
                if not(chem in words[index + 1:]):
                    index = len(words)
                    break
                index = words.index(chem, index + 1)
            indexes.append(index)
    if not indexes:
        return ABSTAIN
    min_index = min(indexes)
    max_index = max(indexes)
    tagged = nltk.pos_tag(words)
    for index in range(min_index + 1, max_index):
        if (tagged[index][1][:2] == "VB"):
            return True
    return ABSTAIN

In [177]:
# testing sep_verb (works when there are several instances)
sep_verb(["cassie cas cassie hi there oxidized cas", ["cassie", "cas"]])
sep_verb(['univ sydney, dept pharm, sydney, nsw 2006, australia abstract two distinct mg2+-atpase activities were isolated from triton x-100-solubilized human erythrocyte membranes using a combination of calmodulin-agarose to remove ca2+-atpase and ion exchange chromatography to separate the mg2+-atpase activities',
 ['ca2+', 'mg2+']])

-1

In [178]:
# test cases
print(sep_verb(["The active fractions from the gel filtration column were further separated by native PAGE, followed by staining for L-GalDH activity.", ["L-GalDH"]]))
print(sep_verb(["Southern blot analysis revealed that the spinach L-GalDH gene occurs in a single copy.", ["L-GalDH"]]))
print(sep_verb(["We reported that transcripts of L-GalL dehydrogenase (L-GalLDH), which functions at the last step of the biosynthetic pathway, were expressed in leaves, stems and roots at almost the same level (Yabuta et al. 2000).", ["L-GalL dehydrogenase"]]))
print(sep_verb(["The L-GalDH activity was not affected by the addition of 1 mM dehydro-AsA (DHA) or 1 mM L-GalL.", ["L-GalDH", "dehydro-AsA", "L-GalL"]]))

-1
-1
-1
True


In [179]:
# LF_sep_adverb
# If the chemicals are separated by a adverb, we label FALSE
@labeling_function()
def sep_adverb(x):
    sentence = x[0].replace(',', '')
    words = sentence.split(" ")
    indexes = []
    for chem in x[1]:
        index = -1
        while (index < len(words)):
            if (" " in chem):
                chem_split = chem.split(" ")
                if not(chem_split[-1] in words[index + 1:]):
                    index = len(words)
                    break
                index = words.index(chem_split[-1], index + 1)
            else:
                if not(chem in words[index + 1:]):
                    index = len(words)
                    break
                index = words.index(chem, index + 1)
            indexes.append(index)
    if not indexes:
        return ABSTAIN
    min_index = min(indexes)
    max_index = max(indexes)
    tagged = nltk.pos_tag(words)
    for index in range(min_index + 1, max_index):
        if (tagged[index][1][:2] == "RB"):
            return False
    return ABSTAIN

In [180]:
# testing sep_adverb (for repeated chemicals)
sep_adverb(["cassie happily danced to the moon with cas", ["cassie", "cas"]])

False

In [181]:
# test cases
print(sep_adverb(["The active fractions from the gel filtration column were further separated by native PAGE, followed by staining for L-GalDH activity.", ["L-GalDH"]]))
print(sep_adverb(["Southern blot analysis revealed that the spinach L-GalDH gene occurs in a single copy.", ["L-GalDH"]]))
print(sep_adverb(["We reported that transcripts of L-GalL dehydrogenase (L-GalLDH), which functions at the last step of the biosynthetic pathway, were expressed in leaves, stems and roots at almost the same level (Yabuta et al. 2000).", ["L-GalL dehydrogenase"]]))
print(sep_adverb(["The L-GalDH activity was not affected by the addition of 1 mM dehydro-AsA (DHA) or 1 mM L-GalL.", ["L-GalDH", "dehydro-AsA", "L-GalL"]]))

-1
-1
-1
False


In [182]:
# LF_includes_oxidation_words
# If the sentence contains oxidation words, we label True
oxidation_words = ["oxidize", "reduce", "oxidization", "redox" "oxidizing", "oxidizes", "oxidize", "reduction",
    "reduces", "oxidise", "oxidising", "oxide", "rust", "corrode"]
@labeling_function()
def includes_oxidation_words(x):
    for word in oxidation_words:
        if(word in x[0]):
            return True
    return ABSTAIN

In [183]:
# LF_includes_combustion_words
# If the sentence contains combustion words, we label True
combustion_words = ["combusts", "combustion", "combust", "burn" "explode", "gas-forming", "oxidize", "reduction",
    "reduces", "oxidise", "oxidising", "oxide"]
@labeling_function()
def includes_combustion_words(x):
    for word in combustion_words:
        if(word in x[0]):
            return True
    return ABSTAIN

In [184]:
# LF_includes_neutralization_words
# If the sentence contains neutralization words, we label True
neutralization_words = ["neutralize", "neutralization", "titrate", "titration", "buffer"]
@labeling_function()
def includes_neutralization_words(x):
    for word in neutralization_words:
        if(word in x[0]):
            return True
    return ABSTAIN

In [185]:
# LF_includes_catalyze_words
# If the sentence contains catalyze words, we label True
catalyze_words = ["catalyze", "catalyst", "enzyme"]
@labeling_function()
def includes_catalyze_words(x):
    for word in catalyze_words:
        if(word in x[0]):
            return True
    return ABSTAIN

In [186]:
# LF_includes_combination_words
# If the sentence contains combination words, we label True
combination_words = ["combine", "combines", "combination"]
@labeling_function()
def includes_combination_words(x):
    for word in combination_words:
        if(word in x[0]):
            return True
    return ABSTAIN

In [187]:
# LF_includes_decomposition_words
# If the sentence contains decomposition words, we label True
decomposition_words = ["decompose", "decomposition"]
@labeling_function()
def includes_decomposition_words(x):
    for word in decomposition_words:
        if(word in x[0]):
            return True
    return ABSTAIN

In [188]:
# LF_includes_replace_words
# If the sentence contains decomposition words, we label True
replace_words = ["replaces", "replace", "replacement"]
@labeling_function()
def includes_replace_words(x):
    for word in replace_words:
        if(word in x[0]):
            return True
    return ABSTAIN

In [189]:
# LF_includes_reaction_words
# If the sentence contains reactions words, we label True
reaction_words = ["convert", "yeild", "produce", "make", "react", "create", "synthesize", "conversion",
    "transformation", "transform", "synthesise", "ferment", "agent",
    "methylaion", "displacement", "combination", "exchange",
    "precipitate", "precipitation", "transfer", "through", "by", "produce", "activates"]
@labeling_function()
def includes_reaction_words(x):
    for word in reaction_words:
        if(word in x[0]):
            return True
    return ABSTAIN

In [190]:
# LF_includes_reaction_component_words
# If the sentence contains reaction compontents words, we label True
reaction_component_words = ["substrate", "product", "reactant"]
@labeling_function()
def includes_reaction_component_words(x):
    for word in reaction_component_words:
        if(word in x[0]):
            return True
    return ABSTAIN

In [191]:
# LF_comparison
# If the sentence contains reaction compontents words, we label True
comparison_words = ["similar", "more", "greater", "less", "increase", "decrease"]
@labeling_function()
def comparison(x):
    for word in comparison_words:
        if(word in x[0]):
            return False
    return ABSTAIN

In [192]:
# LF_sep_conversion_words
# If the sentence contains to, from, into, etc., we label True
conversion_words = ["to", "from", "into", "becomes", "became"]
@labeling_function()
def sep_converstion_words(x):
    sentence = x[0].replace(',', '')
    words = sentence.split(" ")
    indexes = []
    for chem in x[1]:
        index = -1
        while (index < len(words)):
            if (" " in chem):
                chem_split = chem.split(" ")
                if not(chem_split[-1] in words[index + 1:]):
                    index = len(words)
                    break
                index = words.index(chem_split[-1], index + 1)
            else:
                if not(chem in words[index + 1:]):
                    index = len(words)
                    break
                index = words.index(chem, index + 1)
            indexes.append(index)
    if not indexes:
        return ABSTAIN
    min_index = min(indexes)
    max_index = max(indexes)
    for index in range(min_index + 1, max_index):
        if (words[index] in conversion_words):
            return True
    return ABSTAIN

In [193]:
# test sep_conversion_words
sep_converstion_words(["cassie truns into cas", ["cassie", "cas"]])

True

### my todo + notes

In [34]:
# things to fix
    # create specific reation verb labeling (complete)
        # oxidation
        # reduction
        # combustion
        # composition
        # decomposition
        # etc
    # creating more test cases
    # current test sentences from
        # jacob's csv
        # https://academic.oup.com/pcp/article/45/9/1271/1857717
    # ways to clean data
        # ensure there are at least two chemicals (complete)
        # get rid of the greek alphabet (complete)
        # get rid of periods and () and ; (complete)
        # get rid of repeat chems (complete)
    # need to figure out how to deal with things in the sentence twice (complete)

### testing on data written in this file

In [112]:
# changed data format so must change written data format before running
# for data in test_data:
#     print(data)
#     print("general chemicals")
#     print(general_chemical(data[0]))
#     print("adjacent mentions")
#     print(adjacent_mentions(data[0], data[1]))
#     print("sep or")
#     print(sep_or(data[0], data[1]))
#     print("sep and")
#     print(sep_and(data[0], data[1]))
#     print("sep comma")
#     print(sep_comma(data[0], data[1]))
#     print("sep via")
#     print(sep_via(data[0], data[1]))
#     print("sep sym")
#     print(sep_sym(data[0], data[1]))
#     print("followed ase")
#     print(followed_ase(data[0], data[1]))
#     print("group")
#     print(group(data[1]))
#     print("followed by noun")
#     print(followed_by_noun(data[0], data[1]))
#     print("sep verb")
#     print(sep_verb(data[0], data[1]))
#     print("sep adverb")
#     print(sep_adverb(data[0], data[1]))
#     print()

### importing data in and cleaning it

In [50]:
sentence_df = pd.read_csv("sentence_data_cleaned_jacob.csv")
sentence_with_chem_df = sentence_df.dropna()
several_chem_df = sentence_with_chem_df.loc[sentence_with_chem_df["chemicals"].str.contains(",")]
several_chem_df

Unnamed: 0,doc,sentence_index,sentence,entity_type,label,chemicals,genes,substrates,products,enzymes,label index dict,abstract_clean,abstract_expand,abstract_ordered
197,12067524,0,BACKGROUND AND AIMS: Glutamic acid decarboxyla...,"null, null, null, null, null, null, null, null...","O, O, O, O, B-enzyme, I-enzyme, O, O, B-enzyme...","glutamate, GABA",GAD,glutamate,"gamma-aminobutyric, acid, GABA","Glutamic, acid, GAD, EC, 4.1.1.15","{4: 'B-enzyme', 5: 'I-enzyme', 8: 'B-enzyme', ...",BACKGROUND AND AIMS: Glutamic acid decarboxyla...,['These included a specific enzyme activity of...,['BACKGROUND AND AIMS: Glutamic acid decarboxy...
252,12513997,3,"Interestingly, the allele of PRO1 was shown to...","null, null, null, null, null, GENE-Y, null, nu...","O, O, O, O, O, O, O, O, O, O, O, O, O, B-enzym...","gamma-glutamyl, gamma-glutamyl, L-proline, L-g...",PRO1,L-glutamate,L-proline,"gamma-glutamyl, kinase, gamma-glutamyl, phosphate","{13: 'B-enzyme', 14: 'I-enzyme', 16: 'B-enzyme...",We previously isolated a mutant which showed a...,['The approach described in this paper could b...,['We previously isolated a mutant which showed...
282,12668769,8,We concluded that FDH has no direct role in th...,"null, null, null, GENE-Y, null, null, null, nu...","O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O...","Ser, formate","FDH, FDH",formate,CO,FDH,"{23: 'B-SUBSTRATE', 25: 'B-PRODUCT-OF', 31: 'B...",Serine (Ser) biosynthesis in C(3) plants can o...,"['In shoots, therefore, the pathway from forma...",['Serine (Ser) biosynthesis in C(3) plants can...
318,12742526,6,Even though the activities of MAT and GNMT wer...,"null, null, null, null, null, GENE-N, null, GE...","O, O, O, O, O, B-enzyme, O, B-enzyme, O, O, O,...","S-adenosylmethionine, S-adenosylhomocysteine","MAT, GNMT",S-adenosylmethionine,S-adenosylhomocysteine,"MAT, GNMT","{5: 'B-enzyme', 7: 'B-enzyme', 15: 'B-SUBSTRAT...",Ames dwarf mice (df/df) are deficient in growt...,"['Taken together, the data suggest that methio...",['Ames dwarf mice (df/df) are deficient in gro...
323,12850267,1,One of the enzymes responsible for the product...,"null, null, null, null, null, null, null, null...","O, O, O, O, O, O, O, O, O, B-PRODUCT-OF, O, B-...","KA, kynurenine, glutamine","KATI, GTK",glutamine,"KA, oxoglutaramic, acid","kynurenine, aminotransferase, KATI, GTK, EC, 2...","{9: 'B-PRODUCT-OF', 11: 'B-enzyme', 12: 'I-enz...",Kynurenic acid (KA) is an endogenous glutamate...,['Kynurenic acid (KA) is an endogenous glutama...,['Kynurenic acid (KA) is an endogenous glutama...
439,15132128,0,PURPOSE: The fluoropyrimidine carbamate (capec...,"null, null, null, null, null, null, CHEMICAL, ...","O, O, O, B-SUBSTRATE, I-SUBSTRATE, O, B-SUBSTR...","capecitabine, 5-fluorouracil, 5-FU, thymidine",TP,"fluoropyrimidine, carbamate, capecitabine","5-fluorouracil, 5-FU","thymidine, phosphorylase, TP","{3: 'B-SUBSTRATE', 4: 'I-SUBSTRATE', 6: 'B-SUB...",PURPOSE: The fluoropyrimidine carbamate (capec...,['Favorable enzyme profiles (high TP and low D...,['PURPOSE: The fluoropyrimidine carbamate (cap...
465,15155769,1,Carnitine acetyltransferases (CrAT) catalyze t...,"CHEMICAL, null, null, GENE-Y, null, null, null...","B-enzyme, I-enzyme, O, B-enzyme, O, O, O, O, O...","Carnitine, acetyl-CoA, carnitine, acetylcarnitine",CrAT,"acetyl-CoA, carnitine",acetylcarnitine,"Carnitine, acetyltransferases, CrAT","{0: 'B-enzyme', 1: 'I-enzyme', 3: 'B-enzyme', ...","In eukaryotes, L-carnitine is involved in ener...","['In eukaryotes, L-carnitine is involved in en...","['In eukaryotes, L-carnitine is involved in en..."
484,15689518,0,"L-serine dehydratase (SDH), a member of the be...","CHEMICAL, null, null, GENE-Y, null, null, null...","B-enzyme, I-enzyme, O, B-enzyme, O, O, O, O, O...","L-serine, L-serine, L-threonine, pyruvate, 2-o...",SDH,"L-serine, L-threonine","pyruvate, 2-oxobutyrate","L-serine, dehydratase, SDH","{0: 'B-enzyme', 1: 'I-enzyme', 3: 'B-enzyme', ...","L-serine dehydratase (SDH), a member of the be...","['Furthermore, the activity of hSDH-PLP was as...","['L-serine dehydratase (SDH), a member of the ..."
545,16455797,0,Spermidine/spermine N1-acetyltransferase (SSAT...,"null, null, null, GENE-Y, null, null, null, nu...","B-enzyme, I-enzyme, O, O, O, O, O, O, O, O, O,...","polyamine, spermidine, spermine",SSAT,"spermidine, spermine",polyamine,"Spermidine/spermine, N1-acetyltransferase","{0: 'B-enzyme', 1: 'I-enzyme', 13: 'B-PRODUCT-...",Spermidine/spermine N1-acetyltransferase (SSAT...,['Spermidine/spermine N1-acetyltransferase (SS...,['Spermidine/spermine N1-acetyltransferase (SS...
561,16484281,1,Astrocytes may play a role in these manifestat...,"null, null, null, null, null, null, null, null...","O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O...","glutamate, glutamine, glutamine",GS,glutamate,glutamine,"glutamine, synthetase, GS","{17: 'B-SUBSTRATE', 22: 'B-PRODUCT-OF', 26: 'B...",Excess activation of glutamatergic neurotransm...,['Packing density of GS and GFAP-immunoreactiv...,['Excess activation of glutamatergic neurotran...


In [51]:
sentence_chem_df = several_chem_df[["sentence", "chemicals"]]

In [52]:
greek_alphabet = {
    u'\u0393': 'Gamma',
    u'\u0394': 'Delta',
    u'\u0398': 'Theta',
    u'\u039B': 'Lamda',
    u'\u039E': 'Xi',
    u'\u03A0': 'Pi',
    u'\u03A3': 'Sigma',
    u'\u03A6': 'Phi',
    u'\u03A7': 'Chi',
    u'\u03A8': 'Psi',
    u'\u03A9': 'Omega',
    u'\u03B1': 'alpha',
    u'\u03B2': 'beta',
    u'\u03B3': 'gamma',
    u'\u03B4': 'delta',
    u'\u03B5': 'epsilon',
    u'\u03B6': 'zeta',
    u'\u03B7': 'eta',
    u'\u03B8': 'theta',
    u'\u03B9': 'iota',
    u'\u03BA': 'kappa',
    u'\u03BB': 'lamda',
    u'\u03BC': 'mu',
    u'\u03BD': 'nu',
    u'\u03BE': 'xi',
    u'\u03BF': 'omicron',
    u'\u03C0': 'pi',
    u'\u03C1': 'rho',
    u'\u03C3': 'sigma',
    u'\u03C4': 'tau',
    u'\u03C5': 'upsilon',
    u'\u03C6': 'phi',
    u'\u03C7': 'chi',
    u'\u03C8': 'psi',
    u'\u03C9': 'omega'
}

In [53]:
'β' in greek_alphabet.keys()

True

In [54]:
def remove_greek(words):
    for letter in greek_alphabet.keys():
        words = words.replace(letter, greek_alphabet[letter])
    return words


In [55]:
# greek test
remove_greek("11β-hydroxysteroid")

'11beta-hydroxysteroid'

In [56]:
def chem_into_array(chemicals):
    chemicals = remove_greek(chemicals)
    chemicals = chemicals.lower()
    sample_chems = chemicals.replace("%20", " ")
    sample_chems_list = sample_chems.split(", ")
    for index in range(len(sample_chems_list)):
        while(sample_chems_list[index][-1] == ","):
            sample_chems_list[index] = sample_chems_list[index][:-1]
    sample_chems_list = list(set(sample_chems_list))
    return sample_chems_list

In [57]:
# removing excess commas from the end
chem_into_array("cassie,, cassie, cas")

['cassie', 'cas']

In [58]:
def sentence_cleaned(sentence):
    sentence = remove_greek(sentence)
    sentence = sentence.lower()
    sentence = sentence.replace(".", "")
    sentence = sentence.replace(")", "")
    sentence = sentence.replace("(", "")
    sentence = sentence.replace(";", "")
    sentence = sentence.replace(":", "")
    return sentence

In [59]:
sentence_chem_df["sentence"] = sentence_chem_df["sentence"].apply(sentence_cleaned)
sentence_chem_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentence_chem_df["sentence"] = sentence_chem_df["sentence"].apply(sentence_cleaned)


Unnamed: 0,sentence,chemicals
197,background and aims glutamic acid decarboxylas...,"glutamate, GABA"
252,"interestingly, the allele of pro1 was shown to...","gamma-glutamyl, gamma-glutamyl, L-proline, L-g..."
282,we concluded that fdh has no direct role in th...,"Ser, formate"
318,even though the activities of mat and gnmt wer...,"S-adenosylmethionine, S-adenosylhomocysteine"
323,one of the enzymes responsible for the product...,"KA, kynurenine, glutamine"
439,purpose the fluoropyrimidine carbamate capecit...,"capecitabine, 5-fluorouracil, 5-FU, thymidine"
465,carnitine acetyltransferases crat catalyze the...,"Carnitine, acetyl-CoA, carnitine, acetylcarnitine"
484,"l-serine dehydratase sdh, a member of the beta...","L-serine, L-serine, L-threonine, pyruvate, 2-o..."
545,spermidine/spermine n1-acetyltransferase ssat ...,"polyamine, spermidine, spermine"
561,astrocytes may play a role in these manifestat...,"glutamate, glutamine, glutamine"


In [60]:
sentence_chem_df["chemicals"] = sentence_chem_df["chemicals"].apply(chem_into_array)
sentence_chem_df = sentence_chem_df.loc[sentence_chem_df["chemicals"].str.len() > 1]
sentence_chem_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentence_chem_df["chemicals"] = sentence_chem_df["chemicals"].apply(chem_into_array)


Unnamed: 0,sentence,chemicals
197,background and aims glutamic acid decarboxylas...,"[glutamate, gaba]"
252,"interestingly, the allele of pro1 was shown to...","[gamma-glutamyl, l-glutamate, l-proline]"
282,we concluded that fdh has no direct role in th...,"[ser, formate]"
318,even though the activities of mat and gnmt wer...,"[s-adenosylhomocysteine, s-adenosylmethionine]"
323,one of the enzymes responsible for the product...,"[kynurenine, ka, glutamine]"
439,purpose the fluoropyrimidine carbamate capecit...,"[thymidine, 5-fu, capecitabine, 5-fluorouracil]"
465,carnitine acetyltransferases crat catalyze the...,"[acetylcarnitine, acetyl-coa, carnitine]"
484,"l-serine dehydratase sdh, a member of the beta...","[l-threonine, pyruvate, 2-oxobutyrate, l-serine]"
545,spermidine/spermine n1-acetyltransferase ssat ...,"[spermidine, spermine, polyamine]"
561,astrocytes may play a role in these manifestat...,"[glutamate, glutamine]"


In [61]:
sentence_chem_df["text"] = sentence_chem_df[["sentence", "chemicals"]].values.tolist()
sentence_chem_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentence_chem_df["text"] = sentence_chem_df[["sentence", "chemicals"]].values.tolist()


Unnamed: 0,sentence,chemicals,text
197,background and aims glutamic acid decarboxylas...,"[glutamate, gaba]",[background and aims glutamic acid decarboxyla...
252,"interestingly, the allele of pro1 was shown to...","[gamma-glutamyl, l-glutamate, l-proline]","[interestingly, the allele of pro1 was shown t..."
282,we concluded that fdh has no direct role in th...,"[ser, formate]",[we concluded that fdh has no direct role in t...
318,even though the activities of mat and gnmt wer...,"[s-adenosylhomocysteine, s-adenosylmethionine]",[even though the activities of mat and gnmt we...
323,one of the enzymes responsible for the product...,"[kynurenine, ka, glutamine]",[one of the enzymes responsible for the produc...
439,purpose the fluoropyrimidine carbamate capecit...,"[thymidine, 5-fu, capecitabine, 5-fluorouracil]",[purpose the fluoropyrimidine carbamate capeci...
465,carnitine acetyltransferases crat catalyze the...,"[acetylcarnitine, acetyl-coa, carnitine]",[carnitine acetyltransferases crat catalyze th...
484,"l-serine dehydratase sdh, a member of the beta...","[l-threonine, pyruvate, 2-oxobutyrate, l-serine]","[l-serine dehydratase sdh, a member of the bet..."
545,spermidine/spermine n1-acetyltransferase ssat ...,"[spermidine, spermine, polyamine]",[spermidine/spermine n1-acetyltransferase ssat...
561,astrocytes may play a role in these manifestat...,"[glutamate, glutamine]",[astrocytes may play a role in these manifesta...


### running test on the imported data

In [62]:
for index in range(len(sentence_chem_df["sentence"])):
    arguments = sentence_chem_df["text"].iloc[index]
    print(arguments)
    print("general chemicals: " + str(general_chemical(arguments)))
    print("adjacent mentions: " + str(adjacent_mentions(arguments)))
    print("sep or: " + str(sep_or(arguments)))
    print("sep and: " + str(sep_and(arguments)))
    print("sep comma: " + str(sep_comma(arguments)))
    print("sep via: " + str(sep_via(arguments)))
    print("sep sym: " + str(sep_sym(arguments)))
    print("followed ase: " + str(followed_ase(arguments)))
    print("group: " + str(group(arguments)))
    print("followed by noun: " + str(followed_by_noun(arguments)))
    print("sep verb: " + str(sep_verb(arguments)))
    print("sep adverb: " + str(sep_adverb(arguments)))
    print("includes reaction words: " + str(includes_reaction_words(arguments)))
    print("sep conversion words: " + str(sep_converstion_words(arguments)))
    print()

['background and aims glutamic acid decarboxylase gad, ec 41115 catalyses the conversion of glutamate to gamma-aminobutyric acid gaba', ['glutamate', 'gaba']]
general chemicals: -1
adjacent mentions: -1
sep or: -1
sep and: -1
sep comma: -1
sep via: -1
sep sym: -1
followed ase: -1
group: -1
followed by noun: -1
sep verb: -1
sep adverb: -1
includes reaction words: True
sep conversion words: True

['interestingly, the allele of pro1 was shown to enhance the activities of gamma-glutamyl kinase and gamma-glutamyl phosphate reductase, both of which catalyze the first two steps of l-proline synthesis from l-glutamate and which together may form a complex in vivo', ['gamma-glutamyl', 'l-glutamate', 'l-proline']]
general chemicals: -1
adjacent mentions: -1
sep or: -1
sep and: -1
sep comma: -1
sep via: -1
sep sym: -1
followed ase: False
group: False
followed by noun: False
sep verb: True
sep adverb: -1
includes reaction words: -1
sep conversion words: True

['we concluded that fdh has no direct 

### new csv uncleaned data

In [63]:
uncleaned_df = pd.read_csv("../sentence_annotations_elsevier_pmid_split6.csv")
uncleaned_df

Unnamed: 0,lit_id,indices,start,end,sentence,sentence_pos,enzymes,enzyme_locations,chemical_entities_full,chemical_names,chemical_smiles,name_smile_tuples
0,10.1002/jps.20686,0,0,1297,serial JL 313843 291210 291727 291789 291928 3...,"[('serial', 'JJ'), ('JL', 'NN'), ('313843', 'C...",,[],[],,,[]
1,10.1002/jps.20686,1,1298,1324,Published by Elsevier Inc.,"[('Published', 'VBN'), ('by', 'IN'), ('Elsevie...",,[],[],,,[]
2,10.1002/jps.20686,2,1325,1345,All rights reserved.,"[('All', 'DT'), ('rights', 'NNS'), ('reserved'...",,[],[],,,[]
3,10.1002/jps.20686,3,1346,9469,KINETICANALYSESFORSPECIESDIFFERENCESINPGLYCOPR...,[('KINETICANALYSESFORSPECIESDIFFERENCESINPGLYC...,,[],"[{'text': 'Diltiazem', 'start': 1718, 'end': 1...","Diltiazem,, Cyclosporin%20A,, Dexamethasone",COc1ccc(cc1)[C@@H]2Sc3ccccc3N(CCN(C)C)C(=O)[C@...,"[('Diltiazem', 'COc1ccc(cc1)[C@@H]2Sc3ccccc3N(..."
4,10.1002/jps.20686,4,9470,9599,Immunoblot analyses of P-gp expressed in MDR1 ...,"[('Immunoblot', 'NN'), ('analyses', 'NNS'), ('...",,[],"[{'text': 'H241', 'start': 9594, 'end': 9598, ...",H241,C[C@]12C[C@](O)(C[C@@H]1CC[C@@H]3[C@@H]2CC[C@@...,"[('H241', 'C[C@]12C[C@](O)(C[C@@H]1CC[C@@H]3[C..."
...,...,...,...,...,...,...,...,...,...,...,...,...
952542,10.1263/jbb.99.623,154,29565,29683,A novel ATP regeneration system using polyphos...,"[('A', 'DT'), ('novel', 'JJ'), ('ATP', 'NN'), ...","phosphotransferase, kinase","[('phosphotransferase', 9, 9), ('kinase', 12, ...","[{'text': 'ATP', 'start': 29573, 'end': 29576,...",ATP,Nc1ncnc2n(cnc12)C3OC(CO[P](O)(=O)O[P](O)(=O)O[...,"[('ATP', 'Nc1ncnc2n(cnc12)C3OC(CO[P](O)(=O)O[P..."
952543,10.1263/jbb.99.623,155,29684,29723,91 2001 557 563 20 Fujio T. Maruyama A.,"[('91', 'CD'), ('2001', 'CD'), ('557', 'CD'), ...",,[],[],,,[]
952544,10.1263/jbb.99.623,156,29724,29731,Mori H.,"[('Mori', 'NNP'), ('H', 'NNP'), ('.', '.')]",,[],"[{'text': 'H', 'start': 29729, 'end': 29730, '...",H,[H],"[('H', '[H]')]"
952545,10.1263/jbb.99.623,157,29732,29882,Production of useful substances by the couplin...,"[('Production', 'NN'), ('of', 'IN'), ('useful'...",,[],"[{'text': 'ATP', 'start': 29797, 'end': 29800,...","ATP,, ATP",Nc1ncnc2n(cnc12)C3OC(CO[P](O)(=O)O[P](O)(=O)O[...,"[('ATP', 'Nc1ncnc2n(cnc12)C3OC(CO[P](O)(=O)O[P..."


In [64]:
uncleaned_no_na_df = uncleaned_df.dropna()
uncleaned_several_chem_df = uncleaned_no_na_df.loc[uncleaned_no_na_df["chemical_names"].str.contains(",")]
uncleaned_several_chem_df = uncleaned_several_chem_df[["sentence", "chemical_names"]]

In [65]:
uncleaned_several_chem_df["sentence"] = uncleaned_several_chem_df["sentence"].apply(sentence_cleaned)
uncleaned_several_chem_df["chemical_names"] = uncleaned_several_chem_df["chemical_names"].apply(chem_into_array)

In [66]:
uncleaned_several_chem_df["chemical_names"]

150       [calcein-am, cyclosporin a, dexamethasone, dil...
271       [sodium dodecyl sulfate, geranyl pyrophosphate...
280                                                    [h+]
281                                                    [h+]
286                                                    [h+]
                                ...                        
952414                    [vancomycin, d-alanine-d-alanine]
952450                         [tetrahydrofurane, methanol]
952517                                      [sucrose, kato]
952526                     [d-alanyl-d-alanine, vancomycin]
952541                                [adenylate, amp, atp]
Name: chemical_names, Length: 50212, dtype: object

In [67]:
re_check_several_chem_uncleaned_df = uncleaned_several_chem_df.loc[uncleaned_several_chem_df["chemical_names"].str.len() > 1]

In [68]:
re_check_several_chem_uncleaned_df

Unnamed: 0,sentence,chemical_names
150,"we selected diltiazem, cyclosporin a, and dexa...","[calcein-am, cyclosporin a, dexamethasone, dil..."
271,"croteau r washington state univ, inst biol che...","[sodium dodecyl sulfate, geranyl pyrophosphate..."
295,chemicalmodificationschickenliverpyruvatecarbo...,"[lysine, pyruvate, cysteine]"
296,"ash de temple univ, hlth sci ctr, sch med, dep...","[pyruvate, cysteine, lysine, o-phthalaldehyde,..."
298,at a one- to two-fold molar excess over active...,"[pyruvate, oxaloacetate, adp]"
...,...,...
952414,key words d-alanine-d-alanine ligase d-amino a...,"[vancomycin, d-alanine-d-alanine]"
952450,the mobile phase consisted of a linear gradien...,"[tetrahydrofurane, methanol]"
952517,weber v falkenhagen d subpol a novel sucrose-b...,"[sucrose, kato]"
952526,active-site mutants of the vanc2 d-alanyl-d-se...,"[d-alanyl-d-alanine, vancomycin]"


In [69]:
re_check_several_chem_uncleaned_df["text"] = re_check_several_chem_uncleaned_df[["sentence", "chemical_names"]].values.tolist()
re_check_several_chem_uncleaned_df.iloc[47]["text"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  re_check_several_chem_uncleaned_df["text"] = re_check_several_chem_uncleaned_df[["sentence", "chemical_names"]].values.tolist()


['univ sydney, dept pharm, sydney, nsw 2006, australia abstract two distinct mg2+-atpase activities were isolated from triton x-100-solubilized human erythrocyte membranes using a combination of calmodulin-agarose to remove ca2+-atpase and ion exchange chromatography to separate the mg2+-atpase activities',
 ['ca2+', 'mg2+']]

In [70]:
for index in range(500, 520):
    sentence = re_check_several_chem_uncleaned_df["text"].iloc[index]
    print(arguments)
    print("general chemicals: " + str(general_chemical(arguments)))
    print("adjacent mentions: " + str(adjacent_mentions(arguments)))
    print("sep or: " + str(sep_or(arguments)))
    print("sep and: " + str(sep_and(arguments)))
    print("sep comma: " + str(sep_comma(arguments)))
    print("sep via: " + str(sep_via(arguments)))
    print("sep sym: " + str(sep_sym(arguments)))
    print("followed ase: " + str(followed_ase(arguments)))
    print("group: " + str(group(arguments)))
    print("followed by noun: " + str(followed_by_noun(arguments)))
    print("sep verb: " + str(sep_verb(arguments)))
    print("sep adverb: " + str(sep_adverb(arguments)))
    print("includes reaction words: " + str(includes_reaction_words(arguments)))
    print("includes reaction component words: " + str(includes_reaction_component_words(arguments)))
    print("sep conversion words: " + str(sep_converstion_words(arguments)))
    print()

['6-dhsg was metabolised by gsh to form a gsh conjugate gs-6-dhsg in raw 2647 cells, via a potential mechanism involving the catalytic activity of glutathione-s-transferase gst', ['gsh', '6-dhsg', 'gs-6-dhsg']]
general chemicals: -1
adjacent mentions: -1
sep or: -1
sep and: -1
sep comma: -1
sep via: -1
sep sym: -1
followed ase: -1
group: -1
followed by noun: False
sep verb: True
sep adverb: -1
includes reaction words: True
includes reaction component words: -1
sep conversion words: True

['6-dhsg was metabolised by gsh to form a gsh conjugate gs-6-dhsg in raw 2647 cells, via a potential mechanism involving the catalytic activity of glutathione-s-transferase gst', ['gsh', '6-dhsg', 'gs-6-dhsg']]
general chemicals: -1
adjacent mentions: -1
sep or: -1
sep and: -1
sep comma: -1
sep via: -1
sep sym: -1
followed ase: -1
group: -1
followed by noun: False
sep verb: True
sep adverb: -1
includes reaction words: True
includes reaction component words: -1
sep conversion words: True

['6-dhsg was m

### snorkel code for model

In [71]:
from snorkel.labeling import PandasLFApplier

In [197]:
# Define the set of labeling functions (LFs)
lfs = [solution_words, mixture_words, physical_words, gene_words, structural_words, general_chemical, group,
    includes_amino_acids, paper_artifacts, no_terms, adjacent_mentions, sep_or, sep_and, sep_comma, sep_via,
    sep_sym, followed_ase, followed_by_noun, sep_verb, sep_adverb, includes_oxidation_words,
    includes_combustion_words, includes_neutralization_words, includes_catalyze_words, includes_combination_words,
    includes_decomposition_words, includes_replace_words, includes_reaction_words,
    includes_reaction_component_words, comparison, sep_converstion_words]

# Apply the LFs to the unlabeled training data
df_train = re_check_several_chem_uncleaned_df[:100]
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df_train)

100%|██████████| 100/100 [00:01<00:00, 61.73it/s]


In [198]:
L_train

array([[-1, -1,  0, ...,  1, -1,  1],
       [-1, -1,  0, ..., -1, -1,  1],
       [-1, -1, -1, ..., -1, -1, -1],
       ...,
       [-1, -1,  0, ..., -1, -1,  1],
       [-1, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1]])

In [199]:
from snorkel.labeling import LFAnalysis

LFAnalysis(L=L_train, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
solution_words,0,[0],0.01,0.01,0.01
mixture_words,1,[0],0.01,0.01,0.01
physical_words,2,[0],0.27,0.27,0.24
gene_words,3,[0],0.26,0.26,0.22
structural_words,4,[0],0.37,0.36,0.3
general_chemical,5,[0],0.56,0.56,0.49
group,6,[0],0.21,0.21,0.19
includes_amino_acids,7,[0],0.11,0.11,0.11
paper_artifacts,8,[0],0.25,0.25,0.21
no_terms,9,[0],0.21,0.21,0.21


In [76]:
from snorkel.labeling.model import MajorityLabelVoter

In [200]:
from snorkel.labeling.model import LabelModel

In [201]:
# Train the label model and compute the training labels
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
df_train["label"] = label_model.predict(L=L_train, tie_break_policy="abstain")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["label"] = label_model.predict(L=L_train, tie_break_policy="abstain")


In [202]:
df_train["label"].value_counts()

0    79
1    21
Name: label, dtype: int64

In [204]:
df_train.to_csv(r'../labeled.csv')