# Detecting spouse mentions in sentences

In [1]:
# Snorkel Introduction

from collections import OrderedDict 
from glob import glob
import os
import sys

import cupy
# import dask.dataframe as dd
import numpy as np
import pandas as pd
import pyarrow
import random
import snorkel
import spacy
import tensorflow as tf

# Add parent directory to path (guarded so that re-running this cell does not
# append the same entry again)
parent_dir = os.path.dirname(os.getcwd())
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Make reproducible: seed both the stdlib and the NumPy global generators
random.seed(1337)
np.random.seed(1337)

# Turn off TensorFlow logging messages
# NOTE(review): tensorflow is imported earlier in this cell, before this
# variable is set -- confirm the setting is still honoured, otherwise move it
# above the import.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

# For reproducibility
# NOTE(review): the interpreter reads PYTHONHASHSEED at start-up, so this
# assignment presumably only reaches processes launched afterwards -- export
# it before starting the kernel if the kernel's own hashing must be fixed.
os.environ["PYTHONHASHSEED"] = "1337"

In [2]:
# Value substituted into the `Questions.Tags.{}.parquet` path templates below
TAG_LIMIT = 50

In [3]:
# Path templates for the questions dataset, keyed by storage variant; the `{}`
# placeholder is filled with TAG_LIMIT when `path` is built below.
PATHS = {
    'questions': {
        'local': '../../data/stackoverflow/Questions.Tags.{}.parquet/part-00029-1ad544ea-abd4-4960-aa2c-7e0eb12cdb8e-c000.snappy.parquet',
        'local_single': '../../data/stackoverflow/Questions.Tags.{}.parquet',
        's3': 's3://stackoverflow-events/08-05-2019/Questions.Tags.{}.parquet',
    }
}

# Define a set of paths for each step for local and S3
# (the other keys available in PATHS['questions'] are listed in the trailing comment)
PATH_SET = 'local_single' # 'local', 's3'

# Concrete path for the selected variant and tag limit
path = PATHS['questions'][PATH_SET].format(TAG_LIMIT)

In [4]:
# Pandas: read the questions parquet selected above and preview three rows
df = pd.read_parquet(path, engine='pyarrow')
df.head(3)

Unnamed: 0,_PostId,_AcceptedAnswerId,_Body,_Code,_Tags,_Label,_AnswerCount,_CommentCount,_FavoriteCount,_OwnerUserId,...,_AccountId,_UserId,_UserDisplayName,_UserDownVotes,_UserLocation,_ProfileImageUrl,_UserReputation,_UserUpVotes,_UserViews,_UserWebsiteUrl
0,264,,BerkeleyDB Concurrency \nWhat's the optimal le...,,"[c++, berkeley-db]",0,5,0,1.0,104,...,86,104,Ted Dziuba,4,California,,1600,9,2325,http://www.teddziuba.com/
1,1289124,1289185.0,Python equivalent of Jstack? Is there a python...,,[python],0,1,1,,104,...,86,104,Ted Dziuba,4,California,,1600,9,2325,http://www.teddziuba.com/
2,1545263,1545599.0,"UTF-8 In Python logging, how? I'm trying to lo...",import logging\n\ndef logging_test():\n han...,"[python, logging, unicode]",0,4,3,10.0,104,...,86,104,Ted Dziuba,4,California,,1600,9,2325,http://www.teddziuba.com/


In [None]:
# PySpark initialization and data loading
from pyspark import SparkContext
# The original line named a module that does not exist
# (`pyspark.sql.functioSparkSessionF`); the conventional functions alias is
# what appears to have been intended.
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

# Reuse an existing session if there is one, otherwise build it with dynamic
# allocation and the external shuffle service switched on.
spark = SparkSession \
    .builder \
    .appName("Programming Language Extraction Example") \
    .config('spark.dynamicAllocation.enabled', True) \
    .config('spark.shuffle.service.enabled', True) \
    .getOrCreate()
sc = spark.sparkContext

# NOTE: this rebinds the notebook-level `path` to the 'local' variant of the
# questions path.
path = PATHS['questions']['local'].format(TAG_LIMIT)

question_df = spark.read.parquet(path)
question_df.limit(5).toPandas()

In [29]:
# Enable GPU support
# spacy.prefer_gpu()

# Download the spaCy english model
# NOTE: the download call is issued every time this cell runs.
spacy.cli.download('en_core_web_lg')
nlp = spacy.load("en_core_web_lg", disable=["vectors"])

from spacy.pipeline import merge_entities

# Append the merge_entities component to the pipeline; the matcher output
# further down shows multi-word entities such as "Java 1.4" as single tokens.
nlp.add_pipe(merge_entities)

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [6]:
from spacy import displacy

s = 'The program to do payroll was written in C++ and Perl.'
d = nlp(s)

# Pair every token's text with its `pos_` tag
tups = [(t.text, t.pos_) for t in d]

# Print words/parts-of-speech
print(list(tups))

# Render image diagrams: dependency parse first, then named entities
dep_options = {'compact': True, 'collapse_punct': True, 'distance': 90}
displacy.render(d, style='dep', options=dep_options)
displacy.render(d, style='ent')

[('The', 'DET'), ('program', 'NOUN'), ('to', 'PART'), ('do', 'AUX'), ('payroll', 'NOUN'), ('was', 'AUX'), ('written', 'VERB'), ('in', 'ADP'), ('C++', 'PROPN'), ('and', 'CCONJ'), ('Perl', 'PROPN'), ('.', 'PUNCT')]


In [16]:
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)

# Three-token pattern: VERB, then ADP, then PROPN (e.g. "written in C++")
pattern = [{'POS': 'VERB'}, {'POS': 'ADP'}, {'POS': 'PROPN'}]
matcher.add("VERB_ADP_PROPN", None, pattern)

# Run the pattern over the first 40 question bodies and print each token of
# every matched span with its part of speech and entity type.
for d in df['_Body'][0:40]:
    doc = nlp(d)
    for match_id, start, end in matcher(doc):
        string_id = nlp.vocab.strings[match_id]  # Pattern name for this match
        for w in doc[start:end]:
            print(w.text, w.pos_, w.ent_type_)

come VERB 
from ADP 
Java & C# PROPN ORG
run VERB 
on ADP 
Java 1.4 PROPN PRODUCT
run VERB 
on ADP 
JRE 1.5 PROPN PRODUCT
running VERB 
on ADP 
Java 1.4 PROPN LAW


In [None]:
#
# Pandas - produce the records in the demo for all entities we detect
#
window = 5
candidates = []
for index, row in df.iterrows():
    # Parse once, from the same `_Body` column used for the first parse in the
    # original cell (its extra parses read `row['body']`, which does not match
    # the underscore-prefixed column names shown when `df` was loaded).
    doc = nlp(row['_Body'])

    for ent in doc.ents:
        # Left context: tokens immediately before the entity.  Slices are taken
        # from the untouched `doc`, so offsets stay valid for every entity in
        # the document (the previous in-place `.merge()` calls retokenized the
        # shared docs and shifted the offsets of later entities).
        left_token_start = max(0, ent.start - 1 - window)
        left_token_end = ent.start
        left_span = doc[left_token_start : left_token_end]

        # Right context: tokens after the entity, clamped with len(doc) - 1
        right_token_start = min(ent.end, len(doc) - 1)
        right_token_end = min(ent.end + window, len(doc) - 1)
        right_span = doc[right_token_start : right_token_end]

        rec = {}
        rec['body'] = doc.text
        rec['entity'] = ent
        rec['entity_text'] = ent.text
        rec['entity_start'] = ent.start
        rec['entity_end'] = ent.end
        rec['ent_type'] = ent.label_

        rec['left_tokens_text'] = [x.text for x in left_span]
        # Plain strings ('' for an empty window), matching what process_split
        # stores in these columns
        rec['left_text'] = left_span.text if len(left_span) else ''

        rec['right_tokens_text'] = [x.text for x in right_span]
        rec['right_text'] = right_span.text if len(right_span) else ''

        rec['wikidata_id'] = ent.kb_id

        # Row label of the source question in `df`
        rec['original_index'] = index
        # Placeholder label; every candidate starts at 0
        rec['label'] = 0

        candidates.append(rec)

df_out = pd.DataFrame(candidates)
df_out = df_out.reindex().sort_index()

df_out.head()

In [None]:
#
# PySpark - produce the records in the demo for all entities we detect
#
# `typing` has no `String`, so the original import failed before anything else
# in this cell could run; the function receives Spark rows (it calls
# `.asDict()` on its argument), hence the `Row` annotation.
from pyspark.sql import Row

window = 5


def prepare_docs(row: Row):
    """Convert one Spark row into a plain dict.

    TODO: this is still a stub -- the entity extraction performed by the
    pandas loop has not been ported here yet.

    :param row: a row of `question_df`
    :return: the row's fields as a dict (previously nothing was returned, so
        the mapped RDD would only have contained None)
    """
    d = row.asDict()
    return d


# Lazy transformation: nothing is computed until an action is called on it
entity_df = question_df.rdd.map(prepare_docs)

candidates = []
for index, row in df.iterrows():
    # Parse once, from the same `_Body` column used for the first parse in the
    # original cell (its extra parses read `row['body']`, which does not match
    # the underscore-prefixed column names shown when `df` was loaded).
    doc = nlp(row['_Body'])

    for ent in doc.ents:
        # Left context: tokens immediately before the entity.  Slices are taken
        # from the untouched `doc`, so offsets stay valid for every entity in
        # the document (the previous in-place `.merge()` calls retokenized the
        # shared docs and shifted the offsets of later entities).
        left_token_start = max(0, ent.start - 1 - window)
        left_token_end = ent.start
        left_span = doc[left_token_start : left_token_end]

        # Right context: tokens after the entity, clamped with len(doc) - 1
        right_token_start = min(ent.end, len(doc) - 1)
        right_token_end = min(ent.end + window, len(doc) - 1)
        right_span = doc[right_token_start : right_token_end]

        rec = {}
        rec['body'] = doc.text
        rec['entity'] = ent
        rec['entity_text'] = ent.text
        rec['entity_start'] = ent.start
        rec['entity_end'] = ent.end
        rec['ent_type'] = ent.label_

        rec['left_tokens_text'] = [x.text for x in left_span]
        # Plain strings ('' for an empty window), matching what process_split
        # stores in these columns
        rec['left_text'] = left_span.text if len(left_span) else ''

        rec['right_tokens_text'] = [x.text for x in right_span]
        rec['right_text'] = right_span.text if len(right_span) else ''

        rec['wikidata_id'] = ent.kb_id

        # Row label of the source question in `df`
        rec['original_index'] = index
        # Placeholder label; every candidate starts at 0
        rec['label'] = 0

        candidates.append(rec)

df_out = pd.DataFrame(candidates)
df_out = df_out.reindex().sort_index()

df_out.head()

In [20]:
# df_out.to_parquet(
#     '../../data/text_extractions.one_file.df_out.parquet',
#     engine='pyarrow'
# )

# Load the cached candidate table and reset every label to 0
df_out = pd.read_parquet('../../data/text_extractions.one_file.df_out.parquet', engine='pyarrow')
df_out = df_out.assign(label=0)
df_out.head()

Unnamed: 0,body,entity_text,left_tokens_text,right_tokens_text,entity_start,ent_type,wikidata_id,entity_end,original_index,label
0,BerkeleyDB Concurrency \nWhat's the optimal le...,C++,"[optimal, level, of, concurrency, that, the]","[implementation, of, BerkeleyDB, can, reasonably]",12,LANGUAGE,0,13,0,0
1,Python equivalent of Jstack? Is there a python...,Jstack,"[Python, equivalent, of]","[?, Is, there, a, python]",3,PERSON,0,4,1,0
2,"UTF-8 In Python logging, how? I'm trying to lo...",Python,"[encoded, string, to, a, file, using]","['s, logging, package, ., ]",20,ORG,0,21,2,0
3,"UTF-8 In Python logging, how? I'm trying to lo...",Python,"[\n, At, a, lower, level, ,]","['s, logging, package, is, using]",49,ORG,0,50,2,0
4,"UTF-8 In Python logging, how? I'm trying to lo...",Python,"[which, explodes, ., , Essentially, ,]","[is, doing, this, :, \n\n]",104,ORG,0,105,2,0


In [None]:
# df_out.to_csv('../../data/text_extractions.one_file.df_out.csv')

In [135]:
import ast

df_gold = pd.read_csv('../../data/text_extractions.one_file.df_out.gold.labeled.csv')

# Drop the index column, we have an index set
df_gold = df_gold.drop(columns=['Unnamed: 0'])

# The token-list columns are stored in the CSV as Python list literals;
# parse them back into real lists.
for token_column in ('left_tokens_text', 'right_tokens_text'):
    df_gold[token_column] = df_gold[token_column].apply(ast.literal_eval)

df_gold.tail()

Unnamed: 0,body,entity_text,left_tokens_text,right_tokens_text,entity_start,ent_type,wikidata_id,entity_end,original_index,label
1285,What features are supported by Android's Google accounts authenticator? The API documentation fo...,API,"['s, Google, accounts, authenticator, ?, The]","[documentation, for, the, , method]",12,ORG,0,13,305,0
1286,What features are supported by Android's Google accounts authenticator? The API documentation fo...,Android,"[documentation, for, the, , method, of]","['s, , has, the, following]",19,ORG,0,20,305,0
1287,What features are supported by Android's Google accounts authenticator? The API documentation fo...,Google,"[are, used, to, tell, \n , whether]","[accounts, have, a, particular, service]",61,ORG,0,62,305,0
1288,What features are supported by Android's Google accounts authenticator? The API documentation fo...,Google,"[a, particular, service, (, such, as]","[\n , Calendar, or, Google Talk, )]",70,ORG,0,71,305,0
1289,What features are supported by Android's Google accounts authenticator? The API documentation fo...,Google Talk,"[such, as, Google, \n , Calendar, or]","[), enabled, ., The, feature]",74,ORG,0,75,305,0


In [22]:
# Every candidate positioned after the last gold-labelled row is still unlabelled
first_unlabeled_pos = df_gold.index[-1] + 1
df_in = df_out.iloc[first_unlabeled_pos:]
df_in.head()

Unnamed: 0,body,entity_text,left_tokens_text,right_tokens_text,entity_start,ent_type,wikidata_id,entity_end,original_index,label
1290,What features are supported by Android's Googl...,Google,"[is, for, the, authenticator, used, for]","[accounts, ?, , I, would]",118,ORG,0,119,305,0
1291,Can't find an option to set a view colour usin...,RGB,"[to, set, a, view, colour, using]","[or, hex, in, XCode, 4.2]",11,ORG,0,12,306,0
1292,Can't find an option to set a view colour usin...,XCode,"[colour, using, RGB, or, hex, in]","[4.2, In, XCode, 4.2, ,]",15,ORG,0,16,306,0
1293,Can't find an option to set a view colour usin...,4.2,"[using, RGB, or, hex, in, XCode]","[In, XCode, 4.2, ,, I]",16,CARDINAL,0,17,306,0
1294,Can't find an option to set a view colour usin...,XCode,"[or, hex, in, XCode, 4.2, In]","[4.2, ,, I, want, to]",18,ORG,0,19,306,0


In [136]:
from multiprocessing import Pool

def process_split(df, window=5):
    """Re-parse each candidate row and attach its left/right context text.

    For every row the `body` is parsed with the notebook-level `nlp` pipeline,
    the entity whose token offsets equal `entity_start`/`entity_end` is
    located, and the text of the token windows on either side of it is stored
    in `left_text` and `right_text`.

    :param df: candidate rows with `body`, `entity_start` and `entity_end`
    :param window: number of tokens used when sizing the context windows
    :return: DataFrame holding copies of the input rows, under the same index
        labels, with `left_text` and `right_text` set
    :raises ValueError: if no parsed entity has the recorded offsets
    """
    indexes = []
    out_rows = []
    for index, row in df.iterrows():
        # One parse is enough: the windows are read with `Span.text`, which
        # leaves the doc untouched, instead of merging spans in place on two
        # additional copies of the parse.
        doc = nlp(row['body'])

        out_row = row.copy()

        entity = None
        for ent in doc.ents:
            if ent.start == row['entity_start'] and ent.end == row['entity_end']:
                entity = ent

        if entity is None:
            raise ValueError(
                f"Missing entity! row {index!r}: no entity at tokens "
                f"[{row['entity_start']}, {row['entity_end']})"
            )

        # Comment me out once I do this in the above code
        left_token_start = max(0, entity.start - 1 - window)
        left_span = doc[left_token_start: entity.start]
        out_row['left_text'] = left_span.text if len(left_span) else ''

        # Comment me out once I do this in the above code
        right_token_start = min(entity.end, len(doc) - 1)
        right_token_end = min(entity.end + window, len(doc) - 1)
        right_span = doc[right_token_start: right_token_end]
        out_row['right_text'] = right_span.text if len(right_span) else ''

        out_rows.append(out_row)
        indexes.append(index)

    df_out = pd.DataFrame(out_rows, index=indexes)
    return df_out

def restore_spacy(df, n_cores=12):
    """Run `process_split` over `df` in parallel and stitch the results together.

    :param df: candidate DataFrame accepted by `process_split`
    :param n_cores: number of chunks to split `df` into, and of worker processes
    :return: concatenation of the per-chunk results, in chunk order
    """
    df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)

    try:
        parts = pool.map(process_split, df_split)
    finally:
        # Shut the workers down even when a chunk raises, so a failed run
        # does not leave the pool behind
        pool.close()
        pool.join()

    return pd.concat(parts)

In [137]:
# Add the left_text / right_text context columns to the gold-labelled rows.
# NOTE: this rebinds `df_gold`, so re-running the cell processes the already
# processed frame again.
df_gold = restore_spacy(df_gold)

# Persist the enriched gold set
df_gold.to_csv('../../data/text_extractions.one_file.df_out.gold.labeled.final.csv')

df_gold.head()

Unnamed: 0,body,entity_text,left_tokens_text,right_tokens_text,entity_start,ent_type,wikidata_id,entity_end,original_index,label,left_text,right_text
0,BerkeleyDB Concurrency \nWhat's the optimal level of concurrency that the C++ implementation of ...,C++,"[optimal, level, of, concurrency, that, the]","[implementation, of, BerkeleyDB, can, reasonably]",12,LANGUAGE,0,13,0,1,optimal level of concurrency that the,implementation of BerkeleyDB can reasonably
1,Python equivalent of Jstack? Is there a python equivalent of jstack? I've got a hung process and...,Jstack,"[Python, equivalent, of]","[?, Is, there, a, python]",3,PERSON,0,4,1,0,Python equivalent of,? Is there a python
2,"UTF-8 In Python logging, how? I'm trying to log a UTF-8 encoded string to a file using Python's ...",Python,"[encoded, string, to, a, file, using]","['s, logging, package, ., ]",20,ORG,0,21,2,1,encoded string to a file using,'s logging package.
3,"UTF-8 In Python logging, how? I'm trying to log a UTF-8 encoded string to a file using Python's ...",Python,"[\n, At, a, lower, level, ,]","['s, logging, package, is, using]",49,ORG,0,50,2,1,"\nAt a lower level,",'s logging package is using
4,"UTF-8 In Python logging, how? I'm trying to log a UTF-8 encoded string to a file using Python's ...",Python,"[which, explodes, ., , Essentially, ,]","[is, doing, this, :, \n\n]",104,ORG,0,105,2,1,"which explodes. Essentially,",is doing this:\n\n


In [140]:
# Spot-check one gold row: print its full body, then show its right-hand tokens
print(df_gold.iloc[100]['body'])
df_gold.iloc[100]['right_tokens_text']
#print(df_gold.iloc[100]['right_text'])

#df_gold.iloc[100]

How to enable IBM Websphere Application Management Service? I try to use the IBM Websphere  (and the Ant tasks) to install/update an application EAR on a remote server. You may want to read this question too.
Manual process
I open a jython console with this command line:

After that I want to list all applications:

I think the message here is clear: The application management service is not running.
How to enable the Application Management Service?
I did search for documentation on the horrible, horrible IBM website. I also tried to click through the configuration options on the Websphere admin pages. But I can't find anything remotely close to application management service. I do that clicking again with english language settings now, but I'd appreciate if someone can point me to the configuration option or the documentation.



['language', 'settings', 'now', ',', 'but']

In [33]:
# Add the left_text / right_text context columns to the unlabelled candidates
df_in_fixed = restore_spacy(df_in)
df_in_fixed.head()

Unnamed: 0,body,entity_text,left_tokens_text,right_tokens_text,entity_start,ent_type,wikidata_id,entity_end,original_index,label,left_text,right_text
1290,What features are supported by Android's Googl...,Google,"[is, for, the, authenticator, used, for]","[accounts, ?, , I, would]",118,ORG,0,119,305,0,is for the authenticator used for,accounts? I would
1291,Can't find an option to set a view colour usin...,RGB,"[to, set, a, view, colour, using]","[or, hex, in, XCode, 4.2]",11,ORG,0,12,306,0,to set a view colour using,or hex in XCode 4.2
1292,Can't find an option to set a view colour usin...,XCode,"[colour, using, RGB, or, hex, in]","[4.2, In, XCode, 4.2, ,]",15,ORG,0,16,306,0,colour using RGB or hex in,"4.2 In XCode 4.2,"
1293,Can't find an option to set a view colour usin...,4.2,"[using, RGB, or, hex, in, XCode]","[In, XCode, 4.2, ,, I]",16,CARDINAL,0,17,306,0,using RGB or hex in XCode,"In XCode 4.2, I"
1294,Can't find an option to set a view colour usin...,XCode,"[or, hex, in, XCode, 4.2, In]","[4.2, ,, I, want, to]",18,ORG,0,19,306,0,or hex in XCode 4.2 In,"4.2, I want to"


In [37]:
# Row counts: unlabelled candidates vs. gold-labelled candidates
len(df_in_fixed.index), len(df_gold.index)

(158786, 1290)

In [39]:
from sklearn.model_selection import train_test_split

# 70/30 split of the unlabelled candidates with a fixed random_state.
# NOTE(review): `label` was set to 0 for every row when df_out was reloaded, so
# y_train / y_test look like placeholders rather than real labels -- confirm.
df_train, df_test, y_train, y_test = train_test_split(
    df_in_fixed, 
    df_in_fixed['label'].values, 
    test_size=0.3,
    random_state=1337,
)

# Sanity check on the sizes of the four pieces
len(df_train.index), len(df_test.index), y_train.shape, y_test.shape

(111150, 47636, (111150,), (47636,))

## In this tutorial, we will see how Snorkel can be used for Information Extraction. We will walk through an example text classification task for information extraction, where we use labeling functions involving keywords and distant supervision.

### Classification Task
<img src="imgs/sentence.jpg" width="700px;" onerror="this.onerror=null; this.src='/doks-theme/assets/images/sentence.jpg';" align="center" style="display: block; margin-left: auto; margin-right: auto;">

We want to classify each __candidate__ or pair of people mentioned in a sentence, as being married at some point or not.

In the above example, our candidate represents the possible relation `(Barack Obama, Michelle Obama)`. As readers, we know this mention is true due to external knowledge and the keyword `wedding` occurring later in the sentence.
We begin with some basic setup and data downloading.


In [None]:
# %matplotlib inline

# import os
# import pandas as pd
# import pickle

# if os.path.basename(os.getcwd()) == "snorkel-tutorials":
#     os.chdir("spouse")

In [None]:
# from utils import load_data

# ((tf_dev, ty_dev), tf_train, (tf_test, ty_test)) = load_data()

In [40]:
# import pickle

# test_data = pickle.load(open('data/dev_data.pkl', 'rb'))
# test_data.head()

# test_data

**Input Data:** `df_dev`, `df_train`, and `df_test` are `Pandas DataFrame` objects, where each row represents a particular __candidate__. For our problem, a candidate consists of a sentence, and two people mentioned in the sentence. The DataFrames contain the fields `sentence`, which refers to the sentence of the candidate, `tokens`, the tokenized form of the sentence, and `person1_word_idx` and `person2_word_idx`, which represent `[start, end]` indices in the tokens at which the first and second person's name appear, respectively.

We also have certain **preprocessed fields**, that we discuss a few cells below.

Let's look at a candidate in the development set:

In [None]:
# from preprocessors import get_person_text

# candidate = tf_dev.loc[2]
# person_names = get_person_text(candidate).person_names

# print("Sentence: ", candidate["sentence"])
# print("Person 1: ", person_names[0])
# print("Person 2: ", person_names[1])

### Preprocessing the Data

In a real application, there is a lot of data preparation, parsing, and database loading that needs to be completed before we generate candidates and dive into writing labeling functions. Here we've pre-generated candidates in a pandas DataFrame object per split (train,dev,test).

### Labeling Function Helpers

When writing labeling functions, there are several functions you will use over and over again. In the case of text relation extraction, as with this task, common functions include those for fetching text between mentions of the two people in a candidate, examining word windows around person mentions, and so on. We will wrap these functions as `preprocessors`.

In [None]:
# from snorkel.preprocess import preprocessor


# @preprocessor()
# def get_text_between(cand):
#     """
#     Returns the text between the two person mentions in the sentence for a candidate
#     """
#     start = cand.person1_word_idx[1] + 1
#     end = cand.person2_word_idx[0]
#     cand.text_between = " ".join(cand.tokens[start:end])
#     return cand

### Candidate PreProcessors

For the purposes of the tutorial, we have three fields (`between_tokens`, `person1_right_tokens`, `person2_right_tokens`) preprocessed in the data, which can be used when creating labeling functions. We also provide the following set of `preprocessor`s for this task in `preprocessors.py`, along with the fields these populate.
* `get_person_text(cand)`: `person_names`
* `get_person_lastnames(cand)`: `person_lastnames`
* `get_left_tokens(cand)`: `person1_left_tokens`, `person2_left_tokens`

In [141]:
# Labels for language extraction
# Values the labeling functions below return: a POSITIVE or NEGATIVE vote, or
# ABSTAIN when a function has nothing to say about a candidate.

POSITIVE = 1
NEGATIVE = 0
ABSTAIN = -1

In [142]:
import re
import jsonlines, sys
from snorkel.labeling import labeling_function, LabelingFunction
from snorkel.preprocess import preprocessor
from snorkel.preprocess.nlp import SpacyPreprocessor

# Snorkel preprocessor that reads the `body` field and stores its result in the
# `spacy` field, with memoization switched on.
# NOTE(review): this rebinds the name `spacy`, which the first cell imported as
# the spaCy module; cells calling `spacy.load` / `spacy.cli` will not work if
# re-run after this one.  Renaming needs a coordinated change in every
# `pre=[spacy, ...]` user below.
spacy = SpacyPreprocessor(
    text_field='body',
    doc_field='spacy',
    memoize=True,
    language='en_core_web_lg',
    disable=['vectors']
)

@preprocessor(memoize=True, pre=[spacy])
def restore_entity(x):
    """Attach the spaCy entity matching the candidate's token offsets as `x['entity']`.

    Searches the entities of the parsed document in `x['spacy']` for the one
    whose start/end equal the candidate's `entity_start`/`entity_end`.

    NOTE(review): the `nlp` pipeline used to build the candidates adds
    `merge_entities`, while the preprocessor that produces `x['spacy']` is not
    configured with it here -- confirm the token offsets line up.

    :raises Exception: if no entity with those offsets is found
    """
    entity = None
    for ent in x['spacy'].ents:
        # Compare against the candidate being processed (`x`).  The original
        # read a name `row` that is not defined in this function, so it either
        # failed or silently used a leftover notebook global.
        if  ent.start == x['entity_start'] \
        and ent.end   == x['entity_end']:
            entity = ent

    if entity is None:
        raise Exception('Missing entity!')

    x['entity'] = entity
    return x

# Raw string: `\W` is not a valid escape in a normal string literal, which
# newer Python versions warn about.  The compiled pattern is unchanged.
starts_rx = re.compile(r'^\W')

@labeling_function()
def lf_starts_with_char(x):
    """NEGATIVE if the entity text starts with a non-word character, else ABSTAIN"""
    return NEGATIVE if starts_rx.match(x['entity_text']) else ABSTAIN


# Raw string: `\W` is not a valid escape in a normal string literal, which
# newer Python versions warn about.  The compiled pattern is unchanged.
number_end_rx = re.compile(r'^[a-zA-Z]+[0-9\W]+$')

@labeling_function()
def lf_ends_with_symbol_or_number(x):
    """POSITIVE if the entity text is letters followed only by digits/symbols, else ABSTAIN"""
    return POSITIVE if number_end_rx.match(x['entity_text']) else ABSTAIN

@labeling_function()
def lf_wrong_entity_type(x):
    """NEGATIVE if the candidate's entity type is one of the listed types, else ABSTAIN"""
    rejected_types = (
        'PERSON', 'NORP', 'FAC', 'GPE', 'LOC',
        'LAW', 'DATE', 'TIME', 'PERCENT',
        'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL',
    )
    if x['ent_type'] in rejected_types:
        return NEGATIVE
    return ABSTAIN

@labeling_function()
def lf_token_count_2(x):
    """NEGATIVE if splitting the entity text on single spaces yields more than 2 pieces"""
    pieces = x['entity_text'].split(' ')
    if len(pieces) > 2:
        return NEGATIVE
    return ABSTAIN

@labeling_function()
def lf_token_count_1(x):
    """NEGATIVE if splitting the entity text on single spaces yields more than 1 piece"""
    pieces = x['entity_text'].split(' ')
    if len(pieces) > 1:
        return NEGATIVE
    return ABSTAIN

In [143]:
from spacy.matcher import Matcher
# Matcher for the VERB -> ADP -> PROPN token pattern used by lf_verb_in_noun
# (same construction as the exploratory matcher cell earlier in the notebook)
matcher = Matcher(nlp.vocab)
pattern = [{'POS': 'VERB'}, {'POS': 'ADP'}, {'POS': 'PROPN'}]
matcher.add("VERB_ADP_PROPN", None, pattern)

@labeling_function(pre=[spacy, restore_entity])
def lf_verb_in_noun(x):
    """POSITIVE if the entity directly follows "work/written/wrote in", else ABSTAIN.

    Uses the VERB-ADP-PROPN matcher on the parsed document in `x['spacy']` and
    keeps only matches that begin two tokens before the candidate entity, i.e.
    matches whose final (PROPN) token sits at the entity's start offset.
    """
    sp = x['spacy']

    for match_id, start, end in matcher(sp):
        # The candidate's start offset is stored as `entity_start`; the
        # original looked up `x['start']`, which is not one of the candidate
        # columns.
        if start != x['entity_start'] - 2:
            continue
        if sp[start].text in ['work', 'written', 'wrote'] and sp[start + 1].text == 'in':
            return POSITIVE

    return ABSTAIN

In [144]:
#
# Make keyword LF generation
#
def keyword_lookup(x, keywords, field, label):
    """Perform lowercase matching for keyword LFs.

    :param x: candidate record, indexable by `field`
    :param keywords: iterable of keywords; a keyword matches when it occurs
        anywhere in the field text (case-insensitive substring test)
    :param field: name of the text field to search
    :param label: value to return when any keyword matches
    :return: `label` on a match, otherwise ABSTAIN
    """
    text = x[field]
    # A missing value (e.g. NaN after a CSV round trip) has no text to search;
    # abstain instead of failing on `.lower()`.
    if not isinstance(text, str):
        return ABSTAIN

    # Lowercase the field once instead of once per keyword
    haystack = text.lower()
    if any(word.lower() in haystack for word in keywords):
        return label
    return ABSTAIN

def make_keyword_lf(keywords, field='body', label=ABSTAIN):
    """Build a keyword LabelingFunction.

    The returned LF calls `keyword_lookup` with the given keywords, the field
    to search, and the label to emit on a match; its name embeds the keyword
    list.
    """
    lf_name = f"keyword_{keywords}"
    lf_resources = {'keywords': keywords, 'field': field, 'label': label}
    return LabelingFunction(name=lf_name, f=keyword_lookup, resources=lf_resources)

# Define keyword LFs
# "language" / "written" in the text left of the entity vote POSITIVE;
# "framework" / "package" in the text right of the entity vote NEGATIVE.
language_keyword_lf = make_keyword_lf(['language'], 'left_text', label=POSITIVE)
written_keyword_lf = make_keyword_lf(['written'], 'left_text', label=POSITIVE)
framework_keyword_lf = make_keyword_lf(['framework', 'package'], 'right_text', label=NEGATIVE)

#
# Use regular expressions to negate browsers
#
prefixes = [
    'internet', 'ie', 'firefox', 'google', 'chrome', 'apple',
    'safari', 'webkit', 'gecko', 'opera', 'netscape', 'chromium',
]

# One anchored alternation: matches text that begins with any prefix
browser_rx = re.compile('^(?:' + '|'.join(prefixes) + ')')

@labeling_function()
def lf_not_browser(x):
    """NEGATIVE if the lowercased entity text starts with a browser-related prefix"""
    lowered = x['entity_text'].lower()
    if browser_rx.match(lowered):
        return NEGATIVE
    return ABSTAIN

In [145]:
#
# Label functions using distant supervision from SPARQL/WikiData for programming languages
#
with jsonlines.open('../../data/programming_languages.jsonl', mode='r') as reader:
    # Each record's `name` field is a language name
    languages = [record['name'] for record in reader]

# Lowercased copy for case-insensitive matching
lower_languages = [name.lower() for name in languages]

# The resource is a frozenset so the per-row membership test is a hash lookup
# rather than a scan of the whole list; the votes are unchanged.
@labeling_function(resources=dict(languages=frozenset(languages)))
def lf_matches_wikidata_langs(x, languages):
    """POSITIVE if the entity_text matches any language in list"""
    return POSITIVE if x.entity_text in languages else ABSTAIN

# The resource is a frozenset so the per-row membership test is a hash lookup
# rather than a scan of the whole list; the votes are unchanged.
@labeling_function(resources=dict(lower_languages=frozenset(lower_languages)))
def lf_lower_matches_wikidata_langs(x, lower_languages):
    """POSITIVE if the lowercase entity_text matches any lowercase language in list"""
    return POSITIVE if x.entity_text.lower() in lower_languages else ABSTAIN

# Label functions using distant supervision from SPARQL/WikiData for operating systems
oses, os_parts = [], []
with jsonlines.open('../../data/operating_systems.jsonl', mode='r') as reader:
    # Lowercased operating-system names
    oses = [x['name'].lower() for x in reader]
    # The loop variable must not be called `os`: that would rebind the `os`
    # module imported at the top of the notebook to a string.
    for os_name in oses:
        # Whitespace-separated fragments of each name
        os_parts.extend(os_name.split())

# The resource is a frozenset so the per-row membership test is a hash lookup
# rather than a scan of the whole list; the votes are unchanged.
@labeling_function(resources=dict(oses=frozenset(oses)))
def lf_matches_wikidata_os(x, oses):
    """NEGATIVE if the lowercase entity_text matches any lowercase OS in the list"""
    return NEGATIVE if x.entity_text.lower() in oses else ABSTAIN

# The resource is a frozenset so the per-row membership test is a hash lookup
# rather than a scan of the whole (duplicate-containing) list; the votes are
# unchanged.
@labeling_function(resources=dict(os_parts=frozenset(os_parts)))
def lf_matches_wikidata_os_parts(x, os_parts):
    """NEGATIVE if the lowercase entity_text matches any lowercase OS fragment in the list"""
    return NEGATIVE if x.entity_text.lower() in os_parts else ABSTAIN

In [146]:
from snorkel.labeling import PandasLFApplier

# Labeling functions to apply.  The order matters: a function's position here
# is its column in the label matrices (later cells index columns by number).
lfs = [
    lf_matches_wikidata_langs,
    lf_lower_matches_wikidata_langs,
    lf_matches_wikidata_os,
    lf_matches_wikidata_os_parts,
    lf_not_browser,
    lf_starts_with_char,
    lf_wrong_entity_type,
    lf_token_count_2,
    lf_token_count_1,
    # lf_verb_in_noun,
    language_keyword_lf,
    written_keyword_lf,
    framework_keyword_lf,
    lf_ends_with_symbol_or_number,
]
applier = PandasLFApplier(lfs)

from snorkel.labeling import LFAnalysis

# Label matrix and gold labels for the hand-labelled (dev) rows
L_dev = applier.apply(df_gold)
y_dev = df_gold.label.values

# Per-function summary, scored against the gold labels
LFAnalysis(L_dev, lfs).lf_summary(y_dev)

  from pandas.core.frame import DataFrame




  0%|          | 0/1290 [00:00<?, ?it/s][A[A[A[A



100%|██████████| 1290/1290 [00:00<00:00, 6938.74it/s][A[A[A[A


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
lf_matches_wikidata_langs,0,[1],0.07907,0.07907,0.01938,92,10,0.901961
lf_lower_matches_wikidata_langs,1,[1],0.092248,0.088372,0.027132,103,16,0.865546
lf_matches_wikidata_os,2,[0],0.002326,0.002326,0.000775,3,0,1.0
lf_matches_wikidata_os_parts,3,[0],0.13876,0.105426,0.017829,163,16,0.910615
lf_not_browser,4,[0],0.011628,0.009302,0.0,15,0,1.0
lf_starts_with_char,5,[0],0.009302,0.009302,0.0,12,0,1.0
lf_wrong_entity_type,6,[0],0.417829,0.17907,0.032558,504,35,0.935065
lf_token_count_2,7,[0],0.05814,0.05814,0.00155,75,0,1.0
lf_token_count_1,8,[0],0.154264,0.111628,0.016279,193,6,0.969849
keyword_['language'],9,[1],0.004651,0.003876,0.002326,2,4,0.333333


In [114]:
# Apply the LFs to the training and test splits
L_train = applier.apply(df_train)
L_test = applier.apply(df_test)

  from pandas.core.frame import DataFrame




  0%|          | 0/47636 [00:00<?, ?it/s][A[A[A[A



  0%|          | 215/47636 [00:00<00:22, 2146.98it/s][A[A[A[A



  2%|▏         | 909/47636 [00:00<00:17, 2707.63it/s][A[A[A[A



  3%|▎         | 1592/47636 [00:00<00:13, 3306.21it/s][A[A[A[A



  5%|▍         | 2277/47636 [00:00<00:11, 3913.22it/s][A[A[A[A



  6%|▌         | 2963/47636 [00:00<00:09, 4491.08it/s][A[A[A[A



  8%|▊         | 3643/47636 [00:00<00:08, 5000.44it/s][A[A[A[A



  9%|▉         | 4325/47636 [00:00<00:07, 5433.91it/s][A[A[A[A



 11%|█         | 5017/47636 [00:00<00:07, 5807.18it/s][A[A[A[A



 12%|█▏        | 5699/47636 [00:00<00:06, 6076.85it/s][A[A[A[A



 13%|█▎        | 6381/47636 [00:01<00:06, 6281.62it/s][A[A[A[A



 15%|█▍        | 7080/47636 [00:01<00:06, 6478.37it/s][A[A[A[A



 16%|█▋        | 7770/47636 [00:01<00:06, 6597.54it/s][A[A[A[A



 18%|█▊        | 8447/47636 [00:01<00:05, 6639.08it/s][A[

In [97]:
# Per-function fraction of training rows on which the function did not abstain
(L_train != ABSTAIN).mean(axis=0)

array([0.04717949, 0.06122357, 0.00394062, 0.15505173, 0.02105263,
       0.00808817, 0.45708502, 0.06010796, 0.15558255, 0.00154746,
       0.00195232, 0.00553306, 0.04907782])

In [98]:
from snorkel.analysis import get_label_buckets

# Bucket dev rows by (gold label, vote of the LF in column 1); column 1 is
# lf_lower_matches_wikidata_langs, the second entry of `lfs`.
buckets = get_label_buckets(y_dev, L_dev[:, 1])

# Rows whose gold label is NEGATIVE but which that LF voted POSITIVE
df_gold.iloc[buckets[NEGATIVE, POSITIVE]]

Unnamed: 0,body,entity_text,left_tokens_text,right_tokens_text,entity_start,ent_type,wikidata_id,entity_end,original_index,label,left_text,right_text
138,Do you use Amazons Cloud services for your company? I read a lot about the possibilities of Amaz...,S3,"['possibilities', 'of', 'Amazons Cloud', 'computing', ',', 'like']","['or', 'EC2', 'and', 'I', 'wondered']",21,ORG,0,22,38,0,"possibilities of Amazons Cloud computing, like",or EC2 and I wondered
143,Do you use Amazons Cloud services for your company? I read a lot about the possibilities of Amaz...,S3,"['comparing', 'the', 'overall', 'outage', 'times', 'of']","['or', 'EC2', 'and', 'your', 'own']",158,ORG,0,159,38,0,comparing the overall outage times of,or EC2 and your own
261,"Install Visual Studio 2008 Sp1 on ""D"" Drive I am trying to install VS2008 sp1 to my work machine...",D,"['VS', 'allowed', 'me', 'to', 'install', 'to']","['originally', 'why', 'not', 'the', 'SP']",64,NORP,0,65,63,0,VS allowed me to install to,originally why not the SP
708,"Functional Development On The CLR If this has already been asked and answered, please point me t...",FP,"['I', ""'m"", 'doing', 'a', 'presentation', 'on']","['and', 'concurrency', 'for', 'some', 'other']",107,ORG,0,108,156,0,I'm doing a presentation on,and concurrency for some other
711,"Functional Development On The CLR If this has already been asked and answered, please point me t...",IronScheme,"['From', 'the', 'answers', 'below', ':', '\n']","['\n', 'hsdotNet', '\n', 'Scala', '(']",153,ORG,0,154,156,0,From the answers below:\n,\nhsdotNet\nScala (
725,Android Adverse To Dynamic Languages I believe I read at some point that due to Android running ...,FP,"[':', '\n\n', 'I', 'want', 'to', 'learn']","['\n', 'I', 'do', ""n't"", 'really']",160,ORG,0,161,157,0,: \n\nI want to learn,\nI don't really
752,Multiple Exits From F# Function I could do this easily in C++ (note: I didn't test this for corr...,FP,"['way', 'since', 'I', ""'m"", 'just', 'learning']","['?', '\n', 'Is', 'a', 'failwith']",102,ORG,0,103,167,0,way since I'm just learning,?\nIs a failwith
753,Multiple Exits From F# Function I could do this easily in C++ (note: I didn't test this for corr...,FP,"['ok', ':', '\n\n', 'Is', 'there', 'an']","['way', 'of', 'dealing', 'with', 'this']",131,ORG,0,132,167,0,ok:\n\nIs there an,way of dealing with this
761,How To Change List of Chars To String? In F# I want to transform a list of chars into a string. ...,abc,"['I', ""'m"", 'trying', 'to', 'get', '""']","['""', '.', ' ', 'I', 'realize']",56,ORG,0,57,170,0,"I'm trying to get """,""". I realize"
1029,Can I pass constructor parameters to Unity's Resolve() method? I am using Microsoft's Unity for ...,Unity,"['Can', 'I', 'pass', 'constructor', 'parameters', 'to']","[""'s"", 'Resolve', '(', ')', 'method']",6,ORG,0,7,248,0,Can I pass constructor parameters to,'s Resolve() method


In [99]:
from snorkel.labeling import LabelModel

# Two-class label model fitted on the training label matrix with a fixed seed;
# None is passed for the second positional argument of fit().
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, None, n_epochs=5000, log_freq=500, seed=1337)

label_model





























LabelModel()

In [100]:
from snorkel.analysis import metric_score
from snorkel.utils import probs_to_preds

# Label-model class probabilities for L_dev, then the hard predictions derived from them.
probs_dev = label_model.predict_proba(L_dev)
preds_dev = probs_to_preds(probs_dev)

# (printed label, metric_score metric name) pairs, reported in this order.
for report_label, metric_name in (
    ("accuracy score", "accuracy"),
    ("precision score", "precision"),
    ("recall score", "recall"),
    ("f1 score", "f1"),
    ("roc-auc", "roc_auc"),
):
    metric_value = metric_score(y_dev, preds_dev, probs=probs_dev, metric=metric_name)
    print(f"Label model {report_label}: {metric_value}")

Label model accuracy score: 0.6348837209302326
Label model precision score: 0.24162257495590828
Label model recall score: 0.7696629213483146
Label model f1 score: 0.36778523489932885
Label model roc-auc: 0.8348026634871877


In [101]:
from snorkel.labeling import filter_unlabeled_dataframe

# Label-model class probabilities for every row of L_train.
probs_train = label_model.predict_proba(L_train)
# filter_unlabeled_dataframe returns a (DataFrame, probabilities) pair; per Snorkel's
# documentation it drops data points that received no label from any LF --
# NOTE(review): confirm against the installed Snorkel version.
df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=df_train, y=probs_train, L=L_train
)

In [102]:
# Preview the first three rows of the filtered training frame.
df_train_filtered.head(3)

Unnamed: 0,body,entity_text,left_tokens_text,right_tokens_text,entity_start,ent_type,wikidata_id,entity_end,original_index,label,left_text,right_text
8962,getting mvc-mini-profiler to show some data to all users I am using the excellent MVC Mini Profi...,MVC Mini Profiler,"[users, I, am, using, the, excellent]","[for, an, internal, project, ,]",18,ORG,0,19,1969,0,users I am using the excellent,"for an internal project,"
26490,C# .NET Garbage Collection not functioning? I am working on a relatively large solution in Visua...,120,"[memory, usage, stayed, at, 45, and]","[MB, respectively, for, 24 hours, (]",165,CARDINAL,0,166,6330,0,memory usage stayed at 45 and,MB respectively for 24 hours (
135145,String Replacement and Matching in Python 2 I have user posts that I would like to match up with...,4,"[thought, of, brute, forcing, it, with]","[for, loops, and, then, doing]",68,CARDINAL,0,69,30441,0,thought of brute forcing it with,for loops and then doing


In [149]:
from typing import Tuple
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.layers import (
    Bidirectional,
    Concatenate,
    Dense,
    Embedding,
    Input,
    LSTM,
)


def get_feature_arrays(
    df: pd.DataFrame, max_length: int = 40
) -> Tuple[np.ndarray, np.ndarray]:
    """Build fixed-width token arrays for the left and right context of each row.

    Args:
        df: Frame with ``left_tokens_text`` and ``right_tokens_text`` columns,
            each cell holding a sequence of token strings.
        max_length: Number of tokens per output row. Longer sequences are
            truncated; shorter ones are right-padded with empty strings.

    Returns:
        ``(left_tokens, right_tokens)``: one array per column, with one row per
        row of ``df`` and ``max_length`` entries per row.
    """

    def pad_or_truncate(tokens):
        kept = np.asarray(tokens[:max_length])
        pad_length = max_length - len(kept)
        if pad_length <= 0:
            # Already full width: return as-is rather than appending an empty
            # (float64) array to a string array, which only widens the dtype.
            return kept
        return np.append(kept, [""] * pad_length)

    left_tokens = np.array([pad_or_truncate(t) for t in df.left_tokens_text])
    right_tokens = np.array([pad_or_truncate(t) for t in df.right_tokens_text])
    return left_tokens, right_tokens


def bilstm(
    tokens: tf.Tensor,
    rnn_state_size: int = 64,
    num_buckets: int = 40000,
    embed_dim: int = 36,
):
    """Encode string tokens with a hashed embedding followed by a bidirectional LSTM.

    Args:
        tokens: String tensor of tokens.
        rnn_state_size: Units of the LSTM wrapped by ``Bidirectional``.
        num_buckets: Number of hash buckets; also the embedding input size.
        embed_dim: Width of each token embedding.

    Returns:
        Output of the ``Bidirectional(LSTM(...))`` layer applied to the embedded tokens.
    """
    # Strings are mapped to integer ids by hashing, so no vocabulary is needed.
    token_ids = tf.strings.to_hash_bucket(tokens, num_buckets)
    embedded_tokens = Embedding(num_buckets, embed_dim)(token_ids)
    recurrent_layer = Bidirectional(LSTM(rnn_state_size, activation=tf.nn.relu))
    # Per-token string length is handed to the recurrent layer as its `mask`
    # (length 0 for the "" padding tokens).
    token_lengths = tf.strings.length(tokens)
    return recurrent_layer(embedded_tokens, mask=token_lengths)


def get_model(
    rnn_state_size: int = 64, num_buckets: int = 40000, embed_dim: int = 12
) -> tf.keras.Model:
    """
    Build and compile the two-input LSTM classifier for label probabilities.

    Each of the two string inputs is encoded by its own ``bilstm`` call; the
    encodings are concatenated and passed through two ReLU dense layers and a
    2-way softmax.

    Args:
        rnn_state_size: LSTM state size.
        num_buckets: Number of buckets to hash strings to integers.
        embed_dim: Size of token embeddings.
    Returns:
        model: A compiled LSTM model.
    """
    encoder_args = (rnn_state_size, num_buckets, embed_dim)

    left_ph = Input((None,), dtype="string")
    right_ph = Input((None,), dtype="string")
    encoded_left = bilstm(left_ph, *encoder_args)
    encoded_right = bilstm(right_ph, *encoder_args)

    hidden = Concatenate(1)([encoded_left, encoded_right])
    for units in (64, 32):
        hidden = Dense(units, activation=tf.nn.relu)(hidden)
    probabilities = Dense(2, activation=tf.nn.softmax)(hidden)

    model = tf.keras.Model(inputs=[left_ph, right_ph], outputs=probabilities)
    optimizer = tf.compat.v1.train.AdagradOptimizer(0.1)
    model.compile(optimizer, "categorical_crossentropy")
    return model


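Next, we train a simple [LSTM](https://en.wikipedia.org/wiki/Long_short-term_memory) network for classifying candidates. The cell above defines the functions for processing features (`get_feature_arrays`) and building the Keras model (`bilstm`, `get_model`) for training and evaluation; they are defined inline here rather than imported from a `tf_model` module.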

In [150]:
from utils import get_n_epochs

# Convert the filtered training frame into the (left, right) token arrays fed to the model.
X_train = get_feature_arrays(df_train_filtered)
model = get_model()
batch_size = 64
# Targets are the label model's probabilities for the filtered rows (soft labels);
# the number of epochs is supplied by utils.get_n_epochs().
model.fit(X_train, probs_train_filtered, batch_size=batch_size, epochs=get_n_epochs())

Train on 72989 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7fb88f098a10>

Finally, we evaluate the trained model on the test and gold sets, reporting accuracy, precision and recall on the gold set and F1 and ROC-AUC on both.

In [None]:
# NOTE(review): the test "ground truth" is the label model's own hard predictions,
# not hand labels, so the "Test" scores below measure agreement between the end
# model and the label model.
Y_test = probs_to_preds(label_model.predict_proba(L_test))

# End-model probabilities and hard predictions for the test frame.
X_test = get_feature_arrays(df_test)
probs_test = model.predict(X_test)
preds_test = probs_to_preds(probs_test)

# The end model is fit on the label model's probabilities, i.e. soft labels.
print(
    f"Test F1 when trained with soft labels     : {metric_score(Y_test, preds=preds_test, metric='f1')}"
)
print(
    f"Test ROC-AUC when trained with soft labels: {metric_score(Y_test, probs=probs_test, metric='roc_auc')}"
)

# End-model probabilities and hard predictions for the gold frame.
X_gold = get_feature_arrays(df_gold)
probs_gold = model.predict(X_gold)
preds_gold = probs_to_preds(probs_gold)

# NOTE(review): y_dev is scored against df_gold predictions; this assumes y_dev holds
# the labels for df_gold in the same row order -- confirm against the split cell.
assert len(y_dev) == len(preds_gold), "y_dev and df_gold predictions differ in length"

# Pass the probabilities (not the hard predictions) as `probs`, as the label-model
# metrics cell does.
for gold_metric in ("accuracy", "precision", "recall"):
    gold_score = metric_score(y_dev, preds_gold, probs=probs_gold, metric=gold_metric)
    print(f"Gold {gold_metric} score: {gold_score}")
print(
    f"Gold F1 when trained with soft labels     : {metric_score(y_dev, preds=preds_gold, metric='f1')}"
)
print(
    f"Gold ROC-AUC when trained with soft labels: {metric_score(y_dev, probs=probs_gold, metric='roc_auc')}"
)

In [152]:
# Debug aid: look at the raw left-context token list of the first df_gold row.
# The feature-array call below is intentionally left disabled.
# get_feature_arrays(df_gold.head(3))
df_gold['left_tokens_text'].iloc[0]

['optimal', 'level', 'of', 'concurrency', 'that', 'the']

In [153]:
# Debug aid: the feature-array call below is intentionally left disabled.
# get_feature_arrays(df_test.head(3))
# A cell renders only its last bare expression, so the token list of the first
# row was silently discarded; display both values explicitly instead.
display(df_test['left_tokens_text'].iloc[0], df_test.head(3))

Unnamed: 0,body,entity_text,left_tokens_text,right_tokens_text,entity_start,ent_type,wikidata_id,entity_end,original_index,label,left_text,right_text
81582,What's the best way to strip literal values out of SQL to correctly identify db workload? Does a...,SQL,"[can, strip, literal, values, out, of]","[statements, ?, \n, The, reason]",33,ORG,0,34,18532,0,can strip literal values out of,statements?\nThe reason
124962,"Specify Date/Time Field Format through SQL in MS Access In MS Access, I am requiring to create a...",MS Access,"[Format, through, SQL, in, MS Access, In]","[,, I, am, requiring, to]",11,ORG,0,12,28236,0,Format through SQL in MS Access In,", I am requiring to"
124808,"Visual C++ 2010: LNK1104, LNK1181 - .obj Files Will Not Auto-Generate I've recently downloaded '...",Microsoft,"[had, problems, with, Microsoft, Silverlight, (]","[,, of, course, !, )]",258,ORG,0,259,28207,0,had problems with Microsoft Silverlight (,", of course!)"


## Summary
In this tutorial, we showed how Snorkel can be used for Information Extraction. We demonstrated how to create LFs that leverage keywords and external knowledge bases (distant supervision). Finally, we showed how a model trained using the probabilistic outputs of the Label Model can achieve comparable performance while generalizing to all data points.