# Detecting spouse mentions in sentences

In [1]:
# Snorkel Introduction

import os
import random
import sys
from collections import OrderedDict
from glob import glob

# Turn off TensorFlow logging messages.
# This has to happen before `import tensorflow`: the variable is read when the
# TensorFlow runtime is loaded, so setting it afterwards has no effect.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

# For reproducibility.
# NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so this assignment only
# affects child processes started from this kernel. To fix hash randomization for the
# notebook itself, export PYTHONHASHSEED=1337 before launching Jupyter.
os.environ["PYTHONHASHSEED"] = "1337"

import cupy
# import dask.dataframe as dd
import numpy as np
import pandas as pd
import pyarrow
import snorkel
import spacy
import tensorflow as tf

# Add parent directory to path (guarded so re-running the cell does not add duplicates)
parent_dir = os.path.dirname(os.getcwd())
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Make reproducible: seed both the stdlib and the NumPy global generators
random.seed(1337)
np.random.seed(1337)

In [2]:
# Value substituted into the `{}` placeholder of the dataset path templates
# (Questions.Tags.<TAG_LIMIT>.parquet) when the input path is resolved.
TAG_LIMIT = 50

In [3]:
# Dataset locations, keyed first by dataset name and then by storage backend.
# The `{}` placeholder in each template is filled in when the path is resolved.
PATHS = dict(
    questions=dict(
        local='../../data/stackoverflow/Questions.Tags.{}.parquet/part-00029-1ad544ea-abd4-4960-aa2c-7e0eb12cdb8e-c000.snappy.parquet',
        s3='s3://stackoverflow-events/08-05-2019/Questions.Tags.{}.parquet',
    ),
)

# Which backend key of PATHS to read from: 'local' or 's3'
PATH_SET = 'local'

In [14]:
# Resolve the questions path for the selected backend and tag limit
path = PATHS['questions'][PATH_SET].format(TAG_LIMIT)

# Read the questions with the pyarrow engine and preview the first rows
df = pd.read_parquet(path, engine='pyarrow')
df.head(3)

Unnamed: 0,_PostId,_AcceptedAnswerId,_Body,_Code,_Tags,_Label,_AnswerCount,_CommentCount,_FavoriteCount,_OwnerUserId,...,_AccountId,_UserId,_UserDisplayName,_UserDownVotes,_UserLocation,_ProfileImageUrl,_UserReputation,_UserUpVotes,_UserViews,_UserWebsiteUrl
0,264,,BerkeleyDB Concurrency \nWhat's the optimal le...,,"[c++, berkeley-db]",0,5,0,1.0,104,...,86,104,Ted Dziuba,4,California,,1600,9,2325,http://www.teddziuba.com/
1,1289124,1289185.0,Python equivalent of Jstack? Is there a python...,,[python],0,1,1,,104,...,86,104,Ted Dziuba,4,California,,1600,9,2325,http://www.teddziuba.com/
2,1545263,1545599.0,"UTF-8 In Python logging, how? I'm trying to lo...",import logging\n\ndef logging_test():\n han...,"[python, logging, unicode]",0,4,3,10.0,104,...,86,104,Ted Dziuba,4,California,,1600,9,2325,http://www.teddziuba.com/


In [5]:
from spacy.pipeline import merge_entities

# Enable GPU support
spacy.prefer_gpu()

# Load the large English model. spacy.load raises OSError when the model package
# is not installed, so the download only runs when it is actually needed
# instead of on every execution of this cell.
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    spacy.cli.download('en_core_web_lg')
    nlp = spacy.load("en_core_web_lg")

# Merge each multi-token named entity into a single token
nlp.add_pipe(merge_entities)

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [7]:
from spacy import displacy

s = 'The program to do payroll was written in C++ and Perl.'
d = nlp(s)

# Pair every token's text with its coarse part-of-speech tag
tups = [(t.text, t.pos_) for t in d]

# Print words/parts-of-speech
print(tups)

# Render image diagrams: the dependency parse first, then the named entities
displacy.render(
    d,
    style='dep',
    options={'compact': True, 'collapse_punct': True, 'distance': 90},
)
displacy.render(d, style='ent')

[('The', 'DET'), ('program', 'NOUN'), ('to', 'PART'), ('do', 'AUX'), ('payroll', 'NOUN'), ('was', 'AUX'), ('written', 'VERB'), ('in', 'ADP'), ('C++', 'PROPN'), ('and', 'CCONJ'), ('Perl', 'PROPN'), ('.', 'PUNCT')]


In [8]:
from spacy.matcher import Matcher

# Token matcher for a VERB followed by an ADP followed by a PROPN
matcher = Matcher(nlp.vocab)
pattern = [{'POS': 'VERB'}, {'POS': 'ADP'}, {'POS': 'PROPN'}]
matcher.add("VERB_ADP_PROPN", None, pattern)

# Print text, part-of-speech and entity type of every token in every match.
# NOTE(review): `df_sample` is not created by any cell shown in this notebook.
for d in df_sample['_SpacyDoc']:
    for match_id, start, end in matcher(d):
        for w in d[start:end]:
            print(w.text, w.pos_, w.ent_type_)

work VERB 
in ADP 
Chrome PROPN PRODUCT
according VERB 
to ADP 
@ezequiel-jadib PROPN ORG
Compiling VERB 
to ADP 
32bit PROPN 
searching VERB 
with ADP 
Google PROPN ORG
knew VERB 
to ADP 
jQuery PROPN ORG
connect VERB 
to ADP 
amazon RDS PROPN ORG
login VERB 
from ADP 
localhost PROPN 
going VERB 
through ADP 
amazon AWS PROPN ORG
check VERB 
on ADP 
Ubuntu 16.04 PROPN LAW
noticed VERB 
by ADP 
Atlassian Bitbucket Team PROPN ORG
listen VERB 
to ADP 
CONNECTIVITY_ACTION PROPN 
populated VERB 
from ADP 
UI PROPN ORG
work VERB 
on ADP 
Eclipse PROPN PRODUCT
Eclipse VERB PRODUCT
through ADP 
VNC PROPN ORG
mentioned VERB 
in ADP 
JavaDoc PROPN 
Turn VERB 
off ADP 
SQLCMD PROPN 
stored VERB 
in ADP 
Windows PROPN 
use VERB 
in ADP 
FaceBook PROPN ORG
Looking VERB 
at ADP 
Async PROPN 
string VERB 
in ADP 
GLib PROPN 
go VERB 
through ADP 
Apache PROPN PRODUCT
created VERB 
by ADP 
jquery PROPN ORG
working VERB 
with ADP 
Boost::Python PROPN 
know VERB 
about ADP 
JNI PROPN 
translated VERB 

Use VERB 
e.g. ADP 
Amazon PROPN ORG
went VERB 
with ADP 
Daniel PROPN PERSON
scroll VERB 
up ADP 
CollapsingToolbar PROPN 
want VERB 
for ADP 
angularjs PROPN PRODUCT
Expired VERB 
in ADP 
NHibernate PROPN 
located VERB 
in ADP 
shm PROPN GPE
show VERB 
on ADP 
Bootstrap PROPN 
working VERB 
on ADP 
Windows 10 PROPN PRODUCT
fopen VERB 
in ADP 
PHP PROPN 
pasted VERB 
in ADP 
MS PROPN 
working VERB 
on ADP 
UIKit PROPN ORG
work VERB 
on ADP 
Swift PROPN ORG
going VERB 
through ADP 
Singleton PROPN 
converting VERB 
to ADP 
VARCHAR2 PROPN 
runs VERB 
in ADP 
Oracle PROPN LAW
login VERB 
via ADP 
ACS PROPN ORG
executed VERB 
BEFORE ADP 
RenderTemplate PROPN 
according VERB 
to ADP 
MSDN PROPN ORG
find VERB 
via ADP 
google PROPN 
written VERB 
in ADP 
java PROPN LOC
pointing VERB 
to ADP 
Line PROPN 
provided VERB 
by ADP 
JDL PROPN ORG
should VERB 
in ADP 
Firefox PROPN ORG
provided VERB 
by ADP 
Debian Wheezy PROPN PERSON
changed VERB 
in ADP 
Perl PROPN ORG
turned VERB 
off ADP 
Lavas

works VERB 
on ADP 
Github PROPN ORG
required VERB 
in ADP 
PHP PROPN ORG
happens VERB 
in ADP 
Chrome PROPN 
sent VERB 
from ADP 
Google PROPN ORG
defined VERB 
in ADP 
python PROPN 
working VERB 
in ADP 
python PROPN 
done VERB 
in ADP 
TortoiseGit PROPN ORG
test VERB 
out ADP 
Arrays vs Dictionaries
 
 PROPN PRODUCT
passed VERB 
to ADP 
MVC Action PROPN ORG
passed VERB 
via ADP 
jQuery Ajax PROPN ORG
experimenting VERB 
with ADP 
Google Tag PROPN ORG
sent VERB 
by ADP 
ga PROPN ORG
going VERB 
to ADP 
Project PROPN 
update VERB 
to ADP 
1.3.0 PROPN CARDINAL
done VERB 
with ADP 
tinymce PROPN ORG
based VERB 
on ADP 
WPFDesigner_XML PROPN 
switch VERB 
to ADP 
Activity PROPN 
built VERB 
in ADP 
Java PROPN ORG
redirect VERB 
on ADP 
OnAuthentication PROPN 
connect VERB 
to ADP 
db PROPN 
built VERB 
in ADP 
Ruby PROPN GPE
found VERB 
in ADP 
Table1 PROPN ORG
Delete VERB 
in ADP 
Ruby PROPN PERSON
based VERB 
on ADP 
column_A PROPN 
written VERB 
in ADP 
PHP PROPN ORG
comes VERB 
with 

clean VERB 
on ADP 
Monday PROPN DATE
Insert VERB 
INTO ADP 
MySQL PROPN 
cancelled VERB 
in ADP 
iOS PROPN ORG
working VERB 
with ADP 
ux PROPN ORG
focussed VERB 
on ADP 
javascript PROPN ORG
According VERB 
to ADP 
JDO PROPN ORG
showing VERB 
in ADP 
Asp PROPN ORG
routing VERB 
in ADP 
Angular 2 PROPN LAW
logging VERB 
in ADP 
python PROPN 
cast VERB 
to ADP 
java.lang PROPN 
cast VERB 
to ADP 
java.lang PROPN 
provided VERB 
by ADP 
developer.android.com PROPN 
shown VERB 
in ADP 
AppStore PROPN ORG
optimised VERB 
for ADP 
iPhone PROPN ORG
running VERB 
against ADP 
nginx PROPN 
talking VERB 
to ADP 
MS PROPN 
speeding VERB 
up ADP 
NetSuite PROPN ORG
created VERB 
in ADP 
Django PROPN GPE
appear VERB 
in ADP 
Gnome Shell PROPN PRODUCT
looked VERB 
at ADP 
Gtk PROPN FAC
starting VERB 
in ADP 
Gtk+3.3 PROPN 
deprecated VERB 
in ADP 
April 2015 PROPN DATE
migrate VERB 
to ADP 
OpenID PROPN 
provided VERB 
by ADP 
OpenID PROPN ORG
needed VERB 
to ADP 
Google PROPN ORG
parse VERB 
with

in ADP 
Android NDK PROPN PERSON
draws VERB 
from ADP 
Doug Glancy's PROPN PERSON
reading VERB 
about OpenId ADP CARDINAL
Connect PROPN 
navigates VERB 
to ADP 
Facebook PROPN ORG
implemented VERB 
by ADP 
Facebook PROPN ORG
Autoboxing VERB 
in ADP 
Java PROPN ORG
used VERB 
in ADP 
JSF PROPN ORG
learning VERB 
in ADP 
python PROPN 
search VERB 
on ADP 
google PROPN ORG
take VERB 
for ADP 
Oracle Portal PROPN ORG
working VERB 
with ADP 
Grails 3 PROPN PRODUCT
experienced VERB 
with ADP 
ASP.NET PROPN ORG
query VERB 
in ADP 
GQL PROPN 
looking VERB 
at ADP 
Apple PROPN ORG
programming VERB 
in ADP 
C++ PROPN PRODUCT
program VERB 
against ADP 
Win32 PROPN PRODUCT
dispatched VERB 
with ADP 
redux PROPN 
Registered VERB 
in ADP 
Gac PROPN ORG
registered VERB 
in ADP 
GAC PROPN ORG
like VERB 
in ADP 
Java PROPN ORG
required VERB 
from ADP 
Signer PROPN 
opened VERB 
in ADP 
PDF PROPN 
entered VERB 
into ADP 
AppA. PROPN 
working VERB 
in ADP 
VB.NET PROPN PRODUCT
works VERB 
with ADP 
Visua

provided VERB 
on ADP 
Umbraco PROPN ORG
Bound VERB 
in ADP 
F PROPN 
searching VERB 
by ADP 
ObjectId PROPN 
based VERB 
on ADP 
Parallax PROPN ORG
written VERB 
by ADP 
Erlang PROPN ORG
running VERB 
on ADP 
OTP 17.3 PROPN PERSON
having VERB 
with ADP 
Hex PROPN ORG
code VERB 
in ADP 
C PROPN 
converted VERB 
to ADP 
QString PROPN ORG
run VERB 
with ADP 
FlashBuilder PROPN ORG
embedded VERB 
in ADP 
a.jar PROPN 
written VERB 
in ADP 
underscore.js PROPN 
worked VERB 
with ADP 
PHP PROPN 
inspired VERB 
by ADP 
Java PROPN ORG
calling VERB 
from ADP 
iPhone PROPN ORG
try VERB 
to ADP 
getInventoryEntriesByISBN PROPN 
occurs VERB 
at ADP 
SearchResultsScreen PROPN FAC
occurs VERB 
at ADP 
InventoryAdapter PROPN ORG
executed VERB 
through ADP 
Java PROPN ORG
executed VERB 
through ADP 
Java PROPN ORG
seen VERB 
on ADP 
JQuery UI's PROPN ORG
work VERB 
in ADP 
Windows PROPN 
move VERB 
to ADP 
ViewController PROPN PRODUCT
connect VERB 
to ADP 
LDAP PROPN ORG
looking VERB 
into ADP 
Paxos 

In [9]:
# Inspect the entities of the second sampled document and count the
# space-separated words in its first entity's text.
# NOTE(review): `df_sample` is not created by any cell shown in this notebook — confirm where it comes from.
d = df_sample['_SpacyDoc'].iloc[1]
print(d.ents)
e = d.ents[0]
len(e.text.split(' '))

(IntelliJ, API, Firefox)


1

In [None]:
# from sklearn.model_selection import train_test_split

# labels = np.zeros((cand_df.shape[0]))

# df_train, df_test, y_train, y_test = train_test_split(
#     new_cand_df, 
#     np.array(labels), 
#     test_size=0.3,
#     random_state=1337,
# )

# df_test, df_dev, y_test, y_dev = train_test_split(
#     df_test_dev,
#     y_test_dev,
#     test_size=0.01,
#     random_state=1337,
# )

# df_train = df_train.sort_index()
# df_test  = df_test.sort_index()
# df_dev   = df_dev.sort_index()

# df_dev = pd.read_csv('../../data/language.extractor.gold.2.csv')

# df_dev['df_index'] = df_dev[df_dev.columns[0]]
# df_dev = df_dev.set_index('df_index')

# df_dev['_SpacyDoc'] = df_dev['body'].apply(lambda x: nlp(x))

# Produce the records in the demo for all entities we detect
window = 5
candidates = []
for index, row in df.iterrows():
    # Parse each question once. The loaded frame exposes the text as `_Body`
    # (there is no `body` column), and the context strings below are read from
    # this same doc, so no extra throw-away parses are needed.
    doc = nlp(row['_Body'])

    for ent in doc.ents:
        rec = {}
        rec['body'] = doc.text
        rec['entity'] = ent
        rec['entity_text'] = ent.text
        rec['entity_start'] = ent.start
        rec['entity_end'] = ent.end
        rec['ent_type'] = ent.label_

        # Left context. The extra `- 1` yields up to window + 1 tokens; kept as-is
        # so new records match the window arithmetic used by restore_spacy.
        left_token_start = max(0, ent.start - 1 - window)
        left_token_end = ent.start
        left_span = doc[left_token_start : left_token_end]
        rec['left_tokens_text'] = [x.text for x in left_span]
        # Span.text is a plain string and leaves the doc untouched. Span.merge()
        # retokenizes the doc in place (shifting the offsets of every later
        # entity of the same question) and returns a Token, not a string.
        rec['left_text'] = left_span.text

        # Right context, clamped to len(doc) - 1 (the final token of the doc is
        # therefore never part of a right window).
        right_token_start = min(ent.end, len(doc) - 1)
        right_token_end = min(ent.end + window, len(doc) - 1)
        right_span = doc[right_token_start : right_token_end]
        rec['right_tokens_text'] = [x.text for x in right_span]
        rec['right_text'] = right_span.text

        rec['wikidata_id'] = ent.kb_id

        # Row label of the source question in `df`
        rec['original_index'] = index
        # Default label; real labels are assigned later
        rec['label'] = 0

        candidates.append(rec)

# A frame built from a list of records already has a sorted 0..n-1 index
df_out = pd.DataFrame(candidates)

df_out.head()

In [21]:
# df_out['entity_start'] = df_out['entity_idx'].apply(lambda x: x[0])
# df_out['entity_end'] = df_out['entity_idx'].apply(lambda x: x[1])
# df_out = df_out.drop(['spacy', 'entity', 'entity_idx'], axis=1)
# df_out['label'] = 0

In [24]:
# Persist the candidate table as Parquet.
# NOTE(review): the candidate-building cell stores spaCy objects in the `entity`
# column — confirm pyarrow can serialize `df_out` as-is, or drop that column first.
df_out.to_parquet(
    '../../data/text_extractions.one_file.df_out.parquet',
    engine='pyarrow'
)

In [23]:
# Export the candidates as CSV as well (presumably the input for hand labeling — confirm).
df_out.to_csv('../../data/text_extractions.one_file.df_out.csv')

In [31]:
# Load the labeled gold CSV and preview its last rows.
df_gold = pd.read_csv('../../data/text_extractions.one_file.df_out.gold.labeled.csv')
df_gold.tail()

Unnamed: 0.1,Unnamed: 0,body,entity_text,left_tokens_text,right_tokens_text,entity_start,ent_type,wikidata_id,entity_end,original_index,label
1285,1285,What features are supported by Android's Googl...,API,"[""'s"", 'Google', 'accounts', 'authenticator', ...","['documentation', 'for', 'the', ' ', 'method']",12,ORG,0,13,305,0
1286,1286,What features are supported by Android's Googl...,Android,"['documentation', 'for', 'the', ' ', 'method',...","[""'s"", ' ', 'has', 'the', 'following']",19,ORG,0,20,305,0
1287,1287,What features are supported by Android's Googl...,Google,"['are', 'used', 'to', 'tell', '\n ', 'whether']","['accounts', 'have', 'a', 'particular', 'servi...",61,ORG,0,62,305,0
1288,1288,What features are supported by Android's Googl...,Google,"['a', 'particular', 'service', '(', 'such', 'as']","['\n ', 'Calendar', 'or', 'Google Talk', ')']",70,ORG,0,71,305,0
1289,1289,What features are supported by Android's Googl...,Google Talk,"['such', 'as', 'Google', '\n ', 'Calendar', '...","[')', 'enabled', '.', 'The', 'feature']",74,ORG,0,75,305,0


In [32]:
# Remaining candidates: the rows of df_out positioned after the last index value of df_gold.
# Assumes df_gold carries the default 0..n-1 index, so its last label doubles as a position — TODO confirm.
df_in = df_out.iloc[df_gold.index[-1] + 1:, :]
df_in.head()

Unnamed: 0,body,entity_text,left_tokens_text,right_tokens_text,entity_start,ent_type,wikidata_id,entity_end,original_index
1290,What features are supported by Android's Googl...,Google,"[is, for, the, authenticator, used, for]","[accounts, ?, , I, would]",118,ORG,0,119,305
1291,Can't find an option to set a view colour usin...,RGB,"[to, set, a, view, colour, using]","[or, hex, in, XCode, 4.2]",11,ORG,0,12,306
1292,Can't find an option to set a view colour usin...,XCode,"[colour, using, RGB, or, hex, in]","[4.2, In, XCode, 4.2, ,]",15,ORG,0,16,306
1293,Can't find an option to set a view colour usin...,4.2,"[using, RGB, or, hex, in, XCode]","[In, XCode, 4.2, ,, I]",16,CARDINAL,0,17,306
1294,Can't find an option to set a view colour usin...,XCode,"[or, hex, in, XCode, 4.2, In]","[4.2, ,, I, want, to]",18,ORG,0,19,306


In [None]:
def restore_spacy(df, window=5):
    """Re-attach spaCy objects and context text to candidate rows.

    Every row's ``body`` is parsed again with the module-level ``nlp`` pipeline and
    the entity whose token offsets equal the row's ``entity_start`` and
    ``entity_end`` is looked up in the fresh parse.

    Args:
        df: DataFrame with at least ``body``, ``entity_start`` and ``entity_end``
            columns.
        window: Number of context tokens used for the left and right text. The
            left window starts at ``entity.start - 1 - window`` and so spans up
            to ``window + 1`` tokens.

    Returns:
        A new DataFrame with the same row labels as ``df`` and four added or
        overwritten columns: ``spacy`` (the parsed doc), ``entity`` (the matching
        entity span), ``left_text`` and ``right_text`` (context as plain strings).

    Raises:
        Exception: If no entity of the re-parsed body has the stored offsets.
    """
    indexes = []
    out_rows = []
    for index, row in df.iterrows():
        # One parse is enough: the context strings are read with Span.text, which
        # does not modify the doc. The earlier Span.merge() calls needed separate
        # docs because they retokenize in place, and they returned a Token whose
        # `.lower` is an integer attribute rather than the str method.
        doc = nlp(row['body'])

        # Entity spans do not overlap, so at most one can have these offsets
        entity = next(
            (
                ent for ent in doc.ents
                if ent.start == row['entity_start'] and ent.end == row['entity_end']
            ),
            None,
        )
        if entity is None:
            raise Exception('Missing entity!')

        out_row = row.copy()
        out_row['spacy'] = doc
        out_row['entity'] = entity

        # Comment me out once I do this in the above code
        left_token_start = max(0, entity.start - 1 - window)
        left_token_end = entity.start
        out_row['left_text'] = doc[left_token_start: left_token_end].text

        # Comment me out once I do this in the above code
        right_token_start = min(entity.end, len(doc) - 1)
        right_token_end = min(entity.end + window, len(doc) - 1)
        out_row['right_text'] = doc[right_token_start: right_token_end].text

        out_rows.append(out_row)
        indexes.append(index)

    return pd.DataFrame(out_rows, index=indexes)

In [67]:
# Re-parse the gold rows so they carry the `spacy`, `entity`, `left_text` and `right_text` columns.
df_gold = restore_spacy(df_gold)
df_gold.head()

Unnamed: 0.1,Unnamed: 0,body,entity_text,left_tokens_text,right_tokens_text,entity_start,ent_type,wikidata_id,entity_end,original_index,label,spacy,entity,left_text,right_text
0,0,BerkeleyDB Concurrency \nWhat's the optimal le...,C++,"['optimal', 'level', 'of', 'concurrency', 'tha...","['implementation', 'of', 'BerkeleyDB', 'can', ...",12,LANGUAGE,0,13,0,1,"(BerkeleyDB, Concurrency, \n, What, 's, the, o...",(C++),optimal level of concurrency that the,implementation of BerkeleyDB can reasonably
1,1,Python equivalent of Jstack? Is there a python...,Jstack,"['Python', 'equivalent', 'of']","['?', 'Is', 'there', 'a', 'python']",3,PERSON,0,4,1,0,"(Python, equivalent, of, Jstack, ?, Is, there,...",(Jstack),Python equivalent of,? Is there a python
2,2,"UTF-8 In Python logging, how? I'm trying to lo...",Python,"['encoded', 'string', 'to', 'a', 'file', 'using']","[""'s"", 'logging', 'package', '.', ' ']",20,ORG,0,21,2,1,"(UTF-8, In, Python, logging, ,, how, ?, I, 'm,...",(Python),encoded string to a file using,'s logging package.
3,3,"UTF-8 In Python logging, how? I'm trying to lo...",Python,"['\n', 'At', 'a', 'lower', 'level', ',']","[""'s"", 'logging', 'package', 'is', 'using']",49,ORG,0,50,2,1,"(UTF-8, In, Python, logging, ,, how, ?, I, 'm,...",(Python),"\nAt a lower level,",'s logging package is using
4,4,"UTF-8 In Python logging, how? I'm trying to lo...",Python,"['which', 'explodes', '.', ' ', 'Essentially',...","['is', 'doing', 'this', ':', '\n\n']",104,ORG,0,105,2,1,"(UTF-8, In, Python, logging, ,, how, ?, I, 'm,...",(Python),"which explodes. Essentially,",is doing this:\n\n


In [70]:
# Spot-check one candidate: its full body, its right-hand token window, then the whole row.
# NOTE(review): `df_test` is only assigned in commented-out code in this notebook — confirm where it is defined.
print(df_test.iloc[100]['body'])
print(df_test.iloc[100]['right_tokens_text'])

df_test.iloc[100]

How to enable IBM Websphere Application Management Service? I try to use the IBM Websphere  (and the Ant tasks) to install/update an application EAR on a remote server. You may want to read this question too.
Manual process
I open a jython console with this command line:

After that I want to list all applications:

I think the message here is clear: The application management service is not running.
How to enable the Application Management Service?
I did search for documentation on the horrible, horrible IBM website. I also tried to click through the configuration options on the Websphere admin pages. But I can't find anything remotely close to application management service. I do that clicking again with english language settings now, but I'd appreciate if someone can point me to the configuration option or the documentation.

['language', 'settings', 'now', ',', 'but']


Unnamed: 0                                                         100
body                 How to enable IBM Websphere Application Manage...
entity_text                                                    english
left_tokens_text      ['I', 'do', 'that', 'clicking', 'again', 'with']
right_tokens_text          ['language', 'settings', 'now', ',', 'but']
entity_start                                                       139
ent_type                                                      LANGUAGE
wikidata_id                                                          0
entity_end                                                         140
original_index                                                      30
label                                                                0
spacy                (How, to, enable, IBM, Websphere, Application,...
entity                                                       (english)
left_text                                I do that clicking again with
right_

In [None]:
# Re-parse the remaining (non-gold) candidates the same way as the gold rows.
df_in_fixed = restore_spacy(df_in)
df_in_fixed.head()

## In this tutorial, we will see how Snorkel can be used for Information Extraction. We will walk through an example text classification task for information extraction, where we use labeling functions involving keywords and distant supervision.

### Classification Task
<img src="imgs/sentence.jpg" width="700px;" onerror="this.onerror=null; this.src='/doks-theme/assets/images/sentence.jpg';" align="center" style="display: block; margin-left: auto; margin-right: auto;">

We want to classify each __candidate__ or pair of people mentioned in a sentence, as being married at some point or not.

In the above example, our candidate represents the possible relation `(Barack Obama, Michelle Obama)`. As readers, we know this mention is true due to external knowledge and the keyword `wedding` occurring later in the sentence.
We begin with some basic setup and data downloading.


In [82]:
# %matplotlib inline

# import os
# import pandas as pd
# import pickle

# if os.path.basename(os.getcwd()) == "snorkel-tutorials":
#     os.chdir("spouse")

In [83]:
# from utils import load_data

# ((tf_dev, ty_dev), tf_train, (tf_test, ty_test)) = load_data()

In [13]:
import pickle

# NOTE: pickle.load can run arbitrary code while deserializing — only load files you trust.
# The `with` block closes the file handle, which the bare open() call never did.
with open('data/dev_data.pkl', 'rb') as dev_data_file:
    test_data = pickle.load(dev_data_file)

# Preview the first rows. Only a cell's last expression is displayed, so the
# earlier mid-cell `.head()` call had no effect.
test_data.head()

Unnamed: 0,person1_word_idx,person2_word_idx,sentence,tokens,person1_right_tokens,person2_right_tokens,between_tokens
0,"(1, 1)","(22, 24)","The Richards are half-sisters to Kathy Hilton,...","[The, Richards, are, half, -, sisters, to, Kat...","[are, half, -, sisters]","[., ]","[are, half, -, sisters, to, Kathy, Hilton, ,, ..."
1,"(1, 1)","(7, 8)","The Richards are half-sisters to Kathy Hilton,...","[The, Richards, are, half, -, sisters, to, Kat...","[are, half, -, sisters]","[,, the, mother, of]","[are, half, -, sisters, to]"
2,"(7, 8)","(22, 24)","The Richards are half-sisters to Kathy Hilton,...","[The, Richards, are, half, -, sisters, to, Kat...","[,, the, mother, of]","[., ]","[,, the, mother, of, socialite, Paris, Hilton,..."
3,"(6, 6)","(20, 21)","Prior to both his guests, Colbert's monologue ...","[Prior, to, both, his, guests, ,, Colbert, s, ...","[s, monologue, -, parts]","[and, his, oft, -]","[s, monologue, -, parts, of, which, he, did, s..."
4,"(2, 2)","(4, 5)",People reported Williams and Ven Veen tied the...,"[People, reported, Williams, and, Ven, Veen, t...","[and, Ven, Veen, tied]","[tied, the, knot, Saturday]",[and]


**Input Data:** `df_dev`, `df_train`, and `df_test` are `Pandas DataFrame` objects, where each row represents a particular __candidate__. For our problem, a candidate consists of a sentence, and two people mentioned in the sentence. The DataFrames contain the fields `sentence`, which refers to the sentence of the candidate, `tokens`, the tokenized form of the sentence, and `person1_word_idx` and `person2_word_idx`, which represent `[start, end]` indices in the tokens at which the first and second person's name appear, respectively.

We also have certain **preprocessed fields**, that we discuss a few cells below.

In [85]:
# Show up to 100 characters of each text field in the display
pd.set_option("display.max_colwidth", 100)

# NOTE(review): `df_dev` is only assigned in commented-out code in this notebook — confirm where it is defined.
df_dev.head(30)

Unnamed: 0,body,spacy,entity,entity_text,left_tokens,left_tokens_text,right_tokens,right_tokens_text,start,ent_type,wikidata_id,end,label
200,Where is the defining JAR of the portlet taglibs in IBM Websphere Portal 7? I'm trying to build ...,"(Where, is, the, defining, JAR, of, the, portlet, taglibs, in, IBM, Websphere Portal, 7, ?, I, '...","(IBM, Websphere Portal)",IBM Websphere Portal,"(JAR, of, the, portlet, taglibs, in)","[JAR, of, the, portlet, taglibs, in]","(7, ?, I, 'm, trying)","[7, ?, I, 'm, trying]",10,ORG,0,12,0
200,Where is the defining JAR of the portlet taglibs in IBM Websphere Portal 7? I'm trying to build ...,"(Where, is, the, defining, JAR, of, the, portlet, taglibs, in, IBM, Websphere Portal, 7, ?, I, '...",(websphere),websphere,"('m, trying, to, build, portlets, for)","['m, trying, to, build, portlets, for]","(in, Eclipse, Juno, ., Everything)","[in, Eclipse, Juno, ., Everything]",21,PRODUCT,0,22,0
200,Where is the defining JAR of the portlet taglibs in IBM Websphere Portal 7? I'm trying to build ...,"(Where, is, the, defining, JAR, of, the, portlet, taglibs, in, IBM, Websphere Portal, 7, ?, I, '...",(Juno),Juno,"(build, portlets, for, websphere, in, Eclipse)","[build, portlets, for, websphere, in, Eclipse]","(., Everything, works, so, far)","[., Everything, works, so, far]",24,PRODUCT,0,25,0
200,Where is the defining JAR of the portlet taglibs in IBM Websphere Portal 7? I'm trying to build ...,"(Where, is, the, defining, JAR, of, the, portlet, taglibs, in, IBM, Websphere Portal, 7, ?, I, '...",(JSP),JSP,"(ok, ., \n, But, in, my)","[ok, ., \n, But, in, my]","(editor, I, get, a, lot)","[editor, I, get, a, lot]",46,ORG,0,47,0
200,Where is the defining JAR of the portlet taglibs in IBM Websphere Portal 7? I'm trying to build ...,"(Where, is, the, defining, JAR, of, the, portlet, taglibs, in, IBM, Websphere Portal, 7, ?, I, '...",(JSP),JSP,"(of, warnings, :, \n\n, In, my)","[of, warnings, :, \n\n, In, my]","(file, I, 'm, using, the)","[file, I, 'm, using, the]",58,ORG,0,59,0
328,Checking File is Open in Delphi Is there a way to check if a file has been opened by ReWrite in ...,"(Checking, File, is, Open, in, Delphi, Is, there, a, way, to, check, if, a, file, has, been, ope...",(Delphi),Delphi,"(Checking, File, is, Open, in)","[Checking, File, is, Open, in]","(Is, there, a, way, to)","[Is, there, a, way, to]",5,ORG,0,6,1
328,Checking File is Open in Delphi Is there a way to check if a file has been opened by ReWrite in ...,"(Checking, File, is, Open, in, Delphi, Is, there, a, way, to, check, if, a, file, has, been, ope...",(ReWrite),ReWrite,"(a, file, has, been, opened, by)","[a, file, has, been, opened, by]","(in, Delphi, ?, \n, Code)","[in, Delphi, ?, \n, Code]",19,ORG,0,20,1
328,Checking File is Open in Delphi Is there a way to check if a file has been opened by ReWrite in ...,"(Checking, File, is, Open, in, Delphi, Is, there, a, way, to, check, if, a, file, has, been, ope...",(Delphi),Delphi,"(has, been, opened, by, ReWrite, in)","[has, been, opened, by, ReWrite, in]","(?, \n, Code, would, go)","[?, \n, Code, would, go]",21,ORG,0,22,1
871,How can I use Git with multiple remote repositories? I use currently use Heroku for rails hostin...,"(How, can, I, use, Git, with, multiple, remote, repositories, ?, I, use, currently, use, Heroku,...",(Heroku),Heroku,"(repositories, ?, I, use, currently, use)","[repositories, ?, I, use, currently, use]","(for, rails, hosting, which, uses)","[for, rails, hosting, which, uses]",14,ORG,0,15,0
871,How can I use Git with multiple remote repositories? I use currently use Heroku for rails hostin...,"(How, can, I, use, Git, with, multiple, remote, repositories, ?, I, use, currently, use, Heroku,...",(1),1,"( , I, would, like, to, have)","[ , I, would, like, to, have]","(local, folder, that, has, my)","[local, folder, that, has, my]",54,CARDINAL,0,55,0


Let's look at a candidate in the development set:

In [86]:
# from preprocessors import get_person_text

# candidate = tf_dev.loc[2]
# person_names = get_person_text(candidate).person_names

# print("Sentence: ", candidate["sentence"])
# print("Person 1: ", person_names[0])
# print("Person 2: ", person_names[1])

### Preprocessing the Data

In a real application, there is a lot of data preparation, parsing, and database loading that needs to be completed before we generate candidates and dive into writing labeling functions. Here we've pre-generated candidates in a pandas DataFrame object per split (train,dev,test).

### Labeling Function Helpers

When writing labeling functions, there are several functions you will use over and over again. In the case of text relation extraction as with this task, common functions include those for fetching text between mentions of the two people in a candidate, examining word windows around person mentions, and so on. We will wrap these functions as `preprocessors`.

In [87]:
# from snorkel.preprocess import preprocessor


# @preprocessor()
# def get_text_between(cand):
#     """
#     Returns the text between the two person mentions in the sentence for a candidate
#     """
#     start = cand.person1_word_idx[1] + 1
#     end = cand.person2_word_idx[0]
#     cand.text_between = " ".join(cand.tokens[start:end])
#     return cand

### Candidate PreProcessors

For the purposes of the tutorial, we have three fields (`between_tokens`, `person1_right_tokens`, `person2_right_tokens`) preprocessed in the data, which can be used when creating labeling functions. We also provide the following set of `preprocessor`s for this task in `preprocessors.py`, along with the fields these populate.
* `get_person_text(cand)`: `person_names`
* `get_person_lastnames(cand)`: `person_lastnames`
* `get_left_tokens(cand)`: `person1_left_tokens`, `person2_left_tokens`

In [88]:
# Labels for language extraction

# Votes a labeling function can cast: positive, negative, or no opinion
POSITIVE, NEGATIVE, ABSTAIN = 1, 0, -1

In [101]:
import re
import jsonlines, sys
from snorkel.labeling import labeling_function, LabelingFunction

# Label functions using distant supervision from SPARQL/WikiData for programming languages
with jsonlines.open('../../data/programming_languages.jsonl', mode='r') as reader:
    # Every record's `name` field, in file order
    languages = [record['name'] for record in reader]

# Lowercased copy of the names for case-insensitive matching
lower_languages = [name.lower() for name in languages]

@labeling_function(resources={'languages': languages})
def lf_matches_wikidata_langs(x, languages):
    """Vote POSITIVE when entity_text is exactly one of the known language names, else ABSTAIN."""
    if x.entity_text in languages:
        return POSITIVE
    return ABSTAIN

@labeling_function(resources={'lower_languages': lower_languages})
def lf_lower_matches_wikidata_langs(x, lower_languages):
    """Vote POSITIVE when the lowercased entity_text is one of the lowercased language names, else ABSTAIN."""
    lowered = x.entity_text.lower()
    if lowered in lower_languages:
        return POSITIVE
    return ABSTAIN

# Label functions using distant supervision from SPARQL/WikiData for operating systems
with jsonlines.open('../../data/operating_systems.jsonl', mode='r') as reader:
    # Lowercased `name` field of every record
    oses = [x['name'].lower() for x in reader]

# Every whitespace-separated word of every OS name (duplicates kept).
# The loop variable must not be called `os`: that would rebind the imported `os`
# module to a string and break every later os.path / os.environ call in the kernel.
os_parts = [os_part for os_name in oses for os_part in os_name.split()]

@labeling_function(resources={'oses': oses})
def lf_matches_wikidata_os(x, oses):
    """Vote NEGATIVE when the lowercased entity_text is one of the lowercased OS names, else ABSTAIN."""
    lowered = x.entity_text.lower()
    if lowered in oses:
        return NEGATIVE
    return ABSTAIN

@labeling_function(resources={'os_parts': os_parts})
def lf_matches_wikidata_os_parts(x, os_parts):
    """Vote NEGATIVE when the lowercased entity_text equals a single word of any OS name, else ABSTAIN."""
    lowered = x.entity_text.lower()
    if lowered in os_parts:
        return NEGATIVE
    return ABSTAIN

@labeling_function()
def lf_left_contains_language(x):
    """Vote POSITIVE when 'language' is found in the left context (`left_tokens_text`), else ABSTAIN."""
    if 'language' in x['left_tokens_text']:
        return POSITIVE
    return ABSTAIN

@labeling_function()
def lf_right_contains_language(x):
    """Vote POSITIVE when 'language' is found in the right context (`right_tokens_text`), else ABSTAIN."""
    if 'language' in x['right_tokens_text']:
        return POSITIVE
    return ABSTAIN

@labeling_function()
def lf_is_framework(x):
    """Vote NEGATIVE when a lowercased context item on either side equals 'framework', else ABSTAIN."""
    # Left side is checked first; the right side is only looked at when the left has no hit
    for side in ('left_tokens_text', 'right_tokens_text'):
        if 'framework' in [y.lower() for y in x[side]]:
            return NEGATIVE
    return ABSTAIN

# Matches text whose first character is a non-word character (not a letter, digit or underscore).
# Written as a raw string: '\W' is not a valid string escape, so the non-raw
# literal triggers an invalid-escape-sequence warning on current Python versions.
starts_rx = re.compile(r'^\W')
          
@labeling_function()
def lf_starts_with_char(x):
    """Vote NEGATIVE when entity_text begins with a non-word character (such as '-'), else ABSTAIN."""
    if starts_rx.match(x['entity_text']):
        return NEGATIVE
    return ABSTAIN

@labeling_function()
def lf_wrong_entity_type(x):
    """Vote NEGATIVE when the candidate's `ent_type` is one of the listed entity types, else ABSTAIN."""
    excluded_types = (
        'PERSON', 'NORP', 'FAC', 'GPE', 'LOC',
        'LAW', 'DATE', 'TIME', 'PERCENT',
        'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL',
    )
    if x['ent_type'] in excluded_types:
        return NEGATIVE
    return ABSTAIN

@labeling_function()
def lf_token_count(x):
    """Vote NEGATIVE when entity_text splits into more than two pieces on single spaces, else ABSTAIN."""
    # Splitting on ' ' yields (number of spaces + 1) pieces, so "more than two
    # pieces" is the same as "at least two spaces".
    if x['entity_text'].count(' ') >= 2:
        return NEGATIVE
    return ABSTAIN

from spacy.matcher import Matcher
# Token matcher for VERB -> ADP -> PROPN sequences, used by lf_verb_in_noun below.
# NOTE(review): this repeats the matcher setup of the earlier exploration cell.
matcher = Matcher(nlp.vocab)
pattern = [{'POS': 'VERB'}, {'POS': 'ADP'}, {'POS': 'PROPN'}]
matcher.add("VERB_ADP_PROPN", None, pattern)

@labeling_function()
def lf_verb_in_noun(x):
    """POSITIVE if the entity directly follows 'work in', 'written in' or 'wrote in'.

    Runs the module-level VERB-ADP-PROPN ``matcher`` over the candidate's parsed
    document (``x['spacy']``) and looks for a match that begins two tokens before
    the entity, whose verb is 'work', 'written' or 'wrote' and whose adposition is
    'in'. Returns ABSTAIN when there is no such match.
    """
    sp = x['spacy']

    # The candidate records built in this notebook store the entity's first token
    # offset as 'entity_start'; older frames call the same value 'start'.
    entity_start = x['entity_start'] if 'entity_start' in x else x['start']

    for match_id, start, end in matcher(sp):
        if (
            start == entity_start - 2
            and sp[start].text in ('work', 'written', 'wrote')
            and sp[start + 1].text == 'in'
        ):
            return POSITIVE

    # No qualifying match. This is a plain fall-through instead of the previous
    # for/else, which behaved the same but read like an if/else.
    return ABSTAIN

# Lowercase prefixes of browser-related names
prefixes = [
    'internet', 'ie', 'firefox', 'google', 'chrome', 'apple',
    'safari', 'webkit', 'gecko', 'opera', 'netscape', 'chromium',
]
# Anchored alternation: matches when the text starts with any of the prefixes
browser_rx = re.compile('^(?:' + '|'.join(prefixes) + ')')

@labeling_function()
def lf_not_browser(x):
    """Vote NEGATIVE when the lowercased entity_text starts with a browser-related prefix, else ABSTAIN."""
    lowered = x['entity_text'].lower()
    if browser_rx.match(lowered) is None:
        return ABSTAIN
    return NEGATIVE

@labeling_function()
def lf_not_operating_system(x):
    """Eliminate OS false positives (not implemented yet).

    Abstains on every row for now. The explicit return matters: a body made of
    only a docstring returns None, which is not one of the label constants the
    other labeling functions emit.
    """
    # TODO: add operating-system name matching, along the lines of lf_not_browser.
    return ABSTAIN

# Make keyword LF generation
def keyword_lookup(x, keywords, field, label):
    """Return `label` if any keyword occurs, ignoring case, as a substring of `x[field]`.

    Falls back to ABSTAIN when none of the keywords is found.
    """
    for keyword in keywords:
        if keyword.lower() in x[field].lower():
            return label
    return ABSTAIN

def make_keyword_lf(keywords, field='body', label=ABSTAIN):
    """Build a keyword labeling function.

    The returned LabelingFunction wraps `keyword_lookup` and is handed
    `keywords`, `field` and `label` as its resources; its name embeds the
    keyword list.
    """
    lf_name = f"keyword_{keywords}"
    lf_resources = {'keywords': keywords, 'field': field, 'label': label}
    return LabelingFunction(name=lf_name, f=keyword_lookup, resources=lf_resources)

# Keyword LFs over the text to the left / right of the entity.
# NOTE(review): the recorded run of the applier cell below stops with
# KeyError 'left_text', while the hand-written LFs read 'left_tokens_text' and
# 'right_tokens_text' -- confirm which columns df_dev / df_train actually carry.
language_keyword_lf = make_keyword_lf(keywords=['language'], field='left_text', label=POSITIVE)
written_keyword_lf = make_keyword_lf(keywords=['written'], field='left_text', label=POSITIVE)
framework_keyword_lf = make_keyword_lf(keywords=['framework', 'package'], field='right_text', label=NEGATIVE)

# Build one POSITIVE keyword LF per entry in `languages`, keyed by that entry and
# matched against the default field of make_keyword_lf. No hyphen splitting is
# done here: each entry is used as a single keyword as-is.
keyword_lfs = OrderedDict()
for language in languages:    
    keyword_lfs[language] = make_keyword_lf([language], label=POSITIVE)

from snorkel.labeling import PandasLFApplier

# Labeling functions handed to the applier. Later cells index the label matrix
# by column, presumably in this list's order -- keep that in mind when reordering.
lfs = [
    lf_matches_wikidata_langs,
    lf_lower_matches_wikidata_langs,
    lf_left_contains_language,
    # lf_right_contains_language,
    lf_is_framework,
    lf_starts_with_char,
    lf_wrong_entity_type,
    lf_token_count,
    lf_verb_in_noun,
    language_keyword_lf,
    written_keyword_lf,
    lf_not_browser,
] # + list(keyword_lfs.values())
# Applier that runs the LFs above over a pandas DataFrame.
applier = PandasLFApplier(lfs)

from snorkel.labeling import LFAnalysis

# Run every LF over the dev and train frames to obtain their label matrices.
L_dev = applier.apply(df_dev)
L_train = applier.apply(df_train)

# Per-LF summary on the dev matrix, scored against y_dev.
LFAnalysis(L_dev, lfs).lf_summary(y_dev)

  from pandas import Panel




  0%|          | 0/4702 [00:00<?, ?it/s][A[A[A[A

KeyError: ('left_text', 'occurred at index 200')

In [None]:
# Mean along axis 0 of "did not abstain": the share of entries in each column of
# L_train that carry a label.
(L_train != ABSTAIN).mean(axis=0)

In [None]:
from snorkel.analysis import get_label_buckets

# Group dev rows by (value in y_dev, value in column 1 of L_dev).
# NOTE(review): column 1 presumably corresponds to the second entry of `lfs`
# (lf_lower_matches_wikidata_langs) -- confirm before reading the results.
buckets = get_label_buckets(y_dev, L_dev[:, 1])

# Rows where y_dev is NEGATIVE but that column voted POSITIVE.
df_dev.iloc[buckets[NEGATIVE, POSITIVE]]

# df_dev.iloc[]

In [None]:
from snorkel.labeling import LabelModel

# Two-class label model fitted on the train label matrix; the second positional
# argument is passed as None and the seed is fixed at 1337.
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, None, n_epochs=5000, log_freq=500, seed=1337)

label_model

In [None]:
from snorkel.analysis import metric_score
from snorkel.utils import probs_to_preds

# Score the label model on the dev label matrix from its class probabilities.
probs_dev = label_model.predict_proba(L_dev)
preds_dev = probs_to_preds(probs_dev)

f1_dev = metric_score(y_dev, preds_dev, probs=probs_dev, metric='f1')
print(f"Label model f1 score: {f1_dev}")

roc_auc_dev = metric_score(y_dev, preds_dev, probs=probs_dev, metric='roc_auc')
print(f"Label model roc-auc: {roc_auc_dev}")

In [None]:
from snorkel.labeling import filter_unlabeled_dataframe

# Class probabilities from the label model for the train label matrix, then keep
# only the rows (and matching probabilities) that filter_unlabeled_dataframe
# retains given L_train.
probs_train = label_model.predict_proba(L_train)
df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=df_train, y=probs_train, L=L_train
)

In [None]:
# Show a short preview instead of rendering the whole filtered frame.
df_train_filtered.head()

In [None]:
def re_tokenize(row, window=5):
    """Attach the text of the tokens just left of the entity as `left_text`.

    Args:
        row: Mapping or Series holding a parsed doc under 'spacy' and a token
            index under 'start' (presumably where the entity begins).
        window: Controls how far left of 'start' the slice reaches.

    Returns:
        A copy of `row` with 'left_text' set to the text of `doc[left:start]`,
        where `left = max(start - 1 - window, 0)`. The text is empty when
        'start' is 0.

    The text is read straight from the span. Going through
    `retokenizer.merge(...)` does not work here: it only schedules a merge and
    returns None, and it rewrites the doc's tokenization, which would shift
    token offsets for any other row that refers to the same doc.
    """
    doc = row['spacy']
    # NOTE(review): this bound yields up to window + 1 tokens -- confirm the intended width.
    left = max(row['start'] - 1 - window, 0)
    updated = row.copy()
    updated['left_text'] = doc[left : row['start']].text
    return updated

# Run re_tokenize over every filtered train row.
tf_train = df_train_filtered.apply(re_tokenize, axis=1)
# Show a short preview instead of rendering the whole frame.
tf_train.head()

In [None]:
from typing import Tuple
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.layers import (
    Bidirectional,
    Concatenate,
    Dense,
    Embedding,
    Input,
    LSTM,
)


def get_feature_arrays(df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Build fixed-width token arrays for the left, between and right contexts.

    Each row's token list is cut to 40 entries and padded on the right with
    empty strings. The left context is taken from `tokens` before
    `person1_word_idx[0]`, keeping the slice `[-4:-1]` of that prefix.

    Returns:
        (left_tokens, bet_tokens, right_tokens), one array row per row of `df`.
    """

    def left_window(row):
        return row.tokens[: row.person1_word_idx[0]][-4:-1]

    def pad_or_truncate(l, max_length=40):
        kept = l[:max_length]
        padding = [""] * (max_length - len(l))
        return kept + padding

    def to_array(token_lists):
        return np.array([pad_or_truncate(tokens) for tokens in token_lists])

    between = df.between_tokens
    left = df.apply(left_window, axis=1)
    right = df.person2_right_tokens

    left_tokens = to_array(left)
    bet_tokens = to_array(between)
    right_tokens = to_array(right)
    return left_tokens, bet_tokens, right_tokens


def bilstm(
    tokens: tf.Tensor,
    rnn_state_size: int = 64,
    num_buckets: int = 40000,
    embed_dim: int = 36,
):
    """Encode string tokens with a hashed embedding followed by a bidirectional LSTM.

    Args:
        tokens: String tensor of tokens.
        rnn_state_size: State size of the LSTM wrapped by Bidirectional.
        num_buckets: Number of hash buckets the token strings are mapped to.
        embed_dim: Width of the embedding looked up for each bucket id.

    Returns:
        Output of the bidirectional LSTM. The per-token string length is
        passed as the mask, so empty-string tokens get a mask value of 0.
    """
    token_ids = tf.strings.to_hash_bucket(tokens, num_buckets)
    embedded_tokens = Embedding(num_buckets, embed_dim)(token_ids)
    encoder = Bidirectional(LSTM(rnn_state_size, activation=tf.nn.relu))
    return encoder(embedded_tokens, mask=tf.strings.length(tokens))


def get_model(
    rnn_state_size: int = 64, num_buckets: int = 40000, embed_dim: int = 12
) -> tf.keras.Model:
    """
    Return LSTM model for predicting label probabilities.

    The model takes three string inputs in the order (left, between, right),
    the same order in which get_feature_arrays returns its arrays, so each
    array reaches the placeholder named after it. Each input is encoded by its
    own bilstm; the encodings are concatenated and passed through two dense
    layers and a 2-way softmax.

    Args:
        rnn_state_size: LSTM state size.
        num_buckets: Number of buckets to hash strings to integers.
        embed_dim: Size of token embeddings.
    Returns:
        model: A compiled LSTM model.
    """
    left_ph = Input((None,), dtype="string")
    bet_ph = Input((None,), dtype="string")
    right_ph = Input((None,), dtype="string")
    left_embs = bilstm(left_ph, rnn_state_size, num_buckets, embed_dim)
    bet_embs = bilstm(bet_ph, rnn_state_size, num_buckets, embed_dim)
    right_embs = bilstm(right_ph, rnn_state_size, num_buckets, embed_dim)
    layer = Concatenate(1)([left_embs, bet_embs, right_embs])
    layer = Dense(64, activation=tf.nn.relu)(layer)
    layer = Dense(32, activation=tf.nn.relu)(layer)
    probabilities = Dense(2, activation=tf.nn.softmax)(layer)
    # Input order matches the (left, between, right) tuple from get_feature_arrays.
    model = tf.keras.Model(inputs=[left_ph, bet_ph, right_ph], outputs=probabilities)
    # NOTE(review): tf.train.AdagradOptimizer is the TF 1.x location of this
    # optimizer -- confirm the pinned TensorFlow version still provides it.
    model.compile(tf.train.AdagradOptimizer(0.1), "categorical_crossentropy")
    return model


In [None]:
# # Check for the `spouse` words appearing to the left of the person mentions
# @labeling_function(resources=dict(spouses=spouses), pre=[get_left_tokens])
# def lf_husband_wife_left_window(x, spouses):
#     if len(set(spouses).intersection(set(x.person1_left_tokens))) > 0:
#         return POSITIVE
#     elif len(set(spouses).intersection(set(x.person2_left_tokens))) > 0:
#         return POSITIVE
#     else:
#         return ABSTAIN

In [None]:
# # Check for the person mentions having the same last name
# @labeling_function(pre=[get_person_last_names])
# def lf_same_last_name(x):
#     p1_ln, p2_ln = x.person_lastnames

#     if p1_ln and p2_ln and p1_ln == p2_ln:
#         return POSITIVE
#     return ABSTAIN

In [None]:
# # Check for the word `married` between person mentions
# @labeling_function()
# def lf_married(x):
#     return POSITIVE if "married" in x.between_tokens else ABSTAIN

In [None]:
# # Check for words that refer to `family` relationships between and to the left of the person mentions
# family = {
#     "father",
#     "mother",
#     "sister",
#     "brother",
#     "son",
#     "daughter",
#     "grandfather",
#     "grandmother",
#     "uncle",
#     "aunt",
#     "cousin",
# }
# family = family.union({f + "-in-law" for f in family})


# @labeling_function(resources=dict(family=family))
# def lf_familial_relationship(x, family):
#     return NEGATIVE if len(family.intersection(set(x.between_tokens))) > 0 else ABSTAIN


# @labeling_function(resources=dict(family=family), pre=[get_left_tokens])
# def lf_family_left_window(x, family):
#     if len(set(family).intersection(set(x.person1_left_tokens))) > 0:
#         return NEGATIVE
#     elif len(set(family).intersection(set(x.person2_left_tokens))) > 0:
#         return NEGATIVE
#     else:
#         return ABSTAIN

In [None]:
# # Check for `other` relationship words between person mentions
# other = {"boyfriend", "girlfriend", "boss", "employee", "secretary", "co-worker"}


# @labeling_function(resources=dict(other=other))
# def lf_other_relationship(x, other):
#     return NEGATIVE if len(other.intersection(set(x.between_tokens))) > 0 else ABSTAIN

### Distant Supervision Labeling Functions

In addition to using factories that encode pattern matching heuristics, we can also write labeling functions that _distantly supervise_ data points. Here, we'll load in a list of known spouse pairs and check to see if the pair of persons in a candidate matches one of these.

[**DBpedia**](http://wiki.dbpedia.org/): Our database of known spouses comes from DBpedia, which is a community-driven resource similar to Wikipedia but for curating structured data. We'll use a preprocessed snapshot as our knowledge base for all labeling function development.

We can look at some of the example entries from DBpedia and use them in a simple distant supervision labeling function.

Make sure `dbpedia.pkl` is in the `spouse/data` directory.

In [None]:
# with open("data/dbpedia.pkl", "rb") as f:
#     known_spouses = pickle.load(f)

# list(known_spouses)[0:5]

In [None]:
# @labeling_function(resources=dict(known_spouses=known_spouses), pre=[get_person_text])
# def lf_distant_supervision(x, known_spouses):
#     p1, p2 = x.person_names
#     if (p1, p2) in known_spouses or (p2, p1) in known_spouses:
#         return POSITIVE
#     else:
#         return ABSTAIN

In [None]:
# from preprocessors import last_name

# # Last name pairs for known spouses
# last_names = set(
#     [
#         (last_name(x), last_name(y))
#         for x, y in known_spouses
#         if last_name(x) and last_name(y)
#     ]
# )


# @labeling_function(resources=dict(last_names=last_names), pre=[get_person_last_names])
# def lf_distant_supervision_last_names(x, last_names):
#     p1_ln, p2_ln = x.person_lastnames

#     return (
#         POSITIVE
#         if (p1_ln != p2_ln)
#         and ((p1_ln, p2_ln) in last_names or (p2_ln, p1_ln) in last_names)
#         else ABSTAIN
#     )

#### Apply Labeling Functions to the Data
We create a list of labeling functions and apply them to the data.

In [None]:
# from snorkel.labeling import PandasLFApplier

# lfs = [
#     lf_husband_wife,
#     lf_husband_wife_left_window,
#     lf_same_last_name,
#     lf_married,
#     lf_familial_relationship,
#     lf_family_left_window,
#     lf_other_relationship,
#     lf_distant_supervision,
#     lf_distant_supervision_last_names,
# ]
# applier = PandasLFApplier(lfs)

In [None]:
# from snorkel.labeling import LFAnalysis

# L_dev = applier.apply(df_dev)
# L_train = applier.apply(df_train)

In [None]:
# LFAnalysis(L_dev, lfs).lf_summary(Y_dev)

### Training the Label Model

Now, we'll train a model of the LFs to estimate their weights and combine their outputs. Once the model is trained, we can combine the outputs of the LFs into a single, noise-aware training label set for our extractor.

In [None]:
# from snorkel.labeling import LabelModel

# label_model = LabelModel(cardinality=2, verbose=True)
# label_model.fit(L_train, Y_dev, n_epochs=5000, log_freq=500, seed=12345)

### Label Model Metrics
Since our dataset is highly unbalanced (91% of the labels are negative), even a trivial baseline that always outputs negative can get a high accuracy. So we evaluate the label model using the F1 score and ROC-AUC rather than accuracy.

In [None]:
from snorkel.analysis import metric_score
from snorkel.utils import probs_to_preds

# Score the label model on the dev label matrix from its class probabilities.
probs_dev = label_model.predict_proba(L_dev)
preds_dev = probs_to_preds(probs_dev)

f1_dev = metric_score(y_dev, preds_dev, probs=probs_dev, metric='f1')
print(f"Label model f1 score: {f1_dev}")

roc_auc_dev = metric_score(y_dev, preds_dev, probs=probs_dev, metric='roc_auc')
print(f"Label model roc-auc: {roc_auc_dev}")

### Part 4: Training our End Extraction Model

In this final section of the tutorial, we'll use our noisy training labels to train our end machine learning model. We start by filtering out training data points which did not receive a label from any LF, as these data points contain no signal.


In [None]:
from snorkel.labeling import filter_unlabeled_dataframe

# Class probabilities from the label model for the train label matrix, then keep
# only the rows (and matching probabilities) that filter_unlabeled_dataframe
# retains given L_train.
probs_train = label_model.predict_proba(L_train)
df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=df_train, y=probs_train, L=L_train
)

Next, we train a simple [LSTM](https://en.wikipedia.org/wiki/Long_short-term_memory) network for classifying candidates. `tf_model` contains functions for processing features and building the keras model for training and evaluation.

In [None]:
from tf_model import get_model, get_feature_arrays
from utils import get_n_epochs

# NOTE(review): the tf_model import at the top of this cell rebinds get_model and
# get_feature_arrays, replacing the versions defined earlier in this notebook.
X_train = get_feature_arrays(df_train_filtered)
model = get_model()
batch_size = 64
# The targets are the label model's class probabilities, not hard labels.
model.fit(X_train, probs_train_filtered, batch_size=batch_size, epochs=get_n_epochs())

Finally, we evaluate the trained model by measuring its F1 score and ROC_AUC.

In [None]:
# NOTE(review): df_test and Y_test are not defined in the visible part of this
# notebook, which names its labels y_dev -- confirm they exist before running.
X_test = get_feature_arrays(df_test)
probs_test = model.predict(X_test)
# Convert the predicted probabilities into predictions for the F1 score.
preds_test = probs_to_preds(probs_test)
print(
    f"Test F1 when trained with soft labels: {metric_score(Y_test, preds=preds_test, metric='f1')}"
)
print(
    f"Test ROC-AUC when trained with soft labels: {metric_score(Y_test, probs=probs_test, metric='roc_auc')}"
)

## Summary
In this tutorial, we showed how Snorkel can be used for Information Extraction. We demonstrated how to create LFs that leverage keywords and external knowledge bases (distant supervision). Finally, we showed how a model trained using the probabilistic outputs of the Label Model can achieve comparable performance while generalizing to all data points.