# LabelstudioToFonduer example
This notebook describes how `LabelstudioToFonduer` can be used to convert a Label Studio export into Fonduer gold labels. 

Most of the notebook sets up the Fonduer pipeline for the example data; `LabelstudioToFonduer` itself is described in Section 2.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os
import sys
import logging
import sqlalchemy

In [36]:
# Degree of parallelism handed to the Fonduer `apply(...)` calls below.
PARALLEL = 8
# Name of the PostgreSQL database used for this example.
ATTRIBUTE = "jobs_companie"
# Server-level connection string; the database name is appended where needed.
# Credentials should not be hard-coded in a shared notebook: set the
# FONDUER_CONN_STRING environment variable to override the local default.
conn_string = os.environ.get(
    "FONDUER_CONN_STRING", "postgresql://postgres:postgres@127.0.0.1:5432/"
)

In [37]:
# Connect to the PostgreSQL server using the server-level connection string;
# this connection is used below to create the example database.
engine = sqlalchemy.create_engine(conn_string)
conn = engine.connect()

In [38]:
# Create DB (only if it does not exist yet, so the notebook can be re-run
# without failing on "database already exists").
db_exists = conn.execute(
    sqlalchemy.text("SELECT 1 FROM pg_database WHERE datname = :name"),
    {"name": ATTRIBUTE},
).fetchone() is not None
if not db_exists:
    # CREATE DATABASE cannot run inside a transaction block, so end the
    # implicitly opened transaction first.
    conn.execute("commit")
    # Database names cannot be bound as query parameters; ATTRIBUTE is a
    # constant defined in this notebook, not user input.
    conn.execute("create database " + ATTRIBUTE)
conn.close()

# Delete DB
# conn.execute("commit")
# conn.execute("drop database jobs_companie")#+ATTRIBUTE)
# conn.close()

# force delete DB
# conn.execute("commit")
# conn.execute("""SELECT 
#     pg_terminate_backend(pid) 
# FROM 
#     pg_stat_activity 
# WHERE 
#     -- don't kill my own connection!
#     pid <> pg_backend_pid()
#     -- don't kill the connections to other databases
#     AND datname = 'jobs_companie'
#     ;""")
# # conn.close()

# List DBs: query the pg_database catalog to confirm the example database exists.
# Kept as the last expression of the cell so the result is displayed.
engine.execute('SELECT datname FROM pg_database;').fetchall()

[('postgres',), ('template1',), ('template0',), ('jobs_companie',)]

In [40]:
from fonduer import Meta, init_logging

# Configure logging for Fonduer (log files are written below "logs").
init_logging(log_dir="logs")

# Open a Fonduer session on the example database: the database name is
# appended to the server-level connection string.
session = Meta.init(conn_string + ATTRIBUTE).Session()

[2022-05-17 13:30:50,494][INFO] fonduer.meta:53 - Logging was already initialized to use logs/2022-05-17_13-25-49.  To configure logging manually, call fonduer.init_logging before initialiting Meta.
[2022-05-17 13:30:50,516][INFO] fonduer.meta:135 - Connecting user:postgres to 127.0.0.1:5432/jobs_companie
[2022-05-17 13:30:50,524][INFO] fonduer.meta:162 - Initializing the storage schema


### 1. Parse Docs

In [41]:
from fonduer.parser.preprocessors import HTMLDocPreprocessor
from fonduer.parser import Parser

# Directory containing the HTML documents to parse.
docs_path = 'data/html/'
# pdf_path = 'data/pdf/'

# Upper bound on the number of documents handed to the parser.
max_docs = 100
doc_preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)

In [42]:
# Parse the documents with structural and lingual information; the visual
# options (which would need the PDF path) are left commented out at the end
# of the constructor line.
corpus_parser = Parser(session, structural=True, lingual=True)#, visual=True)#, pdf_path=pdf_path)
corpus_parser.apply(doc_preprocessor, parallelism=PARALLEL)

[2022-05-17 13:30:54,742][INFO] fonduer.utils.udf:67 - Running UDF...


  0%|          | 0/5 [00:00<?, ?it/s]

In [43]:
from fonduer.parser.models import Document, Sentence

# Report how many rows the parser stored for each model.
for label, model in (("Documents", Document), ("Sentences", Sentence)):
    print(f"{label}: {session.query(model).count()}")

# Documents sorted by name, reused by the extraction and labeling steps below.
docs = session.query(Document).order_by(Document.name).all()

Documents: 5
Sentences: 578


### 2. LabelstudioToFonduer

In [12]:
! pip install -e labelstudio-to-fonduer/src

Defaulting to user installation because normal site-packages is not writeable
Obtaining file:///workspace/labelstudio-to-fonduer/src
  Preparing metadata (setup.py) ... [?25ldone
[?25hInstalling collected packages: LabelstudioToFonduer
  Attempting uninstall: LabelstudioToFonduer
    Found existing installation: LabelstudioToFonduer 0.0.1
    Uninstalling LabelstudioToFonduer-0.0.1:
      Successfully uninstalled LabelstudioToFonduer-0.0.1
  Running setup.py develop for LabelstudioToFonduer
Successfully installed LabelstudioToFonduer-0.0.1


In [46]:
from LabelstudioToFonduer.ls_export import Export

# Load the Label Studio export file "export_1.json". The Fonduer session is
# passed in as well — presumably so the labels can be related to the parsed
# documents (TODO confirm against the Export implementation).
export = Export(session, "export_1.json")

In [None]:
# `export.is_gold` serves as the labeling function for the gold labels: it is
# passed to `labeler.apply(...)` via `lfs=[[gold]]` further down, so this cell
# must be executed before the labeling step.
gold = export.is_gold

### 3. Setup data model

In [44]:
from fonduer.candidates.models import mention_subclass

# One mention class per label looked up in the export ("Job" and "Company").
Job = mention_subclass("Job")
Company = mention_subclass("Company")

In [45]:
from fonduer.candidates import MentionNgrams

# Mention spaces: n-grams with n from 1 to 3 for both mention types.
job_ngrams = MentionNgrams(n_max=3, n_min=1)
company_ngrams = MentionNgrams(n_max=3, n_min=1)

In [47]:
from fonduer.candidates.matchers import LambdaFunctionMatcher

# Labeled entries from the Label Studio export, looked up by label name; the
# matcher functions below test mention text for membership in these collections.
companies = export.labels["Company"]
jobs = export.labels["Job"]

def is_job(mention):
    """Return whether the mention's text was labeled as a "Job" in the export.

    Args:
        mention: Object handed in by the matcher; only its ``get_span()``
            method is used.

    Returns:
        bool: ``True`` when ``mention.get_span()`` is contained in ``jobs``,
        ``False`` otherwise.
    """
    # Return the membership test directly: the former `else: False` branch
    # evaluated False without returning it, so non-matches yielded None.
    return mention.get_span() in jobs

def is_company(mention):
    """Return whether the mention's text was labeled as a "Company" in the export.

    Args:
        mention: Object handed in by the matcher; only its ``get_span()``
            method is used.

    Returns:
        bool: ``True`` when ``mention.get_span()`` is contained in
        ``companies``, ``False`` otherwise.
    """
    # Return the membership test directly: the former `else: False` branch
    # evaluated False without returning it, so non-matches yielded None.
    return mention.get_span() in companies
    
# Wrap the predicate functions so the mention extractor can use them as matchers.
job_matcher = LambdaFunctionMatcher(func=is_job)
company_matcher = LambdaFunctionMatcher(func=is_company)

In [48]:
from fonduer.candidates import MentionExtractor

# The three lists are given in the same order (Job first, Company second):
# mention class, n-gram space and matcher at the same index belong together.
mention_extractor = MentionExtractor(
    session,
    [Job, Company],
    [job_ngrams, company_ngrams],
    [job_matcher, company_matcher],
)

In [49]:
from fonduer.candidates.models import Mention

# Run the mention extraction over all parsed documents.
mention_extractor.apply(docs)

# Count the stored mentions per class, then overall, and print a summary.
num_jobs, num_companies = (session.query(cls).count() for cls in (Job, Company))
total_mentions = session.query(Mention).count()
print(f"Total Mentions: {total_mentions} ({num_jobs} jobs, {num_companies} companies)")

[2022-05-17 13:37:51,303][INFO] fonduer.candidates.mentions:467 - Clearing table: job
[2022-05-17 13:37:51,340][INFO] fonduer.candidates.mentions:467 - Clearing table: company
[2022-05-17 13:37:51,356][INFO] fonduer.candidates.mentions:475 - Cascading to clear table: job_company
[2022-05-17 13:37:51,379][INFO] fonduer.utils.udf:67 - Running UDF...


  0%|          | 0/5 [00:00<?, ?it/s]

Total Mentions: 34 (14 jobs, 20 companies)


In [50]:
from fonduer.candidates.models import candidate_subclass

# Candidate class built from the two mention classes: a Job paired with a Company.
JobCompany = candidate_subclass(
    "JobCompany", [Job, Company]
)

In [51]:
from fonduer.candidates import CandidateExtractor

# Extract JobCompany candidates for every parsed document.
candidate_extractor = CandidateExtractor(session, [JobCompany])
candidate_extractor.apply(docs)

[2022-05-17 13:40:54,145][INFO] fonduer.candidates.candidates:138 - Clearing table job_company (split 0)
[2022-05-17 13:40:54,220][INFO] fonduer.utils.udf:67 - Running UDF...


  0%|          | 0/5 [00:00<?, ?it/s]

In [52]:
# Fetch the extracted candidates; they are used below to look up the gold labels.
train_cands = candidate_extractor.get_candidates()

### 4. Label docs

In [53]:
from fonduer.supervision.models import GoldLabel
from fonduer.supervision import Labeler

In [54]:
# Labeler bound to the session for the JobCompany candidate class.
labeler = Labeler(session, [JobCompany])

In [65]:
# Write labels to the GoldLabel table, using the Label Studio export (`gold`)
# as the only labeling function. `lfs` is a list of lists — presumably one
# inner list per candidate class given to the Labeler (TODO confirm).
%time labeler.apply(docs=docs, lfs=[[gold]], table=GoldLabel, train=True, parallelism=PARALLEL)

[2022-05-17 13:43:03,386][INFO] fonduer.supervision.labeler:330 - Clearing Labels (split ALL)
  query = self.session.query(table).filter(table.candidate_id.in_(sub_query))
[2022-05-17 13:43:03,589][INFO] fonduer.utils.udf:67 - Running UDF...


  0%|          | 0/5 [00:00<?, ?it/s]

CPU times: user 778 ms, sys: 319 ms, total: 1.1 s
Wall time: 2.51 s


In [66]:
# Read the stored gold labels back for the extracted candidates.
all_gold = labeler.get_gold_labels(train_cands)

In [70]:
# Sum of the gold labels in the first entry of `all_gold`
# (presumably the JobCompany candidates — TODO confirm the return layout).
all_gold[0].sum()

5