# Prepare Fonduer

In [17]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os
import sys
import logging
import sqlalchemy

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# Degree of parallelism handed to Fonduer's apply() calls (see the labeling step).
PARALLEL = 6
# Database name; it is appended to conn_string when the Fonduer session is created.
ATTRIBUTE = "jobs_companie"
# Server-level Postgres URL without a database name. Keep the trailing "/":
# ATTRIBUTE is concatenated directly onto this string.
conn_string = 'postgresql://user@127.0.0.1:8001/'

In [7]:
# Server-level connection (conn_string carries no database name).
# NOTE(review): in the visible notebook `conn` is only referenced by the
# commented-out "create database" cell, which is also the only place it is closed.
engine = sqlalchemy.create_engine(conn_string)
conn = engine.connect()

In [None]:
# conn.execute("commit")
# conn.execute("create database "+ATTRIBUTE)
# conn.close()

In [8]:
from fonduer import Meta, init_logging

# Configure logging for Fonduer
init_logging(log_dir="logs")

# Point Fonduer at the ATTRIBUTE database on the server and open the session
# that the later querying, extraction and labeling cells all reuse.
session = Meta.init(conn_string+ATTRIBUTE).Session()

[2022-04-19 17:01:07,887][INFO] fonduer.meta:49 - Setting logging directory to: logs/2022-04-19_17-01-07
[2022-04-19 17:01:07,888][INFO] fonduer.meta:134 - Connecting user:user to 127.0.0.1:8001/jobs_companie
[2022-04-19 17:01:08,406][INFO] fonduer.meta:162 - Initializing the storage schema


### Parse Docs

In [9]:
from fonduer.parser.preprocessors import HTMLDocPreprocessor
from fonduer.parser import Parser

# docs_path = 'data/html/'
# # pdf_path = 'data/pdf/'

# max_docs = 100
# doc_preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)

In [None]:
# corpus_parser = Parser(session, structural=True, lingual=True)#, visual=True)#, pdf_path=pdf_path)
# corpus_parser.apply(doc_preprocessor, parallelism=PARALLEL)

In [11]:
from fonduer.parser.models import Document, Sentence

# Sanity check: how many documents and sentences are already stored in the
# database (the parsing cells above are commented out, so nothing is parsed here).
print(f"Documents: {session.query(Document).count()}")
print(f"Sentences: {session.query(Sentence).count()}")

# Every stored document, sorted by name; reused by the extraction and labeling steps.
docs = session.query(Document).order_by(Document.name).all()

Documents: 5
Sentences: 578


### Create Entities

In [12]:
from fonduer.candidates.models import mention_subclass

# Mention classes for the two entity types that are extracted and paired below.
Job = mention_subclass("Job")
Company = mention_subclass("Company")

In [69]:
from fonduer.candidates import MentionNgrams

# One span generator per mention type, both configured with n_min=1 and n_max=3
# (presumably 1- to 3-token n-grams; confirm against the Fonduer documentation).
job_ngrams = MentionNgrams(n_max=3, n_min=1)
company_ngrams = MentionNgrams(n_max=3, n_min=1)

In [95]:
from fonduer.candidates.matchers import LambdaFunctionMatcher

# Annotated surface strings per label, used by the matcher predicates below.
# NOTE(review): `export` is only assigned by the load_ls_export(...) call in the
# "Load Gold data" section further down, so that cell has to run before this one.
companies = {entry["text"] for entry in export.values() if entry["label"] == "Company"}
jobs = {entry["text"] for entry in export.values() if entry["label"] == "Job"}

def is_job(mention):
    """Tell whether a mention's text is one of the annotated job strings.

    Args:
        mention: Object exposing ``get_span()``, whose result is looked up in
            the module-level ``jobs`` set.

    Returns:
        bool: True if the span text is in ``jobs``, otherwise False.
    """
    # The membership test already yields a bool. The previous else-branch
    # evaluated a bare `False` without returning it, so misses produced None.
    return mention.get_span() in jobs

def is_company(mention):
    """Tell whether a mention's text is one of the annotated company strings.

    Args:
        mention: Object exposing ``get_span()``, whose result is looked up in
            the module-level ``companies`` set.

    Returns:
        bool: True if the span text is in ``companies``, otherwise False.
    """
    # The membership test already yields a bool. The previous else-branch
    # evaluated a bare `False` without returning it, so misses produced None.
    return mention.get_span() in companies
    
# Wrap the two predicates so the mention extractor can use them as matchers.
job_matcher = LambdaFunctionMatcher(func=is_job)
company_matcher = LambdaFunctionMatcher(func=is_company)

In [96]:
from fonduer.candidates import MentionExtractor

# The three lists are given in the same order (Job first, Company second);
# presumably the extractor pairs them positionally -- confirm against the Fonduer docs.
mention_extractor = MentionExtractor(
    session,
    [Job, Company],
    [job_ngrams, company_ngrams],
    [job_matcher, company_matcher],
)

In [97]:
from fonduer.candidates.models import Mention

# Run mention extraction over every document, then report how many mentions
# of each class ended up in the database.
mention_extractor.apply(docs)
num_jobs = session.query(Job).count()
num_companies = session.query(Company).count()
print(
    f"Total Mentions: {session.query(Mention).count()} ({num_jobs} jobs, {num_companies} companies)"
)

[2022-04-19 18:15:22,567][INFO] fonduer.candidates.mentions:467 - Clearing table: job
[2022-04-19 18:15:22,584][INFO] fonduer.candidates.mentions:467 - Clearing table: company
[2022-04-19 18:15:22,588][INFO] fonduer.candidates.mentions:475 - Cascading to clear table: job_company
[2022-04-19 18:15:22,594][INFO] fonduer.candidates.mentions:475 - Cascading to clear table: presidentname_placeofbirth
[2022-04-19 18:15:22,598][INFO] fonduer.utils.udf:67 - Running UDF...


  0%|          | 0/5 [00:00<?, ?it/s]

Total Mentions: 34 (14 jobs, 20 companies)


In [98]:
from fonduer.candidates.models import candidate_subclass

# Candidate class built from the Job and Company mention classes, in that order.
# NOTE(review): gold() indexes candidates as cand[0] / cand[1]; presumably these
# follow the order given here (Job, then Company) -- confirm.
JobCompany = candidate_subclass(
    "JobCompany", [Job, Company]
)

In [99]:
from fonduer.candidates import CandidateExtractor

# Build JobCompany candidates for every document.
candidate_extractor = CandidateExtractor(session, [JobCompany])
candidate_extractor.apply(docs)

[2022-04-19 18:15:29,214][INFO] fonduer.candidates.candidates:137 - Clearing table job_company (split 0)
[2022-04-19 18:15:29,217][INFO] fonduer.utils.udf:67 - Running UDF...


  0%|          | 0/5 [00:00<?, ?it/s]

In [106]:
# Fetch the extracted candidates for inspection.
# NOTE(review): `test_cand` is not referenced again in the visible notebook.
test_cand = candidate_extractor.get_candidates()

  .filter(candidate_class.id.in_(sub_query))


### Load Gold data

In [88]:
from bs4 import BeautifulSoup  # type: ignore
from lxml import etree  # type: ignore
import json
import os

def get_filename(label_studio_str: str) -> str:
    """Recover the document name from a Label Studio ``file_upload`` value.

    The value is expected to look like ``"<id>-<name>.<ext>"``. Everything up
    to and including the first hyphen (the id) is dropped, and so is the last
    extension. Hyphens and dots inside ``<name>`` are preserved; the previous
    implementation split on every "-" and "." and joined the pieces with "",
    turning e.g. ``"my-report.v2"`` into ``"myreportv2"``.

    Args:
        label_studio_str: The ``file_upload`` string from the export.

    Returns:
        The file name without the id prefix and without the final extension.
        A string with no hyphen is treated as having no id prefix, and a name
        with no dot is returned unchanged.
    """
    # Strip only the leading "<id>-" prefix.
    _upload_id, hyphen, remainder = label_studio_str.partition("-")
    name = remainder if hyphen else label_studio_str
    # Strip only the last extension.
    stem, dot, _extension = name.rpartition(".")
    return stem if dot else name

def get_html_tree_from_string(html_string):
    """Parse an HTML string and return the lxml tree of the parsed document.

    The markup is first run through BeautifulSoup, then re-serialised and
    parsed with ``lxml.etree.HTML``.

    Args:
        html_string: Raw HTML markup.

    Returns:
        The result of ``getroottree()`` on the element produced by
        ``etree.HTML``.
    """
    # Name the parser explicitly. Without it BeautifulSoup picks whichever
    # parser it ranks best in the current environment and emits a
    # GuessedAtParserWarning. lxml is already required by this cell.
    soup = BeautifulSoup(html_string, "lxml")
    dom = etree.HTML(str(soup))
    return dom.getroottree()

def get_absolute_xpath(rel_xpath, dom):
    """Return the tree path of the parent of the first node matching an XPath.

    Args:
        rel_xpath: XPath string; a "/" is prepended before it is evaluated.
        dom: Tree object providing ``xpath()`` and ``getpath()``.

    Returns:
        ``dom.getpath(...)`` of the parent of the first match.

    Raises:
        IndexError: If the query matches nothing.
    """
    query = "/" + rel_xpath
    first_match = dom.xpath(query)[0]
    parent = first_match.getparent()
    return dom.getpath(parent)
    
def load_ls_export(export_path, session):
    """Load a Label Studio JSON export and align its annotations with Fonduer.

    For every annotated document the matching Fonduer ``Document`` is looked
    up by name. Each result entry that has a ``"value"`` is treated as a
    labeled span and resolved to the Fonduer ``Sentence`` with the same
    absolute XPath. Every other result entry is treated as a relation between
    two spans.

    Args:
        export_path: Path to the JSON export file.
        session: Database session used to query ``Document`` and ``Sentence``.

    Returns:
        tuple: ``(spots, relations, id_label)`` where
            * ``spots`` maps a Label Studio result id to a dict with the keys
              ``xpath_abs``, ``label``, ``text``, ``ls_ID``,
              ``fd_sentence_id``, ``filename`` and ``fonduer_doc_id``;
            * ``relations`` is a list of ``(from_id, to_id, labels)`` tuples;
            * ``id_label`` maps a Label Studio result id to its label.

    Raises:
        ValueError: If a document or a sentence referenced by the export
            cannot be found in the database.
    """
    # JSON exports are UTF-8; do not depend on the platform default encoding.
    with open(export_path, "r", encoding="utf-8") as fin:
        export = json.load(fin)
    spots = {}
    relations = []
    # This mapping was previously never initialised inside the function, so it
    # only worked when a global `id_label` already existed in the kernel.
    id_label = {}
    for annotated_doc in export:
        filename = get_filename(annotated_doc["file_upload"])
        doc_row = session.query(Document.id).filter(Document.name == filename).first()
        if doc_row is None:
            raise ValueError(f"No Fonduer document named {filename!r} found for the export entry")
        fonduer_doc_id = str(doc_row[0])
        tree = get_html_tree_from_string(annotated_doc["data"]["text"])  # recreate html tree for doc

        for annotations in annotated_doc["annotations"]:
            if not annotations["result"]:
                continue
            for entety in annotations["result"]:
                if entety.get("value"):
                    # Labeled span: map its XPath onto the stored sentence.
                    xpath_rel = entety["value"]["start"]
                    xpath_abs = get_absolute_xpath(xpath_rel, tree)
                    fd_sentence_id = session.query(Sentence.id).filter(
                        Sentence.document_id == fonduer_doc_id,
                        Sentence.xpath == xpath_abs,
                    ).first()
                    if fd_sentence_id is None:
                        raise ValueError(
                            f"No sentence with xpath {xpath_abs!r} in document {filename!r}"
                        )
                    label = entety["value"]["labels"][0]
                    ls_ID = entety["id"]
                    id_label[ls_ID] = label

                    spots[ls_ID] = {
                        "xpath_abs": xpath_abs,
                        "label": label,
                        "text": entety["value"]["text"].replace("\\n", ""),
                        "ls_ID": ls_ID,
                        "fd_sentence_id": str(fd_sentence_id[0]),
                        "filename": filename,
                        "fonduer_doc_id": fonduer_doc_id,
                    }
                else:
                    # Entry without a "value": a relation between two span ids.
                    relations.append((entety["from_id"], entety["to_id"], entety["labels"]))
    return spots, relations, id_label

def get_gold_table(relations, export):
    """Flatten annotated relations into comparable six-field tuples.

    Args:
        relations: Iterable of sequences whose first two items are keys into
            ``export`` (the two ends of a relation).
        export: Mapping from those keys to dicts containing at least
            ``fonduer_doc_id``, ``text`` and ``fd_sentence_id``.

    Returns:
        list: One tuple per relation, holding
        ``(doc_id, text, sentence_id)`` of the first end followed by the same
        three fields of the second end.

    Raises:
        AssertionError: If the two ends belong to different documents.
    """
    fields = ("fonduer_doc_id", "text", "fd_sentence_id")
    rows = []
    for link in relations:
        source = export.get(link[0])
        target = export.get(link[1])
        # A relation must stay inside a single document.
        assert source["fonduer_doc_id"] == target["fonduer_doc_id"]
        rows.append(
            tuple(source[key] for key in fields) + tuple(target[key] for key in fields)
        )
    return rows

In [138]:
# Location of the Label Studio export, relative to the notebook's working directory.
export_path = os.path.join("data", "export", "export_1.json")

# `export` holds the labeled spans keyed by Label Studio id; `gold_table` is the
# list of relation tuples that gold() checks candidates against.
export, relations, id_label = load_ls_export(export_path, session)
gold_table = get_gold_table(relations, export)

### Create gold label function

In [163]:
def gold(cand):
    """Gold labeling function: 1 if the candidate is an annotated relation, else 0.

    The candidate is turned into a six-string tuple (document id, span text and
    sentence id of ``cand[1]``, followed by the same fields of ``cand[0]``) and
    looked up in the module-level ``gold_table``.

    Args:
        cand: Indexable candidate whose items expose ``document_id`` and a
            ``context`` with ``get_span()`` and ``sentence.id``.

    Returns:
        int: 1 when the tuple is present in ``gold_table``, otherwise 0.
    """
    # cand[1] comes first, mirroring the field order of the gold_table rows.
    lookup_key = tuple(
        str(field)
        for mention in (cand[1], cand[0])
        for field in (
            mention.document_id,
            mention.context.get_span(),
            mention.context.sentence.id,
        )
    )
    return 1 if lookup_key in gold_table else 0


### Label docs

In [164]:
from fonduer.supervision.models import GoldLabel
from fonduer.supervision import Labeler

In [165]:
# Labeler bound to the JobCompany candidate class.
labeler = Labeler(session, [JobCompany])

In [166]:
# Apply the gold() function to all documents and store the results in the
# GoldLabel table. `lfs` is a list of lists; presumably one inner list per
# candidate class given to the Labeler -- confirm against the Fonduer docs.
%time labeler.apply(docs=docs, lfs=[[gold]], table=GoldLabel, train=False, parallelism=PARALLEL)

[2022-04-19 18:29:51,218][INFO] fonduer.supervision.labeler:330 - Clearing Labels (split ALL)
  query = self.session.query(table).filter(table.candidate_id.in_(sub_query))
[2022-04-19 18:29:51,225][INFO] fonduer.utils.udf:67 - Running UDF...


  0%|          | 0/5 [00:00<?, ?it/s]

CPU times: user 16.3 ms, sys: 39.9 ms, total: 56.3 ms
Wall time: 196 ms


In [174]:
# Read the stored gold labels back for all candidates. The annotator name "gold"
# presumably has to match the name of the gold() labeling function -- confirm.
all_gold = labeler.get_gold_labels(candidate_extractor.get_candidates(), annotator="gold")

  .filter(candidate_class.id.in_(sub_query))


In [175]:
# Sum of the first returned label array. gold() emits 1 for a match and 0
# otherwise, so this is presumably the number of positive candidates (the
# recorded output is 5).
all_gold[0].sum()

5