In [1]:
import os
import json

In [2]:
base_dir = "data"
export_dir = os.path.join(base_dir, "export")
html_dir = os.path.join(base_dir, "html")

### Read Labelstudio Export

In [3]:
with open(os.path.join(export_dir, "export_1.json"), "r") as fin:
    export = json.load(fin)

### Create Fonduer Objects

In [4]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os
import sys
import logging
import sqlalchemy

In [5]:
PARALLEL = 6  # assuming a quad-core machine
ATTRIBUTE = "jobs_companie"
conn_string = 'postgresql://user@127.0.0.1:8001/' + ATTRIBUTE

In [8]:
engine = sqlalchemy.create_engine('postgresql://user@127.0.0.1:8001/')
conn = engine.connect()

In [7]:
# conn.execute("commit")
# conn.execute("create database "+ATTRIBUTE)
# conn.close()

In [9]:
from fonduer import Meta, init_logging

# Configure logging for Fonduer
init_logging(log_dir="logs")

session = Meta.init(conn_string).Session()

[2022-04-19 10:19:48,236][INFO] fonduer.meta:49 - Setting logging directory to: logs/2022-04-19_10-19-48
[2022-04-19 10:19:48,237][INFO] fonduer.meta:134 - Connecting user:user to 127.0.0.1:8001/jobs_companie
[2022-04-19 10:19:48,746][INFO] fonduer.meta:162 - Initializing the storage schema


In [10]:
from fonduer.parser.preprocessors import HTMLDocPreprocessor
from fonduer.parser import Parser

# docs_path = 'data/html/'
# # pdf_path = 'data/pdf/'

# max_docs = 100
# doc_preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)

In [10]:
corpus_parser = Parser(session, structural=True, lingual=True)#, visual=True)#, pdf_path=pdf_path)

In [11]:
corpus_parser.apply(doc_preprocessor, parallelism=PARALLEL)

In [11]:
from fonduer.parser.models import Document, Sentence

print(f"Documents: {session.query(Document).count()}")
print(f"Sentences: {session.query(Sentence).count()}")

Documents: 5
Sentences: 578


In [27]:
docs = session.query(Document).order_by(Document.name).all()

# Match

In [12]:
import fonduer
from bs4 import BeautifulSoup  # type: ignore
from lxml import etree  # type: ignore

In [13]:
def get_filename(label_studio_str: str):
    split = label_studio_str.split("-")  # strip id
    full = "".join(split[1:])
    split = full.split(".")
    result = "".join(split[:-1])
    return result

In [14]:
def get_html_tree_from_string(html_string):
    soup = BeautifulSoup(html_string)
    dom = etree.HTML(str(soup))
    root = dom.getroottree()
    return root

In [15]:
def get_absolute_xpath(rel_xpath, dom):
    res = dom.xpath("/"+rel_xpath)[0]
    return dom.getpath(res.getparent())

In [16]:
def parse_export(ls_export):
    with open(ls_export, "r") as fin:
        export = json.load(fin)
    
    for i, annotated_doc in enumerate(export):
        tree = get_html_tree_from_string(annotated_doc["data"]["text"])  # recreate html tree for doc

        for j, annotations in enumerate(annotated_doc["annotations"]):
            if not annotations["result"]:  # no annotations
                continue
            for k, entety in enumerate(annotations["result"]):
                if entety.get("value"):
                    xpath_rel = entety["value"]["start"]
                    text = entety["value"]["text"]

                    xpath_abs = get_absolute_xpath(xpath_rel, tree)

                    export[i]["annotations"][j]["result"][k]["value"][
                        "start_abs"
                    ] = xpath_abs

    return export

In [17]:
exp_fin = parse_export(os.path.join(export_dir, "export_1.json"))

In [18]:
labeled_docs={}
for annotated_doc in exp_fin:
    filename = get_filename(annotated_doc["file_upload"])
    fonduer_doc_id = str(session.query(Document.id).filter(Document.name==filename).first()[0])

    doc = {}
    doc["filename"] = filename
    doc["fonduer_doc_id"] = fonduer_doc_id
    doc["spots"] = []

    for annotations in annotated_doc["annotations"]:
        if not annotations["result"]:
            continue
        for entety in annotations["result"]:
            if entety.get("value"):
                xpath_abs = entety["value"]["start_abs"]
                text = entety["value"]["text"]
                
                sentence = session.query(Sentence).filter(Sentence.document_id==fonduer_doc_id, Sentence.xpath==xpath_abs).first()
                assert text.replace("\\n", "") in sentence.text
                
                xpath = entety["value"]["start"]
                label = entety["value"]["labels"]
                
                doc["spots"].append({"xpath": xpath, "label":label, "text":text, "fonduer_sentence_id":sentence.id})


    labeled_docs[filename]=doc

In [35]:
patch = {}
for doc in labeled_docs:
    idx = labeled_docs[doc]["fonduer_doc_id"]
    patch[idx]={}
    for spot in labeled_docs[doc]["spots"]:
        if spot["label"][0] not in patch[idx]:
            patch[idx][spot["label"][0]]={}
            
        fonduer_sentence_id = str(spot["fonduer_sentence_id"])
        text = spot["text"].replace("\\n", "")
        patch[idx][spot["label"][0]][fonduer_sentence_id] = text

In [36]:
patch

{'1': {'Company': {'582': 'NetTemps'}, 'Job': {'716': 'Backend Developer'}},
 '2': {'Job': {'663': 'Java Developer'}, 'Company': {'537': 'NetTemps'}},
 '90': {'Job': {'941': 'PowerBI Analyst'}, 'Company': {'808': 'NetTemps'}},
 '100': {'Job': {'917': 'Network Engineer'}, 'Company': {'787': 'NetTemps'}},
 '8': {'Job': {'710': 'Kurier'}, 'Company': {'583': 'NetTemps'}}}

# Pipeline

In [37]:
from fonduer.candidates.models import mention_subclass

Job = mention_subclass("Job")
Company = mention_subclass("Company")

In [38]:
### Matcher

In [39]:
from fonduer.candidates.matchers import LambdaFunctionMatcher

def annotated_job(mention):
    doc_id = str(mention.sentence.document.id)
    sentence_id = str(mention.sentence.id)
    
    if doc_id in patch:
        if "Job" in patch[doc_id].keys():
            if sentence_id in patch[doc_id]["Job"].keys():
                if mention.get_span() == patch[doc_id]["Job"][sentence_id]:
                    return True
    return False

def annotated_company(mention):
    doc_id = str(mention.sentence.document.id)
    sentence_id = str(mention.sentence.id)
    
    if doc_id in patch:
        if "Company" in patch[doc_id].keys():
            if sentence_id in patch[doc_id]["Company"].keys():
                if mention.get_span() == patch[doc_id]["Company"][sentence_id]:
                    return True
    return False

job_matcher = LambdaFunctionMatcher(func=annotated_job)
company_matcher = LambdaFunctionMatcher(func=annotated_company)


In [40]:
from fonduer.candidates import MentionNgrams

job_ngrams = MentionNgrams(n_max=3, n_min=1)
company_ngrams = MentionNgrams(n_max=3, n_min=1)

In [41]:
from fonduer.candidates import MentionExtractor

mention_extractor = MentionExtractor(
    session,
    [Job, Company],
    [job_ngrams, company_ngrams],
    [job_matcher, company_matcher],
)

In [42]:
from fonduer.candidates.models import Mention

mention_extractor.apply(docs)
num_jobs = session.query(Job).count()
num_companies = session.query(Company).count()
print(
    f"Total Mentions: {session.query(Mention).count()} ({num_jobs} jobs, {num_companies} companies)"
)

[2022-04-19 10:25:40,005][INFO] fonduer.candidates.mentions:467 - Clearing table: job
[2022-04-19 10:25:40,009][INFO] fonduer.candidates.mentions:467 - Clearing table: company
[2022-04-19 10:25:40,013][INFO] fonduer.candidates.mentions:475 - Cascading to clear table: presidentname_placeofbirth
[2022-04-19 10:25:40,018][INFO] fonduer.utils.udf:67 - Running UDF...


  0%|          | 0/5 [00:00<?, ?it/s]

Total Mentions: 10 (5 jobs, 5 companies)


In [43]:
from fonduer.candidates.models import candidate_subclass

JobCompany = candidate_subclass(
    "PresidentnamePlaceofbirth", [Job, Company]
)



In [44]:
from fonduer.candidates import CandidateExtractor

candidate_extractor = CandidateExtractor(session, [JobCompany])

In [45]:
candidate_extractor.apply(docs)
print(
        f"Number of Candidates: {session.query(JobCompany).count()}"
    )


[2022-04-19 10:25:45,399][INFO] fonduer.candidates.candidates:137 - Clearing table presidentname_placeofbirth (split 0)
[2022-04-19 10:25:45,401][INFO] fonduer.utils.udf:67 - Running UDF...


  0%|          | 0/5 [00:00<?, ?it/s]

Number of Candidates: 5


In [46]:
a = session.query(JobCompany).all()

In [47]:
for c in a:
    print(c)
    print("\n")

PresidentnamePlaceofbirth(Job(SpanMention("Backend Developer", sentence=716, chars=[0,16], words=[0,1])), Company(SpanMention("NetTemps", sentence=582, chars=[0,7], words=[0,0])))


PresidentnamePlaceofbirth(Job(SpanMention("Java Developer", sentence=663, chars=[0,13], words=[0,1])), Company(SpanMention("NetTemps", sentence=537, chars=[0,7], words=[0,0])))


PresidentnamePlaceofbirth(Job(SpanMention("Kurier", sentence=710, chars=[0,5], words=[0,0])), Company(SpanMention("NetTemps", sentence=583, chars=[0,7], words=[0,0])))


PresidentnamePlaceofbirth(Job(SpanMention("Network Engineer", sentence=917, chars=[0,15], words=[0,1])), Company(SpanMention("NetTemps", sentence=787, chars=[0,7], words=[0,0])))


PresidentnamePlaceofbirth(Job(SpanMention("PowerBI Analyst", sentence=941, chars=[0,14], words=[0,1])), Company(SpanMention("NetTemps", sentence=808, chars=[0,7], words=[0,0])))


