In [67]:
from bs4 import BeautifulSoup  # type: ignore
from lxml import etree  # type: ignore
import json
import os
from fonduer.parser.models import Document, Sentence
from fonduer.candidates import MentionNgrams
from fonduer.candidates import MentionExtractor
from fonduer.candidates.models import mention_subclass
from fonduer.candidates.matchers import LambdaFunctionMatcher


In [3]:
def get_filename(label_studio_str: str):
    """Strip the id prefix and the file extension from an upload name.

    The input is expected to look like ``<id>-<name>.<ext>``. Only the part
    up to the first ``-`` and the part after the last ``.`` are removed, so
    hyphens and dots that belong to the name itself are preserved.

    Args:
        label_studio_str: Upload name, e.g. the ``file_upload`` field of an
            export entry.

    Returns:
        The name without id prefix and without its last extension. A string
        without ``-`` is treated as having no id prefix, and a name without
        ``.`` is returned unchanged.
    """
    prefix, dash, name = label_studio_str.partition("-")  # strip id
    if not dash:
        # No separator at all: there is no id to strip.
        name = prefix
    stem, dot, _extension = name.rpartition(".")
    return stem if dot else name

def get_html_tree_from_string(html_string):
    """Parse an HTML string and return it as an lxml element tree.

    The markup is first parsed with BeautifulSoup, serialised back to a
    string, and then parsed with ``lxml.etree.HTML``.

    Args:
        html_string: HTML markup of one document.

    Returns:
        The tree obtained from ``getroottree()`` on the parsed root element.
    """
    # Name the parser explicitly: without it BeautifulSoup warns and picks
    # whichever parser happens to be installed, which can change the tree
    # (and therefore the xpaths) between environments.
    soup = BeautifulSoup(html_string, "lxml")
    dom = etree.HTML(str(soup))
    return dom.getroottree()

def get_absolute_xpath(rel_xpath, dom):
    """Return the tree path of the parent of the first node matching an xpath.

    Args:
        rel_xpath: XPath expression; a ``/`` is prepended before it is
            evaluated against ``dom``.
        dom: Tree object offering ``xpath()`` and ``getpath()``.

    Returns:
        ``dom.getpath()`` of the parent of the first match.

    Raises:
        IndexError: If the expression matches nothing.
    """
    matches = dom.xpath("/" + rel_xpath)
    first_match = matches[0]
    parent = first_match.getparent()
    return dom.getpath(parent)

In [18]:

def load_ls_export(export_path, session):
    """Load a Label Studio JSON export and link it to the Fonduer database.

    Args:
        export_path: Path of the JSON export file.
        session: Database session used to look up ``Document`` and
            ``Sentence`` ids.

    Returns:
        A tuple ``(docs, relations, id_label)``:

        * ``docs`` -- exactly one dict per exported document with the keys
          ``filename``, ``fonduer_doc_id`` and ``spots``. Each spot holds
          ``xpath_abs``, ``label``, ``text``, ``ls_ID`` and
          ``fd_sentence_id`` (the plain sentence id, or ``None`` when no
          sentence of the document has that xpath).
        * ``relations`` -- ``(from_id, to_id, labels)`` tuples, one per
          result entry that carries no ``value``.
        * ``id_label`` -- maps the id of a labelled result to its label.

    Raises:
        ValueError: If no ``Document`` is named like an exported file.
    """
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(export_path, "r", encoding="utf-8") as fin:
        export = json.load(fin)

    docs = []
    relations = []
    id_label = {}
    for annotated_doc in export:
        filename = get_filename(annotated_doc["file_upload"])
        doc_row = session.query(Document.id).filter(Document.name == filename).first()
        if doc_row is None:
            # first() returns None for an empty result; fail with the name
            # instead of "'NoneType' object is not subscriptable".
            raise ValueError(
                f"No Fonduer document named {filename!r} "
                f"(from upload {annotated_doc['file_upload']!r})"
            )
        fonduer_doc_id = str(doc_row[0])
        tree = get_html_tree_from_string(annotated_doc["data"]["text"])  # recreate html tree for doc

        doc = {
            "filename": filename,
            "fonduer_doc_id": fonduer_doc_id,
            "spots": [],
        }

        for annotations in annotated_doc["annotations"]:
            for entety in annotations["result"] or []:
                value = entety.get("value")
                if not value:
                    # Entries without a value are the relation records.
                    relations.append((entety["from_id"], entety["to_id"], entety["labels"]))
                    continue

                xpath_abs = get_absolute_xpath(value["start"], tree)
                sentence_row = (
                    session.query(Sentence.id)
                    .filter(
                        Sentence.document_id == fonduer_doc_id,
                        Sentence.xpath == xpath_abs,
                    )
                    .first()
                )
                # Unwrap the one-column row so consumers get the bare id;
                # str() of the row itself would read like "(12,)".
                fd_sentence_id = sentence_row[0] if sentence_row is not None else None

                label = value["labels"][0]
                ls_ID = entety["id"]
                id_label[ls_ID] = label

                doc["spots"].append({
                    "xpath_abs": xpath_abs,
                    "label": label,
                    "text": value["text"],
                    "ls_ID": ls_ID,
                    "fd_sentence_id": fd_sentence_id,
                })

        # Append once per document, after all of its annotations were read,
        # so several annotations no longer duplicate the entry.
        docs.append(doc)
    return docs, relations, id_label

In [19]:
def resolve_relations(relations, id_label):
    """Translate relation endpoints from result ids to labels.

    Args:
        relations: Iterable of sequences whose first two items are the ids
            of the source and target result.
        id_label: Mapping from result id to label.

    Returns:
        The set of distinct ``(source_label, target_label)`` pairs. An id
        missing from ``id_label`` shows up as ``None`` in its pair.
    """
    lookup = id_label.get
    return {(lookup(rel[0]), lookup(rel[1])) for rel in relations}

In [33]:
def determine_ngram_size(export, label):
    """Build a ``MentionNgrams`` space covering the span lengths of a label.

    Args:
        export: Iterable of document dicts, each with a ``spots`` list whose
            items carry ``label`` and ``text``.
        label: Label whose annotated spans are measured.

    Returns:
        ``MentionNgrams`` with ``n_min``/``n_max`` set to the smallest and
        largest number of whitespace-separated words among the spans.

    Raises:
        ValueError: If no span in ``export`` carries ``label``.
    """
    lengths = [
        # split() without an argument treats runs of spaces, tabs and
        # newlines as one separator; never report fewer than one word.
        max(1, len((spot.get("text") or "").split()))
        for doc in export
        for spot in doc["spots"]
        if spot["label"] == label
    ]
    if not lengths:
        raise ValueError(
            f"No annotated spans with label {label!r}; cannot derive an n-gram range"
        )
    return MentionNgrams(n_max=max(lengths), n_min=min(lengths))

In [21]:
# Lookup read by the matcher functions:
#     patch[doc_id][label][sentence_id] -> annotated span text
# All three keys are strings. Only one text is kept per (label, sentence).
#
# NOTE(review): this cell reads the notebook-level name `export`, which must
# already exist when it runs. Both a plain list of document dicts and an
# object exposing that list as `.export` are accepted.
patch = {}
for doc in getattr(export, "export", export):
    idx = doc["fonduer_doc_id"]
    patch[idx] = {}
    for spot in doc["spots"]:
        sentence_ref = spot["fd_sentence_id"]
        if sentence_ref is not None and not isinstance(sentence_ref, (int, str)):
            # A query row such as (12,): take the id itself, otherwise the
            # key becomes "(12,)" and never equals str(sentence.id).
            sentence_ref = sentence_ref[0]
        if sentence_ref is None:
            # No sentence was found for this spot, so nothing can match it.
            continue

        text = spot["text"].replace("\\n", "")
        patch[idx].setdefault(spot["label"], {})[str(sentence_ref)] = text

In [80]:
class ls_export:
    """A Label Studio export together with the Fonduer objects built from it.

    ``__init__`` loads the export and stores ``export`` (list of document
    dicts), ``relations``, ``id_label``, ``session`` and ``relation_types``.
    The ``create_*``/``get_*`` methods each store their result on the
    instance as a dict keyed by label:

    * ``create_mention_subclasses()`` -> ``mention_subclasses``
    * ``get_ngram_spaces()`` -> ``ngram_spaces``
    * ``create_matching_fuctions()`` -> ``matching_fuctions``
    """

    def __init__(self, path, session):
        """Load the export at ``path`` using ``session`` for id lookups."""
        self.export, self.relations, self.id_label = load_ls_export(path, session)
        self.session = session
        self.relation_types = resolve_relations(self.relations, self.id_label)

    def create_mention_subclasses(self):
        """Create one mention class per label that occurs in a relation."""
        self.mention_subclasses = {}
        for relation in self.relation_types:
            for entety in relation:
                if entety is None:
                    # resolve_relations yields None for an endpoint whose id
                    # has no label; there is no class to create for it.
                    continue
                if entety not in self.mention_subclasses:
                    self.mention_subclasses[entety] = mention_subclass(entety)

    def get_ngram_spaces(self):
        """Create one n-gram space per mention class from the span lengths."""
        self._ensure_mention_subclasses()
        self.ngram_spaces = {}
        for mention_subclas in self.mention_subclasses:
            self.ngram_spaces[mention_subclas] = determine_ngram_size(self.export, mention_subclas)

    def create_matching_fuctions(self):
        """Create one ``LambdaFunctionMatcher`` per mention class.

        A matcher accepts a mention when its span equals a span annotated
        with that label in the same sentence of the same document.
        """
        self._ensure_mention_subclasses()
        # Built from this instance's own data rather than from a notebook
        # global, so the matchers do not depend on cell execution order.
        lookup = self._span_lookup()

        self.matching_fuctions = {}
        for mention_subclas in self.mention_subclasses:
            function = self._make_span_predicate(mention_subclas, lookup)
            self.matching_fuctions[mention_subclas] = LambdaFunctionMatcher(func=function)

    def _ensure_mention_subclasses(self):
        """Create the mention classes if that step has not been run yet."""
        if not hasattr(self, "mention_subclasses"):
            self.create_mention_subclasses()

    def _span_lookup(self):
        """Index annotated spans as ``{doc_id: {label: {sentence_id: {text}}}}``.

        Document and sentence ids are stored as strings. Texts are kept in a
        set so that several annotations of one label in one sentence all
        survive. Spots without a sentence id are left out.
        """
        lookup = {}
        for doc in self.export:
            doc_id = str(doc["fonduer_doc_id"])
            for spot in doc["spots"]:
                sentence_ref = spot["fd_sentence_id"]
                if sentence_ref is not None and not isinstance(sentence_ref, (int, str)):
                    # A query row such as (12,): use the id it contains.
                    sentence_ref = sentence_ref[0]
                if sentence_ref is None:
                    continue
                text = spot["text"].replace("\\n", "")
                by_label = lookup.setdefault(doc_id, {}).setdefault(spot["label"], {})
                by_label.setdefault(str(sentence_ref), set()).add(text)
        return lookup

    @staticmethod
    def _make_span_predicate(label, lookup):
        """Return the predicate deciding whether a mention is a ``label`` span."""
        def function_template(mention):
            doc_id = str(mention.sentence.document.id)
            sentence_id = str(mention.sentence.id)
            spans = lookup.get(doc_id, {}).get(label, {}).get(sentence_id, ())
            return mention.get_span() in spans
        return function_template



        
    # def create_mention_extractors(self):
    #     self.mention_extractors = []

        
            
            




    #         mention_extractor = MentionExtractor(
    #             session,
    #             [Job, Company],
    #             [job_ngrams, company_ngrams],
    #             [job_matcher, company_matcher],
    #         )
        

# Run

In [9]:
# Location of the Label Studio JSON export, built relative to the current
# working directory: data/export/export_1.json
base_dir = "data"
export_dir = os.path.join(base_dir, "export")
export_path = os.path.join(export_dir, "export_1.json")

In [8]:
from fonduer import Meta, init_logging

# ATTRIBUTE is appended to the connection URL as the database name.
ATTRIBUTE = "jobs_companie"
conn_string = 'postgresql://user@127.0.0.1:8001/' + ATTRIBUTE

# Configure logging for Fonduer
init_logging(log_dir="logs")

# Session that is later passed to ls_export for its Document/Sentence lookups.
session = Meta.init(conn_string).Session()

[2022-04-19 12:27:31,060][INFO] fonduer.meta:49 - Setting logging directory to: logs/2022-04-19_12-27-31
[2022-04-19 12:27:31,071][INFO] fonduer.meta:134 - Connecting user:user to 127.0.0.1:8001/jobs_companie
[2022-04-19 12:27:31,139][INFO] fonduer.meta:162 - Initializing the storage schema


In [81]:
# Load the export file and resolve its relation types; from here on `export`
# is an ls_export instance.
export = ls_export(export_path, session)

In [82]:
# Create one mention class per label that takes part in a relation.
export.create_mention_subclasses()

In [83]:
# Inspect the mention classes, keyed by label.
export.mention_subclasses

{'Company': fonduer.candidates.models.mention.Company,
 'Job': fonduer.candidates.models.mention.Job}

In [84]:
# Derive one n-gram space per mention class from the annotated span lengths.
export.get_ngram_spaces()

In [85]:
# Inspect the n-gram space created for the "Company" label.
export.ngram_spaces["Company"]

<fonduer.candidates.mentions.MentionNgrams at 0x7fbaff0bb940>

In [86]:
# Build one LambdaFunctionMatcher per mention class.
export.create_matching_fuctions()

In [87]:
# Inspect the matchers, keyed by label.
export.matching_fuctions

{'Company': <fonduer.candidates.matchers.LambdaFunctionMatcher at 0x7fbaff0bbca0>,
 'Job': <fonduer.candidates.matchers.LambdaFunctionMatcher at 0x7fbaff0bbfd0>}