diff --git a/.travis.yml b/.travis.yml
index cc9cae4..7f428a9 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -12,7 +12,7 @@ install:
 # command to run tests
 script:
   - pie-extended install-addons lasla
-  - nosetests ./tests --with-coverage --cover-package=pie_extended --cover-xml --verbose --nologcapture
+  - nosetests --with-coverage --cover-package=pie_extended --cover-xml --verbose --nologcapture --with-doctest
 after_success:
   - coverage combine
   - coveralls
\ No newline at end of file
diff --git a/README.md b/README.md
index 56fb775..e5a3526 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,12 @@ The current system provide an easier access to adding **customized**:
 - disambiguation,
 - output formatting
 
+## Install
+
+To install, simply run `pip install pie-extended`. Then, have a look at the available models.
+
+## Run in the terminal
+
 But on top of that, it provides a quick and easy way to use others models !
 For example, in a shell :
 ```bash
@@ -26,6 +32,53 @@ pie-extended tag laslsa your_file.txt
 ```
 
 will give you access to all you need !
+## Python API
+
+You can run the lemmatizer in your own scripts and retrieve token annotations as dictionaries:
+
+```python
+from typing import List
+from pie_extended.cli.sub import get_tagger, get_model, download
+
+# In case you need to download the model files
+do_download = False
+if do_download:
+    for dl in download("lasla"):
+        x = 1
+
+# model_path allows you to override the loaded model with another .tar file
+model_name = "lasla"
+tagger = get_tagger(model_name, batch_size=256, device="cpu", model_path=None)
+
+sentences: List[str] = ["Lorem ipsum dolor sit amet, consectetur adipiscing elit. "]
+# Get the main objects from the model: data iterator + postprocessor
+from pie_extended.models.lasla import get_iterator_and_processor
+for sentence_group in sentences:
+    iterator, processor = get_iterator_and_processor()
+    print(tagger.tag_str(sentence_group, iterator=iterator, processor=processor))
+```
+
+This will result in:
+
+```python
+[{'form': 'lorem', 'lemma': 'lor', 'POS': 'NOMcom', 'morph': 'Case=Acc|Numb=Sing', 'treated': 'lorem'},
+ {'form': 'ipsum', 'lemma': 'ipse', 'POS': 'PROdem', 'morph': 'Case=Acc|Numb=Sing', 'treated': 'ipsum'},
+ {'form': 'dolor', 'lemma': 'dolor', 'POS': 'NOMcom', 'morph': 'Case=Nom|Numb=Sing', 'treated': 'dolor'},
+ {'form': 'sit', 'lemma': 'sum1', 'POS': 'VER', 'morph': 'Numb=Sing|Mood=Sub|Tense=Pres|Voice=Act|Person=3',
+  'treated': 'sit'},
+ {'form': 'amet', 'lemma': 'amo', 'POS': 'VER', 'morph': 'Numb=Sing|Mood=Sub|Tense=Pres|Voice=Act|Person=3',
+  'treated': 'amet'}, {'form': ',', 'lemma': ',', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': ','},
+ {'form': 'consectetur', 'lemma': 'consector2', 'POS': 'VER',
+  'morph': 'Numb=Sing|Mood=Sub|Tense=Pres|Voice=Dep|Person=3', 'treated': 'consectetur'},
+ {'form': 'adipiscing', 'lemma': 'adipiscor', 'POS': 'VER', 'morph': 'Tense=Pres|Voice=Dep', 'treated': 'adipiscing'},
+ {'form': 'elit', 'lemma': 'elio', 'POS': 'VER', 'morph': 'Numb=Sing|Mood=Ind|Tense=Pres|Voice=Act|Person=3',
+  'treated': 'elit'}, {'form': '.', 'lemma': '.', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '.'}]
+```
+
+## Add a model
+
+ToDo: Documentation
+
 ## Warning
 
 This is an extremely early build, subject to change here and there. But it is functional !
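For readers of the new README section above: the same objects also drive whole-file tagging through `ExtensibleTagger.tag_file`, as wired up in `pie_extended/cli/sub.py` later in this diff. A minimal sketch, assuming the lasla model and its addons are already downloaded and `your_file.txt` stands in for any UTF-8 plain-text file:

```python
from pie_extended.cli.sub import get_tagger
from pie_extended.models.lasla import get_iterator_and_processor

# Assumes `pie-extended download lasla` and `pie-extended install-addons lasla`
# have already been run; "your_file.txt" is a placeholder input path.
tagger = get_tagger("lasla", batch_size=256, device="cpu", model_path=None)
iterator, processor = get_iterator_and_processor()

# Writes the annotations as tab-separated values to a file next to the input
# (see ExtensibleTagger.tag_file in this diff).
tagger.tag_file("your_file.txt", iterator=iterator, processor=processor)
```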
\ No newline at end of file diff --git a/pie_extended/cli/__init__.py b/pie_extended/cli/__init__.py index 0742dbe..b4d727c 100644 --- a/pie_extended/cli/__init__.py +++ b/pie_extended/cli/__init__.py @@ -58,7 +58,14 @@ def tag(model, filepath, allowed_failure, batch_size, device, debug, model_path) """ Tag as many [filepath] as you want with [model] """ from tqdm import tqdm click.echo(click.style("Getting the tagger", bold=True)) - tagger = sub.get_tagger(model, batch_size=batch_size, device=device, model_path=model_path) + try: + tagger = sub.get_tagger(model, batch_size=batch_size, device=device, model_path=model_path) + except FileNotFoundError as e: + click.echo("Model not found: please make sure you have downloaded the model files with " + "pie-extended download " + model) + if debug: + raise e + return failures = [] for file in tqdm(filepath): try: diff --git a/pie_extended/cli/sub.py b/pie_extended/cli/sub.py index 3f1f917..13c2e56 100644 --- a/pie_extended/cli/sub.py +++ b/pie_extended/cli/sub.py @@ -50,8 +50,8 @@ def get_tagger(model: str, batch_size: int = 16, device="cpu", model_path=None) def tag_file(model: str, tagger: ExtensibleTagger, fpath): module = get_model(model) - iterator, formatter = getattr(module, "get_iterator_and_formatter")() - tagger.tag_file(fpath, iterator=iterator, formatter_class=formatter) + iterator, processor = getattr(module, "get_iterator_and_processor")() + tagger.tag_file(fpath, iterator=iterator, processor=processor) return True diff --git a/pie_extended/models/fro/__init__.py b/pie_extended/models/fro/__init__.py index b2d3ad9..86506e2 100644 --- a/pie_extended/models/fro/__init__.py +++ b/pie_extended/models/fro/__init__.py @@ -1,5 +1,5 @@ -from ...utils import Metadata, File ,get_path -from .classes import get_iterator_and_formatter +from ...utils import Metadata, File, get_path +from .get import get_iterator_and_processor from ...pipeline.iterators.proto import DataIterator DESC = Metadata( diff --git a/pie_extended/models/fro/classes.py b/pie_extended/models/fro/classes.py deleted file mode 100644 index 99fff84..0000000 --- a/pie_extended/models/fro/classes.py +++ /dev/null @@ -1,152 +0,0 @@ -import regex as re -from typing import List -from ...pipeline.formatters.glue import GlueFormatter as SourceGlueFormatter -from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer as SourceMemorizingTokenizer -from pie_extended.pipeline.iterators.proto import DataIterator - -# Uppercase regexp -_uppercase = re.compile("^[A-ZÉÈÀÂÊÎÔÛŶÄËÏÖÜŸ]$") - -_Dots_except_apostrophe = r".?!\"“”\"«»…\[\]\(\)„“" -_Dots_collections = r"[" + _Dots_except_apostrophe + "‘’]" -_RomanNumber = r"(?:M{1,4}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})" \ - r"(?:IX|IV|V?I{0,3})|M{0,4}(?:CM|C?D|D?C{1,3})" \ - r"(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3})|M{0,4}" \ - r"(?:CM|CD|D?C{0,3})(?:XC|X?L|L?X{1,3})" \ - r"(?:IX|IV|V?I{0,3})|M{0,4}(?:CM|CD|D?C{0,3})" \ - r"(?:XC|XL|L?X{0,3})(?:IX|I?V|V?I{1,3}))" - - -class MemorizingTokenizer(SourceMemorizingTokenizer): - re_add_space_around_punct = re.compile(r"(\s*)(\.+[^\w\s\'’ʼ])(\s*)") - re_add_space_around_apostrophe_that_are_quotes = re.compile( - r"((((?<=[\W])[\'’ʼ]+(?=[\W]))|((?<=[\w])[\'’ʼ]+(?=[\W]))|((?<=[\W])[\'’ʼ]+(?=[\w]))))" - # NotLetter+Apo+NotLetter or Letter+Apo+NotLetter or NotLetter+Apo+Letter - # ?'. 
or manger'_ or _'Bonjour - ) - re_add_space_after_apostrophe = re.compile(r"(\s*)([\'’ʼ])(\s*)") - re_remove_ending_apostrophe = re.compile(r"(?<=\w)([\'’ʼ])") - _sentence_boundaries = re.compile( - r"([" + _Dots_except_apostrophe + r"]+\s*)+" - ) - roman_number_dot = re.compile(r"\.(" + _RomanNumber + r")\.") - - def __init__(self): - super(MemorizingTokenizer, self).__init__( - sentence_tokenizer=self._sentence_tokenizer, - word_tokenizer=self._word_tokenizer, - normalizer=self._normalizer - ) - self.tokens = [] - - @staticmethod - def _sentence_tokenizer_merge_matches(match): - """ Best way we found to deal with repeating groups""" - start, end = match.span() - return match.string[start:end] + "" - - @classmethod - def _real_sentence_tokenizer(cls, string: str) -> List[str]: - string = cls._sentence_boundaries.sub(cls._sentence_tokenizer_merge_matches, string) - string = string.replace("_DOT_", ".") - return string.split("") - - @staticmethod - def _word_tokenizer(data): - # ICI, il faut que tu tokenizes toi-meme avec une fonction à toi - return data.split() - - def _sentence_tokenizer(self, data): - sentences = list() - data = self.normalizer(data) - for sent in self._real_sentence_tokenizer(data): - sent = sent.strip() - if sent: - sentences.append(sent) - yield from sentences - - def _replacer(self, inp: str): - out = self.re_remove_ending_apostrophe.sub("", inp) - return out - - def _normalizer(self, data: str): - data = self.re_remove_ending_apostrophe.sub( - r"\g<1> ", - self.re_add_space_around_apostrophe_that_are_quotes.sub( - r" \g<2> ", - self.re_add_space_around_punct.sub( - r" \g<2> ", - self.roman_number_dot.sub( - r"_DOT_\g<1>_DOT_", - data - ) - ) - ) - ) - return data - - -class GlueFormatter(SourceGlueFormatter): - HEADERS = ["form", "lemma", "POS", "morph", "treated_token"] - MORPH_PART = ["MODE", "TEMPS", "PERS.", "NOMB.", "GENRE", "CAS", "DEGRE"] - - PONCTU = re.compile(r"^\W+$") - NUMBER = re.compile(r"\d+") - PONFORT = [".", "...", "!", "?"] - - def __init__(self, tokenizer_memory: MemorizingTokenizer): - super(GlueFormatter, self).__init__(tokenizer_memory=tokenizer_memory) - - def rule_based(cls, token): - if cls.PONCTU.match(token): - lemma = token - if token in GlueFormatter.PONFORT: - pos = "PONfrt" - else: - pos = "PONfbl" - return [token, lemma, pos, "MORPH=empty", token] - - def format_line(self, token, tags, ignored=False): - tags = list(tags) - lemma = tags[self.tasks.index("lemma")] - index, input_token, out_token = self.tokenizer_memory.tokens.pop(0) - - if token != out_token: - raise Exception("The output token does not match our inputs %s : %s" % (token, out_token)) - - overwriten = self.rule_based(out_token) - - if overwriten: - return overwriten - - if type(self).NUMBER.match(token): # This would push for sending the whole elements to rule_based and - # not the token only - lemma = token - tags[self.tasks.index(self.pos_tag)] = "ADJcar" - - return [ - input_token, - lemma, - tags[self.tasks.index(self.pos_tag)], - "|".join( - "{cat}={tag}".format( - cat=morph_part, - tag=tags[self.tasks.index(morph_part.replace(".", ""))] - ) - for morph_part in GlueFormatter.MORPH_PART - if morph_part.replace(".", "") in self.tasks and - tags[self.tasks.index(morph_part.replace(".", ""))] != "_" - ) or "MORPH=empty", - out_token - ] - - -def get_iterator_and_formatter(): - tokenizer = MemorizingTokenizer() - formatter = GlueFormatter(tokenizer) - iterator = DataIterator( - tokenizer=tokenizer, - remove_from_input=DataIterator.remove_punctuation - ) - return 
iterator, formatter - diff --git a/pie_extended/models/fro/get.py b/pie_extended/models/fro/get.py new file mode 100644 index 0000000..4bd2a43 --- /dev/null +++ b/pie_extended/models/fro/get.py @@ -0,0 +1,21 @@ +from .processor import FroRulesProcessor, FroGlueProcessor +from .tokenizer import FroMemorizingTokenizer +from pie_extended.pipeline.iterators.proto import DataIterator +from pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor + + +def get_iterator_and_processor(): + tokenizer = FroMemorizingTokenizer() + processor = FroRulesProcessor( + apply_on_reinsert=True, + head_processor=MemoryzingProcessor( + tokenizer_memory=tokenizer, + head_processor=FroGlueProcessor() + ) + ) + iterator = DataIterator( + tokenizer=tokenizer, + remove_from_input=DataIterator.remove_punctuation + ) + return iterator, processor + diff --git a/pie_extended/models/fro/processor.py b/pie_extended/models/fro/processor.py new file mode 100644 index 0000000..21e64ba --- /dev/null +++ b/pie_extended/models/fro/processor.py @@ -0,0 +1,43 @@ +import regex as re +from typing import Dict + +from pie_extended.pipeline.postprocessor.glue import GlueProcessor +from pie_extended.pipeline.postprocessor.rulebased import RuleBasedProcessor + + +class FroRulesProcessor(RuleBasedProcessor): + """ Fro Dataset has not all punctuation signs in it, we remove it and posttag it automatically + + """ + PONCTU = re.compile(r"^\W+$") + NUMBER = re.compile(r"\d+") + PONFORT = [".", "...", "!", "?"] + + def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: + token = annotation["form"] + if self.PONCTU.match(token): + if token in self.PONFORT: + pos = "PONfrt" + else: + pos = "PONfbl" + return {"form": token, "lemma": token, "POS": pos, "morph": "MORPH=empty", "treated": token} + elif self.NUMBER.match(token): + annotation["pos"] = "ADJcar" + return annotation + + def __init__(self, *args, **kwargs): + super(FroRulesProcessor, self).__init__(*args, **kwargs) + + +class FroGlueProcessor(GlueProcessor): + """ We glue morphological features into one column + + """ + OUTPUT_KEYS = ["form", "lemma", "POS", "morph"] + GLUE = {"morph": ["MODE", "TEMPS", "PERS.", "NOMB.", "GENRE", "CAS", "DEGRE"]} + MAP = {"pos": "POS", "NOMB": "NOMB.", "PERS": "PERS."} + EMPTY_TAG: Dict[str, str] = {"CAS": "_", "NOMB.": "_", "DEGRE": "_", "MODE": "_", "TEMPS": "_", "GENRE": "_", + "PERS.": "_"} + + def __init__(self, *args, **kwargs): + super(FroGlueProcessor, self).__init__(*args, **kwargs) diff --git a/pie_extended/models/fro/tokenizer.py b/pie_extended/models/fro/tokenizer.py new file mode 100644 index 0000000..bac16b6 --- /dev/null +++ b/pie_extended/models/fro/tokenizer.py @@ -0,0 +1,86 @@ +import regex as re +from typing import List, Generator + +from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer + +_Dots_except_apostrophe = r".?!\"“”\"«»…\[\]\(\)„“" +_Dots_collections = r"[" + _Dots_except_apostrophe + "‘’]" +_RomanNumber = r"(?:M{1,4}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})" \ + r"(?:IX|IV|V?I{0,3})|M{0,4}(?:CM|C?D|D?C{1,3})" \ + r"(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3})|M{0,4}" \ + r"(?:CM|CD|D?C{0,3})(?:XC|X?L|L?X{1,3})" \ + r"(?:IX|IV|V?I{0,3})|M{0,4}(?:CM|CD|D?C{0,3})" \ + r"(?:XC|XL|L?X{0,3})(?:IX|I?V|V?I{1,3}))" + + +class FroMemorizingTokenizer(MemorizingTokenizer): + APOSTROPHES = "'’ʼ" + re_elision_apostrophe = re.compile(r"(\w+)([" + APOSTROPHES + r"])(\w+)") + re_add_space_around_punct = re.compile(r"(\s*)([^\w\s])(\s*)") + re_add_space_around_apostrophe_that_are_quotes = re.compile( + r"(" + 
r"(((?<=[\W])[\'’ʼ]+(?=[\W]))|" + r"((?<=[\w])[\'’ʼ]+(?=[\W]))|" + r"((?<=[\W])[\'’ʼ]+(?=[\w])))|" + r"(^[\'’ʼ]+)|" + r"([\'’ʼ]+$))" + # NotLetter+Apo+NotLetter or Letter+Apo+NotLetter or NotLetter+Apo+Letter + Starting or ending apostrophe + # ?'. or manger'_ or _'Bonjour + ) + re_add_space_after_apostrophe = re.compile(r"(\s*)([\'’ʼ])(\s*)") + re_remove_ending_apostrophe = re.compile(r"(?<=\w)([\'’ʼ])") + _sentence_boundaries = re.compile( + r"([" + _Dots_except_apostrophe + r"]+\s*)+" + ) + roman_number_dot = re.compile(r"\.(" + _RomanNumber + r")\.") + + def __init__(self): + super(FroMemorizingTokenizer, self).__init__() + self.tokens = [] + + @staticmethod + def _sentence_tokenizer_merge_matches(match): + """ Best way we found to deal with repeating groups""" + start, end = match.span() + return match.string[start:end] + "" + + def _real_sentence_tokenizer(self, string: str) -> List[str]: + string = self._sentence_boundaries.sub(self._sentence_tokenizer_merge_matches, string) + string = string.replace("_DOT_", ".") + for index_apo, apo in enumerate(self.APOSTROPHES): + string = string.replace("ApOsTrOpHe"+str(index_apo), apo+" ") + return string.split("") + + def _real_word_tokenizer(self, text: str, lower: bool = False) -> List[str]: + if lower: + text = text.lower() + text = text.split() + return text + + def sentence_tokenizer(self, text: str, lower: bool = False) -> Generator[List[str], None, None]: + sentences = list() + data = self.normalizer(text) + for sent in self._real_sentence_tokenizer(data): + sent = sent.strip() + if sent: + sentences.append(self.word_tokenizer(sent)) + yield from sentences + + def apostrophe_replace(self, regex_match) -> str: + return regex_match.group(1) + "ApOsTrOpHe"+ str(self.APOSTROPHES.index(regex_match.group(2))) + regex_match.group(3) + + def normalizer(self, data: str) -> str: + data = self.re_add_space_around_punct.sub( + r" \g<2> ", + self.re_elision_apostrophe.sub( + self.apostrophe_replace, + self.roman_number_dot.sub( + r"_DOT_\g<1>_DOT_", + data + ) + ) + ) + return data + + def replacer(self, inp: str): + return self.re_remove_ending_apostrophe.sub("", inp) diff --git a/pie_extended/models/lasla/__init__.py b/pie_extended/models/lasla/__init__.py index 05f68de..512e9f5 100644 --- a/pie_extended/models/lasla/__init__.py +++ b/pie_extended/models/lasla/__init__.py @@ -1,2 +1,2 @@ from pie_extended.models.lasla.consts import DOWNLOADS, Models, Disambiguator, addons, DESC -from pie_extended.models.lasla.classes import get_iterator_and_formatter +from pie_extended.models.lasla.get import get_iterator_and_processor diff --git a/pie_extended/models/lasla/classes.py b/pie_extended/models/lasla/classes.py deleted file mode 100644 index 7276346..0000000 --- a/pie_extended/models/lasla/classes.py +++ /dev/null @@ -1,106 +0,0 @@ -import sys -import regex as re -import click - -try: - import cltk - from cltk.tokenize.word import WordTokenizer -except ImportError as E: - click.echo(click.style("You need to install cltk and its Latin Data to runs this package", fg="red")) - click.echo("pip install cltk") - click.echo("pie-ext install-addons lasla") - sys.exit(0) - - -from pie_extended.pipeline.iterators.proto import DataIterator -from pie_extended.pipeline.formatters.glue import GlueFormatter as SourceGlueFormatter -from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer as SourceMemorizingTokenizer - - -# Uppercase regexp -uppercase = re.compile(r"^[A-Z]$") - - -class MemorizingTokenizer(SourceMemorizingTokenizer): - - 
re_add_space_around_punct = re.compile(r"(\s*)([^\w\s\.])(\s*)") - re_normalize_space = re.compile(r"(\s+)") - re_sentence_tokenizer = re.compile(r"([_||[^\s\w]]+(?:[\s_||[\W]]+)?)", re.VERSION1) - - def __init__(self): - self.tokens = [ - ] - - self._word_tokenizer = WordTokenizer("latin") - - def word_tokenizer(self, data): - return self._word_tokenizer.tokenize(data) - - def sentence_tokenizer(self, data): - sentences = list() - first_is_dot = False - started_writting = False # Allows for avoiding to compute length - for sent in MemorizingTokenizer.re_sentence_tokenizer.split(data): - sent = sent.strip() - if sent: - if MemorizingTokenizer.re_sentence_tokenizer.match(sent): - if not started_writting: - sentences.append(sent) - first_is_dot = True - else: - sentences[-1] += " " + sent - else: - if first_is_dot: - sentences[-1] += " " + sent - first_is_dot = False - else: - sentences.append(sent) - - if not started_writting and len(sentences): - started_writting = True - - yield from sentences - - def replacer(self, inp: str): - inp = inp.replace("U", "V").replace("v", "u").replace("J", "I").replace("j", "i").lower() - return inp - - def normalizer(self, data: str): - # Fix regarding the current issue of apostrophe - # https://github.com/cltk/cltk/issues/925#issuecomment-522065530 - # On the other hand, it creates empty tokens... - data = MemorizingTokenizer.re_add_space_around_punct.sub(" \g<2> ", data) - data = MemorizingTokenizer.re_normalize_space.sub(" ", data) - return data - - -class GlueFormatter(SourceGlueFormatter): - HEADERS = ["form", "lemma", "POS", "morph", "treated_token"] - MORPH_PART = ["Case", "Numb", "Deg", "Mood", "Tense", "Voice", "Person"] - PONCTU = re.compile(r"^\W+$") - - def __init__(self, tokenizer_memory): - super(GlueFormatter, self).__init__([]) - self.tokenizer_memory = tokenizer_memory - - def rule_based(cls, token): - if cls.PONCTU.match(token): - return [token, token, "PUNC", "MORPH=empty", token] - elif token.startswith("-"): - if token == "-ne": - lemma = "ne2" - else: - lemma = token[1:] - return [token, lemma, "CONcoo", "MORPH=empty", token] - - return None - - -def get_iterator_and_formatter(): - tokenizer = MemorizingTokenizer() - formatter = GlueFormatter(tokenizer) - iterator = DataIterator( - tokenizer=tokenizer, - remove_from_input=DataIterator.remove_punctuation - ) - return iterator, formatter diff --git a/pie_extended/models/lasla/get.py b/pie_extended/models/lasla/get.py new file mode 100644 index 0000000..3c6e582 --- /dev/null +++ b/pie_extended/models/lasla/get.py @@ -0,0 +1,25 @@ +import regex as re + +from pie_extended.models.lasla.processor import LatinRulesProcessor, LatinGlueProcessor +from pie_extended.models.lasla.tokenizer import LatMemorizingTokenizer +from pie_extended.pipeline.iterators.proto import DataIterator +from pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor + +# Uppercase regexp +uppercase = re.compile(r"^[A-Z]$") + + +def get_iterator_and_processor(): + tokenizer = LatMemorizingTokenizer() + processor = LatinRulesProcessor( + apply_on_reinsert=True, + head_processor=MemoryzingProcessor( + tokenizer_memory=tokenizer, + head_processor=LatinGlueProcessor() + ) + ) + iterator = DataIterator( + tokenizer=tokenizer, + remove_from_input=DataIterator.remove_punctuation + ) + return iterator, processor diff --git a/pie_extended/models/lasla/processor.py b/pie_extended/models/lasla/processor.py new file mode 100644 index 0000000..8af49bd --- /dev/null +++ b/pie_extended/models/lasla/processor.py @@ -0,0 
+1,39 @@ +import regex as re +from typing import Dict + +from pie_extended.pipeline.postprocessor.glue import GlueProcessor +from pie_extended.pipeline.postprocessor.rulebased import RuleBasedProcessor + + +class LatinRulesProcessor(RuleBasedProcessor): + """ Lasla data has no punctuation, we tag it automatically. + + "ne" token can be two different lemma, but I don't remember why I wrote this part. (ne/nec ?) + + """ + PONCTU = re.compile(r"^\W+$") + + def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: + # If Else condition + token = annotation["form"] + if self.PONCTU.match(token): + return {"form": token, "lemma": token, "pos": "PUNC", "morph": "MORPH=empty", "treated": token} + elif token.startswith("-"): + if token == "-ne": + annotation["lemma"] = "ne2" + else: + annotation["lemma"] = "ne" + return annotation + + def __init__(self, *args, **kwargs): + super(LatinRulesProcessor, self).__init__(*args, **kwargs) + + +class LatinGlueProcessor(GlueProcessor): + OUTPUT_KEYS = ["form", "lemma", "POS", "morph"] + GLUE = {"morph": ["Case", "Numb", "Deg", "Mood", "Tense", "Voice", "Person"]} + WHEN_EMPTY = {"morph": "MORPH=empty"} + MAP = {"pos": "POS"} + + def __init__(self, *args, **kwargs): + super(LatinGlueProcessor, self).__init__(*args, **kwargs) \ No newline at end of file diff --git a/pie_extended/models/lasla/tokenizer.py b/pie_extended/models/lasla/tokenizer.py new file mode 100644 index 0000000..14f6d29 --- /dev/null +++ b/pie_extended/models/lasla/tokenizer.py @@ -0,0 +1,70 @@ +import regex as re +import click +import sys +from typing import List, Generator + +from pie_extended.models.fro.tokenizer import _Dots_except_apostrophe, _RomanNumber +from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer + +try: + import cltk + from cltk.tokenize.word import WordTokenizer +except ImportError as E: + click.echo(click.style("You need to install cltk and its Latin Data to runs this package", fg="red")) + click.echo("pip install cltk") + click.echo("pie-extended install-addons lasla") + sys.exit(0) + + +class LatMemorizingTokenizer(MemorizingTokenizer): + re_add_space_around_punct = re.compile(r"(\s*)([^\w\s])(\s*)") + _sentence_boundaries = re.compile( + r"([" + _Dots_except_apostrophe + r"]+\s*)+" + ) + roman_number_dot = re.compile(r"\.(" + _RomanNumber + r")\.") + + def __init__(self): + super(LatMemorizingTokenizer, self).__init__() + self.tokens = [] + self._word_tokenizer = WordTokenizer("latin") + + @staticmethod + def _sentence_tokenizer_merge_matches(match): + """ Best way we found to deal with repeating groups""" + start, end = match.span() + return match.string[start:end] + "" + + @classmethod + def _real_sentence_tokenizer(cls, string: str) -> List[str]: + string = cls._sentence_boundaries.sub(cls._sentence_tokenizer_merge_matches, string) + string = string.replace("_DOT_", ".") + return string.split("") + + def _real_word_tokenizer(self, text: str, lower: bool = False) -> List[str]: + tokenized = [tok for tok in self._word_tokenizer.tokenize(text) if tok] + if tokenized: + tokenized = [tok.lower() for tok in tokenized] + return tokenized + + def sentence_tokenizer(self, text: str, lower: bool = False) -> Generator[List[str], None, None]: + sentences = list() + data = self.normalizer(text) + for sent in self._real_sentence_tokenizer(data): + sent = sent.strip() + if sent: + sentences.append(self.word_tokenizer(sent)) + yield from sentences + + def normalizer(self, data: str) -> str: + data = self.re_add_space_around_punct.sub( + r" \g<2> ", + 
self.roman_number_dot.sub( + r"_DOT_\g<1>_DOT_", + data + ) + ) + return data + + def replacer(self, inp: str): + inp = inp.replace("V", "U").replace("v", "u").replace("J", "I").replace("j", "i") + return inp diff --git a/pie_extended/pipeline/formatters/glue.py b/pie_extended/pipeline/formatters/glue.py deleted file mode 100644 index b1024dd..0000000 --- a/pie_extended/pipeline/formatters/glue.py +++ /dev/null @@ -1,60 +0,0 @@ -import regex as re -from .proto import Formatter - - -class GlueFormatter(Formatter): - """ Need replacing of morph_part for specific corpora - - """ - - HEADERS = ["form", "lemma", "POS", "morph", "treated_token"] - MORPH_PART = ["Case", "Numb", "Deg", "Mood", "Tense", "Voice", "Person"] - PONCTU = re.compile(r"^\W+$") - - def __init__(self, tokenizer_memory): - super(GlueFormatter, self).__init__([]) - self.tokenizer_memory = tokenizer_memory - - def __call__(self, tasks): - super(GlueFormatter, self).__init__(tasks) - self.pos_tag = "POS" - if "POS" not in self.tasks and "pos" in self.tasks: - self.pos_tag = "pos" - return self - - @classmethod - def get_headers(cls): - return cls.HEADERS - - def rule_based(cls, token): - if cls.PONCTU.match(token): - return [token, token, "PUNC", "MORPH=empty", token] - - return None - - def format_line(self, token, tags, ignored=False): - tags = list(tags) - lemma = tags[self.tasks.index("lemma")] - index, input_token, out_token = self.tokenizer_memory.tokens.pop(0) - if token != out_token: - raise Exception("The output token does not match our inputs %s : %s" % (token, out_token)) - - overwriten = self.rule_based(token) - if overwriten: - return overwriten - - return [ - input_token, - lemma, - tags[self.tasks.index(self.pos_tag)], - "|".join( - "{cat}={tag}".format( - cat=morph_part, - tag=tags[self.tasks.index(morph_part)] - ) - for morph_part in type(self).MORPH_PART - if morph_part in self.tasks and - tags[self.tasks.index(morph_part)] != "_" - ) or "MORPH=empty", - out_token - ] diff --git a/pie_extended/pipeline/formatters/proto.py b/pie_extended/pipeline/formatters/proto.py index f4b2c2d..ac69acc 100644 --- a/pie_extended/pipeline/formatters/proto.py +++ b/pie_extended/pipeline/formatters/proto.py @@ -1,13 +1,30 @@ -from typing import List, Iterable +from typing import List, Iterable, Callable, Dict +import sys class Formatter: # Default is TSV + """ The CSV formatter necessarily starts with form in its header. + + """ + format_line: Callable[[Dict[str, str]], List[str]] + def __init__(self, tasks: List[str]): self.tasks: List[str] = tasks - def format_line(self, token: str, tags: Iterable[str], ignored=False) -> List[str]: - """ Format the tags""" - return [token] + list(tags) + if sys.version_info.minor <= 6: + # Before 3.7, order of dictionary is not guaranteed + # Cf. 
https://mail.python.org/pipermail/python-dev/2017-December/151283.html + self.format_line = self.format_line_3_6 + else: + self.format_line = self.format_line_3_7 + + def format_line_3_6(self, annotation: Dict[str, str]) -> List[str]: + """ Format the tags """ + return [annotation["form"]] + [annotation[task] for task in self.tasks] + + def format_line_3_7(self, annotation: Dict[str, str]) -> List[str]: + """ Format the tags """ + return list(annotation.values()) def write_line(self, formatted): return "\t".join(formatted) + "\r\n" diff --git a/pie_extended/pipeline/iterators/proto.py b/pie_extended/pipeline/iterators/proto.py index 8229ec5..89d0bae 100644 --- a/pie_extended/pipeline/iterators/proto.py +++ b/pie_extended/pipeline/iterators/proto.py @@ -1,23 +1,22 @@ import regex as re -import string from pie.tagger import simple_tokenizer from typing import Callable, List, Tuple, Dict, Union, Iterable -from ...pipeline.tokenizers.classes import Tokenizer from ...utils import ObjectCreator +from ..tokenizers.simple_tokenizer import SimpleTokenizer Remover = Callable[[List[str]], Tuple[List[str], Dict[int, str]]] PUNKT = re.compile(r"^[_||[^\s\w]]+$", re.VERSION1) class DataIterator: - def __init__(self, tokenizer: Union[ObjectCreator, Tokenizer] = None, remove_from_input: Callable = None): + def __init__(self, tokenizer: SimpleTokenizer = None, remove_from_input: Callable = None): """ Iterator used to parse the text and returns bits to tag :param tokenizer: Tokenizer """ - self.tokenizer = tokenizer or simple_tokenizer + self.tokenizer: SimpleTokenizer = tokenizer or SimpleTokenizer() self.remove_from_input = remove_from_input if self.remove_from_input is None: self.remove_from_input = lambda x: (x, {}) @@ -41,12 +40,6 @@ def remove_punctuation(sentence: List[str]) -> Tuple[List[str], Dict[int, str]]: clean.append(token) return clean, removed - def get_tokenizer(self) -> Tokenizer: - """ Get the tokenizer if it needs to be created""" - if isinstance(self.tokenizer, ObjectCreator): - return self.tokenizer.create() - return self.tokenizer - def get_remover(self) -> Remover: if isinstance(self.remove_from_input, ObjectCreator): return self.remove_from_input.create() @@ -60,8 +53,7 @@ def __call__(self, data: str, lower: bool = False) -> Iterable[Tuple[List[str], :param lower: Whether or not to lower the text :yields: (Sentence as a list of word, Size of the sentence, Elements removed from the sentence) """ - tokenizer = self.get_tokenizer() remover = self.get_remover() - for sentence in tokenizer(data, lower=lower): + for sentence in self.tokenizer.sentence_tokenizer(data, lower=lower): clean_sentence, removed_from_input = remover(sentence) yield clean_sentence, len(clean_sentence), removed_from_input diff --git a/pie_extended/pipeline/postprocessor/__init__.py b/pie_extended/pipeline/postprocessor/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pie_extended/pipeline/postprocessor/disambiguator.py b/pie_extended/pipeline/postprocessor/disambiguator.py new file mode 100644 index 0000000..79d0895 --- /dev/null +++ b/pie_extended/pipeline/postprocessor/disambiguator.py @@ -0,0 +1,22 @@ +from ..disambiguators.proto import Disambiguator +from .proto import ProcessorPrototype, ChainedProcessor +from typing import Optional, Dict, List + + +# Right now disambiguation is applied at the sentence level. Question is should we ? 
+# Keeping that here for the moment + +class DisambiguatorProcessor(ChainedProcessor): + """ Applies rules found in rules(token_annotation) + + """ + + def __init__(self, disambiguator: Disambiguator, head_processor: Optional[ProcessorPrototype], **kwargs): + super(DisambiguatorProcessor, self).__init__(head_processor=head_processor, **kwargs) + self.disambiguator: Disambiguator = disambiguator + + def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: + return annotation + + def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: + return self.rules(self.head_processor.get_dict(token, tags)) \ No newline at end of file diff --git a/pie_extended/pipeline/postprocessor/glue.py b/pie_extended/pipeline/postprocessor/glue.py new file mode 100644 index 0000000..76cbf66 --- /dev/null +++ b/pie_extended/pipeline/postprocessor/glue.py @@ -0,0 +1,76 @@ +from pie_extended.pipeline.postprocessor.proto import ProcessorPrototype, RenamedTaskProcessor +from typing import Generator, Dict, List + + +class GlueProcessor(RenamedTaskProcessor): + """ Glues together specific tasks + + >>> class SimpleGlue(GlueProcessor): + ... OUTPUT_KEYS = ["form", "lemma", "task3"] + ... GLUE = {"task3": ["1", "2"]} # Merges Task `1` output and task `2` output in `task3` + ... EMPTY_TAG = {"1": "_", "2": "_"} # If _ is tagged in task `1`, it's the same as an empty tag + ... GLUE_EMPTY = {"task3": "NO-DATA"} # When all merged data are empty, default value + >>> x = SimpleGlue() + >>> x.set_tasks(["lemma", "1", "2"]) + >>> # Merges b and c values from task 1 and 2 into a new task + >>> x.get_dict("a", ["a", "b", "c"]) == {"form": "a", "lemma": "a", "task3": "1=b|2=c"} + True + >>> # Keeps only one task because 2 is empty + >>> x.get_dict("a", ["a", "b", "_"]) == {"form": "a", "lemma": "a", "task3": "1=b"} + True + >>> # Fills with the default empty tag because both task 1 and 2 were empty + >>> x.get_dict("a", ["a", "_", "_"]) == {"form": "a", "lemma": "a", "task3": "NO-DATA"} + True + + """ + + # Output keys are keys that are given in the end + OUTPUT_KEYS: List[str] = ["form", "lemma", "POS", "morph"] + # Glue dicts contains tasks that should merge together subtasks + GLUE: Dict[str, List[str]] = {"morph": ["Case", "Numb", "Deg", "Mood", "Tense", "Voice", "Person"]} + # Glue_char is what is used to glue things together -> Tense=Pres|Person=1 + GLUE_CHAR: str = "|" + # Glue Empty are value to take when all things glued together are empty + GLUE_EMPTY: Dict[str, str] = {"morph": "MORPH=empty"} + # Value that means the current element is empty + EMPTY_TAG: Dict[str, str] = {"Case": "_", "Numb": "_", "Deg": "_", "Mood": "_", "Tense": "_", "Voice": "_", + "Person": "_"} + + def __init__(self, *args, **kwargs): + super(GlueProcessor, self).__init__(*args, **kwargs) + + # Sets-up some copy of the values + self._out = self.OUTPUT_KEYS + self._glue = self.GLUE + self._glue_char = self.GLUE_CHAR + self._glue_empty = self.GLUE_EMPTY + self._empty_tags = self.EMPTY_TAG + + def set_tasks(self, tasks): + super(GlueProcessor, self).set_tasks(tasks) + + def _yield_annotation( + self, + token_dict: Dict[str, str] + ) -> Generator[str, None, None]: + # For each key we should return + for head in self._out: + if head not in self._glue: + yield head, token_dict[head] + else: + # Otherwise, we glue together things that should be glued together + joined = self._glue_char.join([ + glued_task + "=" + token_dict[glued_task] + for glued_task in self._glue[head] + if token_dict[glued_task] != self._empty_tags.get(glued_task, -1) + 
]) + if not joined: + joined = self._glue_empty[head] + yield head, joined + + def reinsert(self, form: str) -> Dict[str, str]: + return dict(form=form, **{key: self.empty_value for key in self._out if key != "form"}) + + def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: + as_dict = super(GlueProcessor, self).get_dict(token, tags) + return dict(self._yield_annotation(as_dict)) diff --git a/pie_extended/pipeline/postprocessor/memory.py b/pie_extended/pipeline/postprocessor/memory.py new file mode 100644 index 0000000..618970e --- /dev/null +++ b/pie_extended/pipeline/postprocessor/memory.py @@ -0,0 +1,57 @@ +from pie_extended.pipeline.postprocessor.proto import ProcessorPrototype, ChainedProcessor +from typing import Optional, Dict, List +if "typing" == "nottyping": + from ..tokenizers.memorizing import MemorizingTokenizer + + +class MemoryzingProcessor(ChainedProcessor): + """ MemoryzingProcessor proposes to keep track of changes operated on input string + by reinserting the original data alongside a new task (KEY) where we output + the input seen by the Model + + It reuses the memory from a class derived from MemorizingTokenizer so that it reintroduced + the original input into the token. + + >>> from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer + >>> tokenizer = MemorizingTokenizer() + >>> # Fake token memory : (Index, Original Input, Input seen by Tagger) + >>> tokenizer.tokens = [(0, "A", "a"), (0, "b", "b"), (0, "q'", "q")] + >>> processor = MemoryzingProcessor(tokenizer_memory=tokenizer, head_processor=ProcessorPrototype()) + >>> processor.set_tasks(["lem"]) + >>> # Lowercase a was taken in the input but uppercase a is returned in form. For transparency, input seen + >>> # By the tagger is returned in a new column, treated (cf. MemorizingProcessor.KEY) + >>> processor.get_dict("a", ["lemma"]) == {"form": "A", "treated": "a", "lem": "lemma"} + True + >>> # Some would have the same treated and input + >>> processor.get_dict("b", ["lemma"]) == {"form": "b", "treated": "b", "lem": "lemma"} + True + >>> # Some differ with more characters + >>> processor.get_dict("q", ["lemma"]) == {"form": "q'", "treated": "q", "lem": "lemma"} + True + + This allows for easier output alignment as well as removing unknown characters to the model. 
If your lemmatizer + in training has never seen the "@" character, you can remove it at tokenization time and reinsert it with + MemoryzingProcessor + + """ + KEY: str = "treated" + + def __init__(self, tokenizer_memory: "MemorizingTokenizer", head_processor: Optional[ProcessorPrototype], **kwargs): + super(MemoryzingProcessor, self).__init__(head_processor=head_processor, **kwargs) + self.memory: "MemorizingTokenizer" = tokenizer_memory + self._key: str = type(self).KEY + + def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: + # First we get the dictionary + token_dict = self.head_processor.get_dict(token, tags) + index, input_token, out_token = self.memory.tokens.pop(0) + if token != out_token: + raise Exception("The output token does not match our inputs %s : %s" % (token, out_token)) + + token_dict[self._key] = out_token + token_dict["form"] = input_token + return token_dict + + def reinsert(self, form: str) -> Dict[str, str]: + self.memory.tokens.pop(0) + return super(MemoryzingProcessor, self).reinsert(form) diff --git a/pie_extended/pipeline/postprocessor/proto.py b/pie_extended/pipeline/postprocessor/proto.py new file mode 100644 index 0000000..81dbcb3 --- /dev/null +++ b/pie_extended/pipeline/postprocessor/proto.py @@ -0,0 +1,148 @@ +from typing import List, Dict, Optional, Type + +DEFAULT_EMPTY = "_" + + +class ProcessorPrototype: + tasks: List[str] + empty_value: str + + def __init__(self, empty_value: Optional[str] = None): + """ Applies postprocessing. Simplest Processor one could use. + + :param empty_value: Value to use to fill tasks that would not get any data + + + >>> x = ProcessorPrototype(empty_value="%") + >>> x.set_tasks(["a", "b"]) + >>> x.reinsert("x") == {"form": "x", "a": "%", "b": "%"} + True + >>> x.get_dict("y", ["1", "2"]) == {"form": "y", "a": "1", "b": "2"} + True + """ + self.tasks = [] + self.empty_value = empty_value or DEFAULT_EMPTY + + def set_tasks(self, tasks): + self.tasks = tasks + + def postprocess(self, line): + pass + + def reinsert(self, form: str) -> Dict[str, str]: + """ Generates an automatic line for a token that was removed from lemmatization + + :param form: Token to reinsert + :return: Dictionary representation of the token, as an annotation + + + >>> x = ProcessorPrototype(empty_value="%") + >>> x.set_tasks(["a", "b"]) + >>> x.reinsert("x") == {"form": "x", "a": "%", "b": "%"} + True + """ + return dict(form=form, **{task: self.empty_value for task in self.tasks}) + + def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: + """ Get the dictionary representation of a token annotation + + :param token: Token used as input for pie + :param tags: List of tags generated + :return: Dictionary representation of the token and its annotations + + >>> x = ProcessorPrototype(empty_value="%") + >>> x.set_tasks(["a", "b"]) + >>> x.get_dict("y", ["1", "2"]) == {"form": "y", "a": "1", "b": "2"} + True + """ + return {"form": token, **{k: val for k, val in zip(self.tasks, tags)}} + + def reset(self): + """ Functions that should be run in between documents + + >>> x = ProcessorPrototype(empty_value="%") + >>> x.set_tasks(["a", "b"]) + >>> x.reset() + """ + pass + + +class RenamedTaskProcessor(ProcessorPrototype): + MAP: Dict[str, str] = {} + + def __init__(self, **kwargs): + """ This Processor is used for renaming tasks (Pie for example refuses tasks containing dots) + + >>> class ExampleRemaped(RenamedTaskProcessor): + ... 
MAP = {"task_name_1": "renamed"} + >>> x = ExampleRemaped() + >>> x.set_tasks(["task_name_1", "y"]) + >>> x.get_dict("token", ["a", "b"]) == {"form": "token", "renamed": "a", "y": "b"} + True + """ + super(RenamedTaskProcessor, self).__init__(**kwargs) + self._map: Dict[str, str] = type(self).MAP + + def set_tasks(self, tasks): + self.tasks = [self._map.get(task, task) for task in tasks] + + +class ChainedProcessor(ProcessorPrototype): + """ Allows for easy chaining ! + + The ChainedProcessor is basically using its headprocessor in the background and checking it's output to some extent + + The prototype of ChainedProcessor using Processor Prototype would have the same results because + chained processor is not doing anything new except enabling chaining + + >>> x = ProcessorPrototype(empty_value="%") + >>> x.set_tasks(["a", "b"]) + >>> y = ChainedProcessor(x) + >>> y.set_tasks(["a", "b"]) + >>> x.reinsert("x") == y.reinsert("x") + True + >>> x.get_dict("y", ["1", "2"]) == y.get_dict("y", ["1", "2"]) + True + + You can subclass it to modify the output of the preceding processor : + + >>> class ExampleChained(ChainedProcessor): + ... def reinsert(self, form: str) -> Dict[str, str]: + ... annotation = self.head_processor.reinsert(form) + ... annotation["col3"] = "x" + ... return annotation + ... + ... def get_dict(self, form: str, tags: List[str]) -> Dict[str, str]: + ... annotation = self.head_processor.get_dict(form, tags) + ... annotation["col3"] = "x" + ... return annotation + ... + >>> x = ExampleChained(ProcessorPrototype(empty_value="EMPTY")) + >>> x.set_tasks(["a", "b"]) + >>> x.reinsert("x") == {"form": "x", "a": "EMPTY", "b": "EMPTY", "col3": "x"} + True + >>> x.get_dict("y", ["1", "2"]) == {"form": "y", "a": "1", "b": "2", "col3": "x"} + True + + """ + head_processor: ProcessorPrototype + + def __init__(self, head_processor: Optional[ProcessorPrototype], **kwargs): + super(ChainedProcessor, self).__init__(**kwargs) + + self.head_processor: ProcessorPrototype = head_processor + if not self.head_processor: + self.head_processor = ProcessorPrototype() + + def set_tasks(self, tasks): + super(ChainedProcessor, self).set_tasks(tasks) + self.head_processor.set_tasks(tasks) + + def reinsert(self, form: str) -> Dict[str, str]: + return self.head_processor.reinsert(form) + + def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: + return self.head_processor.get_dict(token, tags) + + def reset(self): + self.head_processor.reset() diff --git a/pie_extended/pipeline/postprocessor/rulebased.py b/pie_extended/pipeline/postprocessor/rulebased.py new file mode 100644 index 0000000..0977342 --- /dev/null +++ b/pie_extended/pipeline/postprocessor/rulebased.py @@ -0,0 +1,43 @@ +from pie_extended.pipeline.postprocessor.proto import ProcessorPrototype, ChainedProcessor +from typing import Optional, Dict, List +if "typing" == "nottyping": + from ..tokenizers.memorizing import MemorizingTokenizer + + +class RuleBasedProcessor(ChainedProcessor): + """ Applies rules found in rules(token_annotation) + + """ + + def __init__(self, apply_on_reinsert: bool = False, head_processor: Optional[ProcessorPrototype] = None, **kwargs): + """ Apply rules on output of the taggers + + :param apply_on_reinsert: Apply rules on reinsert task + :param head_processor: Processor to use before post-processing its results + + >>> class ExampleRule(RuleBasedProcessor): + ... def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: + ... if annotation["form"] == "need": + ... annotation["1"] = "REPLACED" + ... 
return annotation + >>> processor = ExampleRule() + >>> processor.set_tasks(["1", "2"]) + >>> processor.get_dict("token", ["a", "b"]) == {"form": "token", "1": "a", "2": "b"} + True + >>> processor.get_dict("need", ["a", "b"]) == {"form": "need", "1": "REPLACED", "2": "b"} + True + """ + super(RuleBasedProcessor, self).__init__(head_processor=head_processor, **kwargs) + self.apply_on_reinsert = apply_on_reinsert + + def rules(self, annotation: Dict[str, str]) -> Dict[str, str]: + return annotation + + def reinsert(self, form: str) -> Dict[str, str]: + anno = super(RuleBasedProcessor, self).reinsert(form) + if self.apply_on_reinsert: + return self.rules(anno) + return anno + + def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]: + return self.rules(self.head_processor.get_dict(token, tags)) \ No newline at end of file diff --git a/pie_extended/pipeline/tokenizers/memorizing.py b/pie_extended/pipeline/tokenizers/memorizing.py index b338529..7940e80 100644 --- a/pie_extended/pipeline/tokenizers/memorizing.py +++ b/pie_extended/pipeline/tokenizers/memorizing.py @@ -1,45 +1,32 @@ -class MemorizingTokenizer(object): +from .simple_tokenizer import SimpleTokenizer +from typing import List, Tuple, Dict + + +class MemorizingTokenizer(SimpleTokenizer): """ Tokenizer that memoryze what it tokenized. Mostly used to normalized input as input time and then reinserting normalized input """ - @staticmethod - def _sentence_tokenizer(string): - for s in string.split("."): - if s.strip(): - yield s.strip() + " " + "." - - @staticmethod - def _word_tokenizer(string): - for s in string.split(): - if s.strip: - yield s.strip() - - @staticmethod - def _replacer(inp: str): - return inp - - def __init__(self, sentence_tokenizer=None, word_tokenizer=None, replacer=None, normalizer=None): - self.tokens = [ - ] - - self.sentence_tokenizer = sentence_tokenizer or self._sentence_tokenizer - self.word_tokenizer = word_tokenizer or self._word_tokenizer - self.replacer = replacer or self._replacer - self.normalizer = normalizer or self._replacer - - def __call__(self, data, lower=True): - if lower: - data = data.lower() - for sentence in self.sentence_tokenizer(data): - toks = self.word_tokenizer(sentence) - new_sentence = [] - - for tok in toks: - if tok: - out = self.replacer(tok) - self.tokens.append((len(self.tokens), tok, out)) - new_sentence.append(out) - if new_sentence: - yield new_sentence + + def replacer(self, token: str) -> str: + """ This function allows for changing input and keeping it in memory """ + return token + + def __init__(self): + self.tokens: List[Tuple[int, str, str]] = [] + + def _real_word_tokenizer(self, data: str, lower: bool = False) -> List[str]: + return super(MemorizingTokenizer, self).word_tokenizer(data, lower=lower) + + def word_tokenizer(self, text: str, lower: bool = False) -> List[str]: + sentence = [] + for token in self._real_word_tokenizer(text, lower): + out = self.replacer(token) + self.tokens.append((len(self.tokens), token, out)) + sentence.append(out) + return sentence + + def reset(self): # Empty + self.tokens = [] + diff --git a/pie_extended/pipeline/tokenizers/simple_tokenizer.py b/pie_extended/pipeline/tokenizers/simple_tokenizer.py new file mode 100644 index 0000000..ef633a9 --- /dev/null +++ b/pie_extended/pipeline/tokenizers/simple_tokenizer.py @@ -0,0 +1,33 @@ +from typing import Generator, List +import regex as re +import string +from pie.tagger import regexsplitter, SECTION, FULLSTOP + +WORD = r'([{}])'.format(string.punctuation) + + +class 
SimpleTokenizer(object): + """ Tokenizer that memoryze what it tokenized. + + Mostly used to normalized input as input time and then reinserting normalized input + + """ + def __init__(self): + self.section = regexsplitter(SECTION) + self.fullstop = regexsplitter(FULLSTOP) + self.word = regexsplitter(WORD) + + def sentence_tokenizer(self, text: str, lower: bool = False) -> Generator[List[str], None, None]: + for line in self.section(text): + for sentence in self.fullstop(line): + yield self.word_tokenizer(sentence, lower=lower) + + def word_tokenizer(self, text: str, lower: bool = False) -> List[str]: + sentence = [w for raw in text.split() for w in self.word(raw)] + if lower: + sentence = [w.lower() for w in sentence] + return sentence + + def reset(self): + """Can be used between documents for example """ + pass diff --git a/pie_extended/tagger.py b/pie_extended/tagger.py index 8c2dc9d..604e401 100644 --- a/pie_extended/tagger.py +++ b/pie_extended/tagger.py @@ -1,5 +1,5 @@ import os -from typing import Optional +from typing import Optional, Dict, Generator, Type from pie.tagger import Tagger from pie import utils @@ -7,6 +7,7 @@ from .pipeline.formatters.proto import Formatter from .pipeline.disambiguators.proto import Disambiguator from .pipeline.iterators.proto import DataIterator +from .pipeline.postprocessor.proto import ProcessorPrototype class ExtensibleTagger(Tagger): @@ -18,19 +19,7 @@ def __init__(self, device='cpu', batch_size=100, lower=False, disambiguation=Non ) self.disambiguation: Optional[Disambiguator] = disambiguation - def reinsert_full(self, formatter, sent_reinsertion, tasks): - yield formatter.write_sentence_beginning() - # If a sentence is empty, it's most likely because everything is in sent_reinsertions - for reinsertion in sorted(list(sent_reinsertion.keys())): - yield formatter.write_line( - formatter.format_line( - token=sent_reinsertion[reinsertion], - tags=[""] * len(tasks) - ) - ) - yield formatter.write_sentence_end() - - def tag_file(self, fpath: str, iterator: DataIterator, formatter_class: type): + def tag_file(self, fpath: str, iterator: DataIterator, processor: ProcessorPrototype): # Read content of the file with open(fpath) as f: data = f.read() @@ -38,16 +27,18 @@ def tag_file(self, fpath: str, iterator: DataIterator, formatter_class: type): _, ext = os.path.splitext(fpath) with open(utils.ensure_ext(fpath, ext, 'pie'), 'w+') as f: - for line in self.iter_tag(data, iterator, formatter_class): + for line in self.iter_tag(data, iterator, processor=processor): f.write(line) - def tag_str(self, data: str, iterator: DataIterator, formatter_class: type) -> str: - return "".join(list(self.iter_tag(data, iterator, formatter_class))) - - def iter_tag(self, data: str, iterator: DataIterator, formatter_class: type): - header = False - formatter = None + def tag_str(self, data: str, iterator: DataIterator, processor: ProcessorPrototype) -> str: + return list(self.iter_tag_token(data, iterator, processor=processor)) + def iter_tag_token(self, data: str, iterator: DataIterator, processor: ProcessorPrototype) \ + -> Generator[Dict[str, str], None, None]: + # Reset at each document + processor.reset() + iterator.tokenizer.reset() + # Iterate ! 
for chunk in utils.chunks( iterator(data, lower=self.lower), size=self.batch_size): @@ -55,13 +46,13 @@ def iter_tag(self, data: str, iterator: DataIterator, formatter_class: type): # to be reinserted sents, lengths, needs_reinsertion = zip(*chunk) - is_empty = [0 == len(sent) for sent in enumerate(sents)] - + is_empty = [not bool(sent) for sent in sents] tagged, tasks = self.tag( sents=[sent for sent in sents if sent], lengths=lengths ) - formatter: Formatter = formatter_class(tasks) + if not processor.tasks: + processor.set_tasks(tasks) # We keep a real sentence index for sents_index, sent_is_empty in enumerate(is_empty): @@ -73,13 +64,6 @@ def iter_tag(self, data: str, iterator: DataIterator, formatter_class: type): # Gets things that needs to be reinserted sent_reinsertion = needs_reinsertion[sents_index] - # If the header has not yet be written, write it - if not header: - yield formatter.write_headers() - header = True - - yield formatter.write_sentence_beginning() - # If we have a disambiguator, we run the results into it if self.disambiguation: sent = self.disambiguation(sent, tasks) @@ -87,30 +71,25 @@ def iter_tag(self, data: str, iterator: DataIterator, formatter_class: type): reinsertion_index = 0 for index, (token, tags) in enumerate(sent): + # Before current index while reinsertion_index + index in sent_reinsertion: - yield formatter.write_line( - formatter.format_line( - token=sent_reinsertion[reinsertion_index + index], - tags=[""] * len(tasks) - ) - ) + yield processor.reinsert(sent_reinsertion[reinsertion_index+index]) del sent_reinsertion[reinsertion_index + index] reinsertion_index += 1 - yield formatter.write_line( - formatter.format_line(token, tags) - ) + yield processor.get_dict(token, tags) for reinsertion in sorted(list(sent_reinsertion.keys())): - yield formatter.write_line( - formatter.format_line( - token=sent_reinsertion[reinsertion], - tags=[""] * len(tasks) - ) - ) + yield processor.reinsert(sent_reinsertion[reinsertion]) - yield formatter.write_sentence_end() + def iter_tag(self, data: str, iterator: DataIterator, processor: type): + formatter = None + for annotation in self.iter_tag_token(data, iterator, processor): + if not formatter: + formatter = Formatter(list(annotation.keys())) + yield formatter.write_headers() + yield formatter.write_line(formatter.format_line(annotation)) if formatter: - yield formatter.write_footer() + yield formatter.write_footer() \ No newline at end of file diff --git a/tests/test_models/test_fro.py b/tests/test_models/test_fro.py new file mode 100644 index 0000000..16d42ea --- /dev/null +++ b/tests/test_models/test_fro.py @@ -0,0 +1,49 @@ +from pie_extended.models.fro.get import get_iterator_and_processor +from pie_extended.testing_utils import FakeTagger +from typing import List, Tuple + +from unittest import TestCase +from .test_lasla import make_fake_data + + +def make_controller(sentences: List[str]): + # Add the lemmatizer routes + tagger = FakeTagger( + make_fake_data(sentences), + tasks="lemma,MODE,TEMPS,PERS,NOMB,GENRE,CAS,DEGRE,POS".split(",") + ) + iterator, processor = get_iterator_and_processor() + return tagger, iterator, processor + + +class TestFro(TestCase): + def test_elision_apostrophe(self): + string = "q'il meurt" + treated = ["q il meurt"] + tagger, it, pro = make_controller(treated) + out = tagger.tag_str(string, it, pro) + self.assertEqual(out[0]["form"], "q'") + self.assertEqual(out[0]["treated"], "q") + + def test_elision_apostrophe_and_quote(self): + string = "'q'il meurt 'dit il'" + treated = ["q il 
meurt dit il"] + tagger, it, pro = make_controller(treated) + out = tagger.tag_str(string, it, pro) + self.assertEqual(out[0]["form"], "'") + self.assertEqual(out[0]["treated"], "'") + self.assertEqual(out[1]["form"], "q'") + self.assertEqual(out[1]["treated"], "q") + self.assertEqual(out[-1]["form"], "'", "Last apostrophe is kept") + # Ending and starting apostrophe are not reinserted for some reason. + + def test_tokenization_roman_number(self): + iterator, _ = get_iterator_and_processor() + self.assertEqual( + list(iterator.tokenizer.sentence_tokenizer("Les .XIII. tables du Duc du XII.. C'est fantastique")), + [ + ["Les", ".XIII.", "tables", "du", "Duc", "du", "XII", ".", "."], + ["C", 'est', "fantastique"] + ], + "Dots around roman number are not sentences markers" + ) \ No newline at end of file diff --git a/tests/test_models/test_lasla.py b/tests/test_models/test_lasla.py index 2a5cac5..1359afc 100644 --- a/tests/test_models/test_lasla.py +++ b/tests/test_models/test_lasla.py @@ -1,4 +1,4 @@ -from pie_extended.models.lasla.classes import get_iterator_and_formatter +from pie_extended.models.lasla.get import get_iterator_and_processor from pie_extended.testing_utils import FakeTagger from typing import List, Tuple @@ -21,18 +21,18 @@ def make_controller(sentences: List[str]): make_fake_data(sentences), tasks="lemma,Voice,Mood,Deg,Numb,Person,Tense,Case,Gend,pos".split(",") ) - iterator, formatter = get_iterator_and_formatter() - return tagger, iterator, formatter + iterator, processor = get_iterator_and_processor() + return tagger, iterator, processor -class TestPonctuation(TestCase): +class TestLasla(TestCase): def test_consecutive_dots(self): """Check that consecutive punctation does not break anything Found out the hard way it would break things """ - tagger, data_iterator, formatter = make_controller([ + tagger, data_iterator, processor = make_controller([ "id enim ait turbabuntur a facie eius patris or phanorum et iudicis uiduarum", "causam turbationis hanc docuit quod pater" ]) @@ -40,15 +40,14 @@ def test_consecutive_dots(self): result = tagger.tag_str( data="id enim ait turbabuntur a facie eius patris or phanorum et iudicis uiduarum ." " . causam turbationis hanc docuit quod pater", - formatter_class=formatter, + processor=processor, iterator=data_iterator ) - self.assertIn( - "uiduarum uiduarum fake Case=fake|Numb=fake|Deg=fake|Mood=fake|Tense=fake|Voice=fake|Person=fake" - " uiduarum\r\n" - ". . PUNC MORPH=empty .\r\n" - ". . PUNC MORPH=empty .", - result, + self.assertEqual( + result[12], + {"form": "uiduarum", "lemma": "uiduarum", "POS": "fake", "morph": "Case=fake|Numb=fake|Deg=fake|Mood=fake|" + "Tense=fake|Voice=fake|Person=fake", + "treated": "uiduarum"}, "Punctuation should be reinserted and mostly should not break anything" ) @@ -57,24 +56,20 @@ def test_leading_punctuation(self): Special case of consecutive dots, where sentences starts with it """ - tagger, data_iterator, formatter = make_controller([ + tagger, data_iterator, processor = make_controller([ + # Need an empty sentence because ( was treated as such "id enim ait", "turbabuntur a facie eius patris or phanorum et iudicis uiduarum" ]) result = tagger.tag_str( - "( id enim ait ) turbabuntur a facie eius patris or phanorum et iudicis uiduarum . 
.", - formatter_class=formatter, + "( id enim ait) turbabuntur a facie eius patris or phanorum et iudicis uiduarum ..", + processor=processor, iterator=data_iterator ) - self.assertIn( - "form lemma POS morph treated_token\r\n" - "( ( PUNC MORPH=empty (\r\n" - "id id fake Case=fake|Numb=fake|Deg=fake|Mood=fake|Tense=fake|Voice=fake|Person=fake id\r\n" - "enim enim fake Case=fake|Numb=fake|Deg=fake|Mood=fake|Tense=fake|Voice=fake|Person=fake enim\r\n" - "ait ait fake Case=fake|Numb=fake|Deg=fake|Mood=fake|Tense=fake|Voice=fake|Person=fake ait\r\n" - ") ) PUNC MORPH=empty )\r\n" - "turbabuntur turbabuntur fake Case=fake|Numb=fake|Deg=fake|Mood=fake|Tense=fake|Voice=fake|Person" - "=fake turbabuntur\r\n", - result, + tokens = [t["form"] for t in result] + self.assertEqual( + ["(", "id", "enim", "ait", ")", "turbabuntur", "a", "facie", "eius", "patris", "or", "phanorum", + "et", "iudicis", "uiduarum", ".", "."], + tokens, "Leading punctuation should not break anything" ) @@ -82,12 +77,12 @@ def test_punctuation_is_not_seen(self): """Check that punctuation is not seen by the tagger """ - tagger, data_iterator, formatter = make_controller([ + tagger, data_iterator, processor = make_controller([ "id enim ait", "turbabuntur a facie eius patris or phanorum et iudicis uiduarum" ]) tagger.tag_str( "( id enim ait ) turbabuntur a facie eius patris or phanorum et iudicis uiduarum . .", - formatter_class=formatter, + processor=processor, iterator=data_iterator ) self.assertNotIn( @@ -100,28 +95,27 @@ def test_j_are_temporarly_replaced(self): """Check that characters are replaced for the tagger, thus avoiding out of domain, and reinserted """ - tagger, data_iterator, formatter = make_controller([ - "id enim ait", "turbabuntur a facie eius patris or phanorum et iudicis uiduarum" + tagger, data_iterator, processor = make_controller([ + "iudicis uiduarum" ]) result = tagger.tag_str( - "( id enim ait ) turbabuntur a facie eius patris or phanorum et judicis uiduarum . .", - formatter_class=formatter, + "judicis uiduarum", + processor=processor, iterator=data_iterator ) flatten_seen = list([tok for sent in tagger.seen for tok in sent]) - self.assertNotIn("judicis", flatten_seen, "'j' should be removed from tagging") - self.assertIn("iudicis", flatten_seen, "And 'i' should replace it") - self.assertIn("\njudicis\t", result, "But, in the end, the original form is given to the user") + self.assertEqual(result[0]["form"], "judicis", "'j' should be removed from tagging") + self.assertEqual(result[0]["treated"], "iudicis", "And 'i' should replace it") def test_underscores(self): string = "una operatio in ecclesiae fundamento.._... _ . laetatur autem pater quia filius perierat" - tagger, data_iterator, formatter = make_controller([ + tagger, data_iterator, processor = make_controller([ "una operatio in ecclesiae fundamento", "laetatur autem pater quia filius perierat" ]) tagger.tag_str( string, - formatter_class=formatter, + processor=processor, iterator=data_iterator ) flatten_seen = list([tok for sent in tagger.seen for tok in sent]) @@ -130,4 +124,4 @@ def test_underscores(self): 'perierat'], flatten_seen, "Seen element should not count the underscord" - ) \ No newline at end of file + )