Try at making token exclusion easier to configure
PonteIneptique committed Feb 25, 2020
1 parent efbfc7d commit f278d4a
Showing 14 changed files with 194 additions and 89 deletions.
13 changes: 11 additions & 2 deletions pie_extended/cli/__init__.py
@@ -1,6 +1,7 @@
import click

from . import sub
from typing import Iterable


MODELS = [name for name, *_ in sub.get_list()]
@@ -54,8 +55,15 @@ def download(model):
help="Raise error when a file is not tagged correctly")
@click.option("--model_path", type=str, default=None,
help="Provide this with your own model path if you want to test it")
def tag(model, filepath, allowed_failure, batch_size, device, debug, model_path):
@click.option("--reset-exclude-patterns", "reset_patterns", is_flag=True, default=False,
help="Reset exclude patterns")
@click.option("--add-pattern", "add_pattern",
help="Add new exclude patterns for token (Regular expression)", multiple=True)
def tag(model: str, filepath: str, allowed_failure: bool, batch_size: int, device: str, debug: bool,
model_path: str,
reset_patterns: bool, add_pattern: Iterable[str]):
""" Tag as many [filepath] as you want with [model] """
print(reset_patterns, add_pattern)
from tqdm import tqdm
click.echo(click.style("Getting the tagger", bold=True))
try:
@@ -69,7 +77,8 @@ def tag(model, filepath, allowed_failure, batch_size, device, debug, model_path)
failures = []
for file in tqdm(filepath):
try:
sub.tag_file(model, tagger, file)
sub.tag_file(model, tagger, file, reset_exclude_patterns=reset_patterns,
exclude_patterns=add_pattern)
except Exception as E:
failures.append(E)
click.echo("{} could not be lemmatized".format(file))
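
The two new options lean on click collecting every repeated --add-pattern flag into a tuple of strings, which is why add_pattern is typed as Iterable[str] and can be forwarded to sub.tag_file unchanged. A minimal, self-contained sketch of that behaviour follows; the demo command is hypothetical and not part of this commit.

import click
from click.testing import CliRunner


@click.command()
@click.option("--reset-exclude-patterns", "reset_patterns", is_flag=True, default=False)
@click.option("--add-pattern", "add_pattern", multiple=True)
def demo(reset_patterns, add_pattern):
    # multiple=True makes add_pattern a tuple holding every value passed on the command line
    click.echo("{} {}".format(reset_patterns, list(add_pattern)))


if __name__ == "__main__":
    runner = CliRunner()
    result = runner.invoke(demo, ["--add-pattern", "_Passage_.*", "--add-pattern", "[0-9]+"])
    print(result.output)  # False ['_Passage_.*', '[0-9]+']
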
48 changes: 44 additions & 4 deletions pie_extended/cli/sub.py
@@ -1,5 +1,5 @@
import os
from typing import Tuple, Iterable, Generator, Union
from typing import Tuple, Iterable, List, Union
from importlib import import_module

import requests
@@ -11,11 +11,20 @@
from pie.utils import model_spec


def get_model(model):
def get_model(model: str):
""" Retrieve a module given a string
:param model: Module Name
:return: Module
"""
return import_module("{}.{}".format(models.__name__, model))


def download(module) -> Iterable[Union[str, int]]:
def download(module: str) -> Iterable[Union[str, int]]:
""" Download dependencies for the given module
:param module: Module for which to download models and static files in general
"""
lemmatizer = get_model(module)
os.makedirs(os.path.join(PATH, module), exist_ok=True)
yield len(lemmatizer.DOWNLOADS)
@@ -30,13 +39,23 @@ def download(module) -> Iterable[Union[str, int]]:


def get_list() -> Iterable[Tuple[str, Metadata]]:
""" Retrieve a list of available modules
"""
for module in models.modules:
desc = getattr(get_model(module), "DESC", None)
if desc:
yield module, desc


def get_tagger(model: str, batch_size: int = 16, device="cpu", model_path=None) -> ExtensibleTagger:
""" Retrieve the tagger
:param model: Module of the tagger
:param batch_size: Size of the batch
:param device: Device to use (cuda/cpu)
:param model_path: Path to the model if you want to override the package one
:return: Tagger
"""
module = get_model(model)
disambiguator = getattr(module, "Disambiguator", None)
if isinstance(disambiguator, ObjectCreator):
@@ -48,9 +67,30 @@ def get_tagger(model: str, batch_size: int = 16, device="cpu", model_path=None)
return tagger


def tag_file(model: str, tagger: ExtensibleTagger, fpath):
def tag_file(
model: str, tagger: ExtensibleTagger,
fpath: str,
reset_exclude_patterns: bool = False,
exclude_patterns: List[str] = None):
""" Tag a file with a given model
:param model: Module name of the model
:param tagger: Tagger that should be used
:param fpath: Path to the file to edit
:param reset_exclude_patterns: Remove all pre-registered token exclusion regular expressions
:param exclude_patterns: New exclude patterns to add to the data iterator (Does not require reset)
"""
module = get_model(model)
iterator, processor = getattr(module, "get_iterator_and_processor")()
# Remove pre-registered patterns if requested
if reset_exclude_patterns:
iterator.reset_patterns()

# Add new exclusion patterns
if exclude_patterns:
for pattern in exclude_patterns:
iterator.add_pattern(pattern)

tagger.tag_file(fpath, iterator=iterator, processor=processor)
return True

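
The same toggles are available outside the CLI through these helpers. A usage sketch, assuming the chosen model has already been downloaded and with my_text.txt as a placeholder path:

from pie_extended.cli import sub

tagger = sub.get_tagger("lasla", batch_size=16, device="cpu")

# Drop the pre-registered exclusion patterns and register a single custom one
sub.tag_file(
    "lasla", tagger, "my_text.txt",
    reset_exclude_patterns=True,
    exclude_patterns=[r"_Passage_[\w\d_]+"],
)
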
10 changes: 6 additions & 4 deletions pie_extended/models/fro/get.py
@@ -1,6 +1,6 @@
from .processor import FroRulesProcessor, FroGlueProcessor
from .processor import FroRulesProcessor, FroGlueProcessor, FroMapProcessor
from .tokenizer import FroMemorizingTokenizer
from pie_extended.pipeline.iterators.proto import DataIterator
from pie_extended.pipeline.iterators.proto import DataIterator, GenericExcludePatterns
from pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor


@@ -10,12 +10,14 @@ def get_iterator_and_processor():
apply_on_reinsert=True,
head_processor=MemoryzingProcessor(
tokenizer_memory=tokenizer,
head_processor=FroGlueProcessor()
head_processor=FroGlueProcessor(
head_processor=FroMapProcessor()
)
)
)
iterator = DataIterator(
tokenizer=tokenizer,
remove_from_input=DataIterator.remove_punctuation
exclude_patterns=[GenericExcludePatterns.Punctuation_and_Underscore]
)
return iterator, processor
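
With GenericExcludePatterns.Punctuation_and_Underscore registered at construction time, the iterator behaves like the former remove_punctuation helper and additionally drops underscore-only tokens. A small sketch, assuming pie_extended and its fro module are importable; the sample tokens are illustrative only.

from pie_extended.models.fro.get import get_iterator_and_processor

iterator, processor = get_iterator_and_processor()
print(iterator.exclude_tokens(["Rollant", "est", "proz", "_", "..."]))
# (['Rollant', 'est', 'proz'], {3: '_', 4: '...'})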

15 changes: 11 additions & 4 deletions pie_extended/models/fro/processor.py
@@ -1,16 +1,17 @@
import regex as re
from typing import Dict
from typing import Dict, Pattern

from pie_extended.pipeline.postprocessor.glue import GlueProcessor
from pie_extended.pipeline.postprocessor.rulebased import RuleBasedProcessor
from pie_extended.pipeline.postprocessor.proto import RenamedTaskProcessor


class FroRulesProcessor(RuleBasedProcessor):
""" Fro Dataset has not all punctuation signs in it, we remove it and posttag it automatically
"""
PONCTU = re.compile(r"^\W+$")
NUMBER = re.compile(r"\d+")
PONCTU: Pattern = re.compile(r"^\W+$")
NUMBER: Pattern = re.compile(r"\d+")
PONFORT = [".", "...", "!", "?"]

def rules(self, annotation: Dict[str, str]) -> Dict[str, str]:
@@ -35,9 +36,15 @@ class FroGlueProcessor(GlueProcessor):
"""
OUTPUT_KEYS = ["form", "lemma", "POS", "morph"]
GLUE = {"morph": ["MODE", "TEMPS", "PERS.", "NOMB.", "GENRE", "CAS", "DEGRE"]}
MAP = {"pos": "POS", "NOMB": "NOMB.", "PERS": "PERS."}
EMPTY_TAG: Dict[str, str] = {"CAS": "_", "NOMB.": "_", "DEGRE": "_", "MODE": "_", "TEMPS": "_", "GENRE": "_",
"PERS.": "_"}

def __init__(self, *args, **kwargs):
super(FroGlueProcessor, self).__init__(*args, **kwargs)


class FroMapProcessor(RenamedTaskProcessor):
MAP = {"pos": "POS", "NOMB": "NOMB.", "PERS": "PERS."}

def __init__(self, *args, **kwargs):
super(FroMapProcessor, self).__init__(*args, **kwargs)
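
The new FroMapProcessor only renames task keys via its MAP before FroGlueProcessor glues the morphology together. RenamedTaskProcessor's internals are not shown in this commit, so the following is a conceptual sketch of the renaming step only, not the library's implementation:

MAP = {"pos": "POS", "NOMB": "NOMB.", "PERS": "PERS."}

def rename_tasks(annotation: dict) -> dict:
    # Rename the task keys listed in MAP, leave everything else untouched
    return {MAP.get(key, key): value for key, value in annotation.items()}

print(rename_tasks({"lemma": "estre", "pos": "VERcjg", "NOMB": "s"}))
# {'lemma': 'estre', 'POS': 'VERcjg', 'NOMB.': 's'}
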
10 changes: 6 additions & 4 deletions pie_extended/models/lasla/get.py
@@ -1,8 +1,8 @@
import regex as re

from pie_extended.models.lasla.processor import LatinRulesProcessor, LatinGlueProcessor
from pie_extended.models.lasla.processor import LatinRulesProcessor, LatinGlueProcessor, LatinMapProcessor
from pie_extended.models.lasla.tokenizer import LatMemorizingTokenizer
from pie_extended.pipeline.iterators.proto import DataIterator
from pie_extended.pipeline.iterators.proto import DataIterator, GenericExcludePatterns
from pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor

# Uppercase regexp
@@ -15,11 +15,13 @@ def get_iterator_and_processor():
apply_on_reinsert=True,
head_processor=MemoryzingProcessor(
tokenizer_memory=tokenizer,
head_processor=LatinGlueProcessor()
head_processor=LatinGlueProcessor(
LatinMapProcessor()
)
)
)
iterator = DataIterator(
tokenizer=tokenizer,
remove_from_input=DataIterator.remove_punctuation
exclude_patterns=[GenericExcludePatterns.Punctuation_and_Underscore]
)
return iterator, processor
9 changes: 8 additions & 1 deletion pie_extended/models/lasla/processor.py
@@ -3,6 +3,7 @@

from pie_extended.pipeline.postprocessor.glue import GlueProcessor
from pie_extended.pipeline.postprocessor.rulebased import RuleBasedProcessor
from pie_extended.pipeline.postprocessor.proto import RenamedTaskProcessor


class LatinRulesProcessor(RuleBasedProcessor):
@@ -33,7 +34,13 @@ class LatinGlueProcessor(GlueProcessor):
OUTPUT_KEYS = ["form", "lemma", "POS", "morph"]
GLUE = {"morph": ["Case", "Numb", "Deg", "Mood", "Tense", "Voice", "Person"]}
WHEN_EMPTY = {"morph": "MORPH=empty"}

def __init__(self, *args, **kwargs):
super(LatinGlueProcessor, self).__init__(*args, **kwargs)


class LatinMapProcessor(RenamedTaskProcessor):
MAP = {"pos": "POS"}

def __init__(self, *args, **kwargs):
super(LatinGlueProcessor, self).__init__(*args, **kwargs)
super(LatinMapProcessor, self).__init__(*args, **kwargs)
94 changes: 70 additions & 24 deletions pie_extended/pipeline/iterators/proto.py
@@ -1,50 +1,97 @@
import regex as re

from pie.tagger import simple_tokenizer
from typing import Callable, List, Tuple, Dict, Union, Iterable
from typing import List, Tuple, Dict, Iterable, Pattern, Union

from ...utils import ObjectCreator
from ..tokenizers.simple_tokenizer import SimpleTokenizer
from pie_extended.pipeline.tokenizers.simple_tokenizer import SimpleTokenizer
from enum import Enum

Remover = Callable[[List[str]], Tuple[List[str], Dict[int, str]]]
PUNKT = re.compile(r"^[_||[^\s\w]]+$", re.VERSION1)

class GenericExcludePatterns(Enum):
""" Useful set of regular expresion that can be used for the exclude_patterns
"""
Punctuation_and_Underscore: Pattern = re.compile(r"^[_||[^\s\w]]+$", re.VERSION1)
Punctuation: Pattern = re.compile(r"^[^\s\w]+$")
PassageMarker: Pattern = re.compile(r"_Passage_[\w\d_]+") # Use `_` as a joining character


class DataIterator:
def __init__(self, tokenizer: SimpleTokenizer = None, remove_from_input: Callable = None):
def __init__(self, tokenizer: SimpleTokenizer = None, exclude_patterns: List[Union[str, Pattern]] = None):
""" Iterator used to parse the text and returns bits to tag
:param tokenizer: Tokenizer
"""
self.tokenizer: SimpleTokenizer = tokenizer or SimpleTokenizer()
self.remove_from_input = remove_from_input
if self.remove_from_input is None:
self.remove_from_input = lambda x: (x, {})
self.exclude_patterns: List[Pattern] = []
if exclude_patterns:
for pattern in exclude_patterns:
self.add_pattern(pattern)

def add_pattern(self, pattern: str):
""" Add a pattern for removal
:param pattern: Pattern for token removal
"""
if isinstance(pattern, str):
self.exclude_patterns.append(re.compile(pattern))
elif hasattr(pattern, "value"): # Deal with enum
self.exclude_patterns.append(pattern.value)
else:
self.exclude_patterns.append(pattern)

def reset_patterns(self) -> None:
""" Removes removal patterns
>>> x = DataIterator(exclude_patterns=[r'\W+'])
>>> x.exclude_tokens(["Je", "suis", "content", ",", "mais", "...", '"', "fatigué", '"', "."])
(['Je', 'suis', 'content', 'mais', 'fatigué'], {3: ',', 5: '...', 6: '"', 8: '"', 9: '.'})
>>> x.reset_patterns()
>>> x.exclude_tokens(["Je", "suis", "content", ",", "mais", "...", '"', "fatigué", '"', "."])
(['Je', 'suis', 'content', ',', 'mais', '...', '"', 'fatigué', '"', '.'], {})
"""
self.exclude_patterns = []

@staticmethod
def remove_punctuation(sentence: List[str]) -> Tuple[List[str], Dict[int, str]]:
def exclude_tokens(self, sentence: List[str]) -> Tuple[List[str], Dict[int, str]]:
""" Removes punctuation from a list and keeps its index
:param sentence:
:return: First the sentence with things removed, then a dictionary whose keys are index of token to reinsert and
associated values are punctuation to reinsert.
>>> x = DataIterator.remove_punctuation(["Je", "suis", "content",",", "mais", "...", '"', "fatigué", '"', "."])
>>> assert x == (['Je', 'suis', 'content', 'mais', 'fatigué'], {3: ',', 5: '...', 6: '"', 8: '"', 9: '.'})
You can use strings when generating the exclude_patterns
>>> x = DataIterator(exclude_patterns=[r'\W+'])
>>> x.exclude_tokens(["Je", "suis", "content",",", "mais", "...", '"', "fatigué", '"', "."])
(['Je', 'suis', 'content', 'mais', 'fatigué'], {3: ',', 5: '...', 6: '"', 8: '"', 9: '.'})
Pre-built removers:
>>> x = DataIterator(exclude_patterns=[GenericExcludePatterns.PassageMarker])
>>> x.exclude_tokens(["_Passage_45_78", "Ici", "commence", "le", "passage"])
(['Ici', 'commence', 'le', 'passage'], {0: '_Passage_45_78'})
And of course you can ignore this option
>>> x = DataIterator()
>>> x.exclude_tokens(["_Passage_45_78", "Ici", "commence", "le", "passage"])
(['_Passage_45_78', 'Ici', 'commence', 'le', 'passage'], {})
"""
if len(self.exclude_patterns) == 0:
return sentence, {}

clean, removed = [], {}
for index, token in enumerate(sentence):
if PUNKT.match(token):
removed[index] = token
else:
match = False
for exclude_pattern in self.exclude_patterns:
if exclude_pattern.match(token):
removed[index] = token
match = True
break
if not match:
clean.append(token)
return clean, removed

def get_remover(self) -> Remover:
if isinstance(self.remove_from_input, ObjectCreator):
return self.remove_from_input.create()
return self.remove_from_input

def __call__(self, data: str, lower: bool = False) -> Iterable[Tuple[List[str], int, Dict[int, str]]]:
""" Default iter data takes a text, an option to make lower
and yield lists of words along with the length of the list
@@ -53,7 +100,6 @@ def __call__(self, data: str, lower: bool = False) -> Iterable[Tuple[List[str],
:param lower: Whether or not to lower the text
:yields: (Sentence as a list of word, Size of the sentence, Elements removed from the sentence)
"""
remover = self.get_remover()
for sentence in self.tokenizer.sentence_tokenizer(data, lower=lower):
clean_sentence, removed_from_input = remover(sentence)
clean_sentence, removed_from_input = self.exclude_tokens(sentence)
yield clean_sentence, len(clean_sentence), removed_from_input
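
add_pattern accepts a raw string (compiled on the fly), a GenericExcludePatterns member (its .value is used), or an already compiled pattern. A sketch of the round trip, assuming pie_extended is installed; the Roman-numeral and digit patterns are only illustrative additions:

import regex as re

from pie_extended.pipeline.iterators.proto import DataIterator, GenericExcludePatterns

iterator = DataIterator(exclude_patterns=[GenericExcludePatterns.Punctuation])
iterator.add_pattern(r"^[IVXLCDM]+$")       # string: compiled on the fly
iterator.add_pattern(re.compile(r"^\d+$"))  # compiled pattern: stored as-is

tokens = ["Liber", "I", ",", "chapitre", "2", ":", "incipit"]
clean, removed = iterator.exclude_tokens(tokens)
print(clean)    # ['Liber', 'chapitre', 'incipit']
print(removed)  # {1: 'I', 2: ',', 4: '2', 5: ':'}

# Excluded tokens can be reinserted at their original indices after tagging
restored = list(clean)
for index in sorted(removed):
    restored.insert(index, removed[index])
print(restored == tokens)  # True
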
22 changes: 0 additions & 22 deletions pie_extended/pipeline/postprocessor/disambiguator.py

This file was deleted.
