Merge pull request #9 from hipster-philology/postprocessors
Postprocessors
PonteIneptique committed Feb 25, 2020
2 parents e5e1284 + 57e6188 commit 39dc39b
Showing 30 changed files with 1,240 additions and 501 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,6 +1,7 @@
 .idea
 /*.txt
 pie_extended/downloads/*
+tests/**/*.txt
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
3 changes: 2 additions & 1 deletion .travis.yml
@@ -12,7 +12,8 @@ install:
 # command to run tests
 script:
   - pie-extended install-addons lasla
-  - nosetests ./tests --with-coverage --cover-package=pie_extended --cover-xml --verbose --nologcapture
+  - pie-extended download lasla
+  - nosetests --with-coverage --cover-package=pie_extended --cover-xml --verbose --nologcapture --with-doctest
 after_success:
   - coverage combine
   - coveralls
53 changes: 53 additions & 0 deletions README.md
@@ -16,6 +16,12 @@
The current system provides easier access to adding **customized**:
- disambiguation,
- output formatting

## Install

To install, simply run `pip install pie-extended`. Then have a look at the available models and download the one you need, for instance as sketched below.
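For instance, assuming the `lasla` model used throughout this README (the `download` and `install-addons` commands are the same ones exercised in the `.travis.yml` change above):

```bash
pip install pie-extended
# install the model's add-ons, then fetch its model files
pie-extended install-addons lasla
pie-extended download lasla
```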

## Run on terminal

On top of that, it provides a quick and easy way to use other models! For example, in a shell:

```bash
pie-extended tag lasla your_file.txt
```

will give you access to all you need!

## Python API

You can run the lemmatizer in your own scripts and retrieve token annotations as dictionaries:

```python
from typing import List
from pie_extended.cli.sub import get_tagger, get_model, download

# In case you need to download the model files first
do_download = False
if do_download:
    for dl in download("lasla"):
        # download() is a generator yielding progress information: just exhaust it
        pass

# model_path allows you to override the model loaded by another .tar
model_name = "lasla"
tagger = get_tagger(model_name, batch_size=256, device="cpu", model_path=None)

sentences: List[str] = ["Lorem ipsum dolor sit amet, consectetur adipiscing elit. "]
# Get the main objects from the model: data iterator + postprocessor
from pie_extended.models.lasla import get_iterator_and_processor
for sentence_group in sentences:
    iterator, processor = get_iterator_and_processor()
    print(tagger.tag_str(sentence_group, iterator=iterator, processor=processor))
```

This will result in:

```python
[{'form': 'lorem', 'lemma': 'lor', 'POS': 'NOMcom', 'morph': 'Case=Acc|Numb=Sing', 'treated': 'lorem'},
{'form': 'ipsum', 'lemma': 'ipse', 'POS': 'PROdem', 'morph': 'Case=Acc|Numb=Sing', 'treated': 'ipsum'},
{'form': 'dolor', 'lemma': 'dolor', 'POS': 'NOMcom', 'morph': 'Case=Nom|Numb=Sing', 'treated': 'dolor'},
{'form': 'sit', 'lemma': 'sum1', 'POS': 'VER', 'morph': 'Numb=Sing|Mood=Sub|Tense=Pres|Voice=Act|Person=3',
'treated': 'sit'},
{'form': 'amet', 'lemma': 'amo', 'POS': 'VER', 'morph': 'Numb=Sing|Mood=Sub|Tense=Pres|Voice=Act|Person=3',
'treated': 'amet'}, {'form': ',', 'lemma': ',', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': ','},
{'form': 'consectetur', 'lemma': 'consector2', 'POS': 'VER',
'morph': 'Numb=Sing|Mood=Sub|Tense=Pres|Voice=Dep|Person=3', 'treated': 'consectetur'},
{'form': 'adipiscing', 'lemma': 'adipiscor', 'POS': 'VER', 'morph': 'Tense=Pres|Voice=Dep', 'treated': 'adipiscing'},
{'form': 'elit', 'lemma': 'elio', 'POS': 'VER', 'morph': 'Numb=Sing|Mood=Ind|Tense=Pres|Voice=Act|Person=3',
'treated': 'elit'}, {'form': '.', 'lemma': '.', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '.'}]
```

## Add a model

ToDo: Documentation

## Warning

This is an extremely early build, subject to change here and there. But it is functional!
21 changes: 18 additions & 3 deletions pie_extended/cli/__init__.py
@@ -1,6 +1,7 @@
 import click

 from . import sub
+from typing import Iterable


 MODELS = [name for name, *_ in sub.get_list()]
@@ -54,15 +55,29 @@ def download(model):
               help="Raise error when a file is not tagged correctly")
 @click.option("--model_path", type=str, default=None,
               help="Provide this with your own model path if you want to test it")
-def tag(model, filepath, allowed_failure, batch_size, device, debug, model_path):
+@click.option("--reset-exclude-patterns", "reset_patterns", is_flag=True, default=False,
+              help="Reset exclude patterns")
+@click.option("--add-pattern", "add_pattern",
+              help="Add new exclude patterns for token (Regular expression)", multiple=True)
+def tag(model: str, filepath: str, allowed_failure: bool, batch_size: int, device: str, debug: bool,
+        model_path: str,
+        reset_patterns: bool, add_pattern: Iterable[str]):
     """ Tag as many [filepath] as you want with [model] """
     from tqdm import tqdm
     click.echo(click.style("Getting the tagger", bold=True))
-    tagger = sub.get_tagger(model, batch_size=batch_size, device=device, model_path=model_path)
+    try:
+        tagger = sub.get_tagger(model, batch_size=batch_size, device=device, model_path=model_path)
+    except FileNotFoundError as e:
+        click.echo("Model not found: please make sure you have downloaded the model files with "
+                   "pie-extended download " + model)
+        if debug:
+            raise e
+        return
     failures = []
     for file in tqdm(filepath):
         try:
-            sub.tag_file(model, tagger, file)
+            sub.tag_file(model, tagger, file, reset_exclude_patterns=reset_patterns,
+                         exclude_patterns=add_pattern)
         except Exception as E:
             failures.append(E)
             click.echo("{} could not be lemmatized".format(file))
52 changes: 46 additions & 6 deletions pie_extended/cli/sub.py
@@ -1,5 +1,5 @@
 import os
-from typing import Tuple, Iterable, Generator, Union
+from typing import Tuple, Iterable, List, Union
 from importlib import import_module

 import requests
@@ -11,11 +11,20 @@
 from pie.utils import model_spec


-def get_model(model):
+def get_model(model: str):
+    """ Retrieve a module given a string
+
+    :param model: Module Name
+    :return: Module
+    """
     return import_module("{}.{}".format(models.__name__, model))


-def download(module) -> Iterable[Union[str, int]]:
+def download(module: str) -> Iterable[Union[str, int]]:
+    """ Download dependencies for the given module
+
+    :param module: Module for which to download models and static files in general
+    """
     lemmatizer = get_model(module)
     os.makedirs(os.path.join(PATH, module), exist_ok=True)
     yield len(lemmatizer.DOWNLOADS)
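The first value yielded is the number of files to fetch; judging from the `Union[str, int]` hint, the remaining yields appear to identify the downloaded files, so a caller could drive a simple progress display along these lines (a sketch under that assumption):

```python
from pie_extended.cli.sub import download

gen = download("lasla")
total = next(gen)  # first yield: how many files will be downloaded
for done, item in enumerate(gen, start=1):
    # assumed: each remaining yield identifies the file that was just fetched
    print(f"[{done}/{total}] {item}")
```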
@@ -30,13 +39,23 @@ def download(module) -> Iterable[Union[str, int]]:


 def get_list() -> Iterable[Tuple[str, Metadata]]:
+    """ Retrieve a list of available modules
+    """
     for module in models.modules:
         desc = getattr(get_model(module), "DESC", None)
         if desc:
             yield module, desc


 def get_tagger(model: str, batch_size: int = 16, device="cpu", model_path=None) -> ExtensibleTagger:
+    """ Retrieve the tagger
+
+    :param model: Module of the tagger
+    :param batch_size: Size of the batch
+    :param device: Device to use (cuda/cpu)
+    :param model_path: Path to the model if you want to override the package one
+    :return: Tagger
+    """
     module = get_model(model)
     disambiguator = getattr(module, "Disambiguator", None)
     if isinstance(disambiguator, ObjectCreator):
@@ -48,10 +67,31 @@ def get_tagger(model: str, batch_size: int = 16, device="cpu", model_path=None)
     return tagger


-def tag_file(model: str, tagger: ExtensibleTagger, fpath):
+def tag_file(
+        model: str, tagger: ExtensibleTagger,
+        fpath: str,
+        reset_exclude_patterns: bool = False,
+        exclude_patterns: List[str] = None):
+    """ Tag a file with a given model
+
+    :param model: Module name of the model
+    :param tagger: Tagger that should be used
+    :param fpath: Path to the file to edit
+    :param reset_exclude_patterns: Remove all pre-registered token exclusion regular expressions
+    :param exclude_patterns: New exclude patterns to add to the data iterator (Does not require reset)
+    """
     module = get_model(model)
-    iterator, formatter = getattr(module, "get_iterator_and_formatter")()
-    tagger.tag_file(fpath, iterator=iterator, formatter_class=formatter)
+    iterator, processor = getattr(module, "get_iterator_and_processor")()
+    # Remove first pattern
+    if reset_exclude_patterns:
+        iterator.reset_patterns()
+
+    # Add new
+    if exclude_patterns:
+        for pattern in exclude_patterns:
+            iterator.add_pattern(pattern)
+
+    tagger.tag_file(fpath, iterator=iterator, processor=processor)
     return True
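A minimal sketch of calling the new signature from Python; the model name, file path and pattern are placeholders:

```python
from pie_extended.cli.sub import get_tagger, tag_file

# "lasla" and the file path stand in for whichever model and text you use
tagger = get_tagger("lasla", batch_size=16, device="cpu")
# keep the model's default exclusions and add one more token-exclusion regex
tag_file("lasla", tagger, "your_file.txt", exclude_patterns=[r"\d+"])
```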


4 changes: 2 additions & 2 deletions pie_extended/models/fro/__init__.py
@@ -1,5 +1,5 @@
-from ...utils import Metadata, File ,get_path
-from .classes import get_iterator_and_formatter
+from ...utils import Metadata, File, get_path
+from .get import get_iterator_and_processor
 from ...pipeline.iterators.proto import DataIterator

 DESC = Metadata(
152 changes: 0 additions & 152 deletions pie_extended/models/fro/classes.py

This file was deleted.
