Skip to content

Commit

Permalink
Merge efbfc7d into e5e1284
Browse files Browse the repository at this point in the history
  • Loading branch information
PonteIneptique committed Feb 22, 2020
2 parents e5e1284 + efbfc7d commit 94ca1fe
Show file tree
Hide file tree
Showing 28 changed files with 889 additions and 466 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ install:
# command to run tests
script:
- pie-extended install-addons lasla
- nosetests ./tests --with-coverage --cover-package=pie_extended --cover-xml --verbose --nologcapture
- nosetests --with-coverage --cover-package=pie_extended --cover-xml --verbose --nologcapture --with-doctest
after_success:
- coverage combine
- coveralls
53 changes: 53 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@ The current system provide an easier access to adding **customized**:
- disambiguation,
- output formatting

## Install

To install, simply do `pip install pie-extended`. Then, look at all available models.

## Run on terminal

But on top of that, it provides a quick and easy way to use other models! For example, in a shell:

```bash
Expand All @@ -26,6 +32,53 @@ pie-extended tag lasla your_file.txt

will give you access to all you need !

## Python API

You can run the lemmatizer in your own scripts and retrieve token annotations as dictionaries:

```python
from typing import List
from pie_extended.cli.sub import get_tagger, get_model, download

# In case you need to download
do_download = False
if do_download:
for dl in download("lasla"):
x = 1

# model_path allows you to override the model loaded by another .tar
model_name = "lasla"
tagger = get_tagger(model_name, batch_size=256, device="cpu", model_path=None)

sentences: List[str] = ["Lorem ipsum dolor sit amet, consectetur adipiscing elit. "]
# Get the main objects from the model: data iterator + postprocessor
from pie_extended.models.lasla import get_iterator_and_processor
for sentence_group in sentences:
iterator, processor = get_iterator_and_processor()
print(tagger.tag_str(sentence_group, iterator=iterator, processor=processor) )
```

will result in

```python
[{'form': 'lorem', 'lemma': 'lor', 'POS': 'NOMcom', 'morph': 'Case=Acc|Numb=Sing', 'treated': 'lorem'},
{'form': 'ipsum', 'lemma': 'ipse', 'POS': 'PROdem', 'morph': 'Case=Acc|Numb=Sing', 'treated': 'ipsum'},
{'form': 'dolor', 'lemma': 'dolor', 'POS': 'NOMcom', 'morph': 'Case=Nom|Numb=Sing', 'treated': 'dolor'},
{'form': 'sit', 'lemma': 'sum1', 'POS': 'VER', 'morph': 'Numb=Sing|Mood=Sub|Tense=Pres|Voice=Act|Person=3',
'treated': 'sit'},
{'form': 'amet', 'lemma': 'amo', 'POS': 'VER', 'morph': 'Numb=Sing|Mood=Sub|Tense=Pres|Voice=Act|Person=3',
'treated': 'amet'}, {'form': ',', 'lemma': ',', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': ','},
{'form': 'consectetur', 'lemma': 'consector2', 'POS': 'VER',
'morph': 'Numb=Sing|Mood=Sub|Tense=Pres|Voice=Dep|Person=3', 'treated': 'consectetur'},
{'form': 'adipiscing', 'lemma': 'adipiscor', 'POS': 'VER', 'morph': 'Tense=Pres|Voice=Dep', 'treated': 'adipiscing'},
{'form': 'elit', 'lemma': 'elio', 'POS': 'VER', 'morph': 'Numb=Sing|Mood=Ind|Tense=Pres|Voice=Act|Person=3',
'treated': 'elit'}, {'form': '.', 'lemma': '.', 'pos': 'PUNC', 'morph': 'MORPH=empty', 'treated': '.'}]
```

## Add a model

ToDo: Documentation

## Warning

This is an extremely early build, subject to change here and there. But it is functional!
9 changes: 8 additions & 1 deletion pie_extended/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,14 @@ def tag(model, filepath, allowed_failure, batch_size, device, debug, model_path)
""" Tag as many [filepath] as you want with [model] """
from tqdm import tqdm
click.echo(click.style("Getting the tagger", bold=True))
tagger = sub.get_tagger(model, batch_size=batch_size, device=device, model_path=model_path)
try:
tagger = sub.get_tagger(model, batch_size=batch_size, device=device, model_path=model_path)
except FileNotFoundError as e:
click.echo("Model not found: please make sure you have downloaded the model files with "
"pie-extended download " + model)
if debug:
raise e
return
failures = []
for file in tqdm(filepath):
try:
Expand Down
4 changes: 2 additions & 2 deletions pie_extended/cli/sub.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,8 @@ def get_tagger(model: str, batch_size: int = 16, device="cpu", model_path=None)

def tag_file(model: str, tagger: ExtensibleTagger, fpath):
    """Tag a single file using the iterator and processor of the given model.

    :param model: Name of the model whose module provides the
        ``get_iterator_and_processor`` factory.
    :param tagger: Tagger instance used to annotate the file.
    :param fpath: Path of the file to tag.
    :return: True when tagging completed.
    """
    module = get_model(model)
    # Direct attribute access instead of getattr with a constant string
    # (flake8-bugbear B009): every model module exposes this factory.
    iterator, processor = module.get_iterator_and_processor()
    tagger.tag_file(fpath, iterator=iterator, processor=processor)
    return True


Expand Down
4 changes: 2 additions & 2 deletions pie_extended/models/fro/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from ...utils import Metadata, File ,get_path
from .classes import get_iterator_and_formatter
from ...utils import Metadata, File, get_path
from .get import get_iterator_and_processor
from ...pipeline.iterators.proto import DataIterator

DESC = Metadata(
Expand Down
152 changes: 0 additions & 152 deletions pie_extended/models/fro/classes.py

This file was deleted.

21 changes: 21 additions & 0 deletions pie_extended/models/fro/get.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from .processor import FroRulesProcessor, FroGlueProcessor
from .tokenizer import FroMemorizingTokenizer
from pie_extended.pipeline.iterators.proto import DataIterator
from pie_extended.pipeline.postprocessor.memory import MemoryzingProcessor


def get_iterator_and_processor():
    """Build the data iterator and the post-processing chain for the fro model.

    :return: A ``(DataIterator, FroRulesProcessor)`` pair sharing one tokenizer.
    """
    fro_tokenizer = FroMemorizingTokenizer()
    # Processing chain, innermost first: glue morph columns, then restore
    # the tokenizer's memorized forms, then apply the rule-based fixes.
    glue = FroGlueProcessor()
    memory = MemoryzingProcessor(tokenizer_memory=fro_tokenizer, head_processor=glue)
    rules = FroRulesProcessor(apply_on_reinsert=True, head_processor=memory)
    data_iterator = DataIterator(
        tokenizer=fro_tokenizer,
        remove_from_input=DataIterator.remove_punctuation,
    )
    return data_iterator, rules

43 changes: 43 additions & 0 deletions pie_extended/models/fro/processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import regex as re
from typing import Dict

from pie_extended.pipeline.postprocessor.glue import GlueProcessor
from pie_extended.pipeline.postprocessor.rulebased import RuleBasedProcessor


class FroRulesProcessor(RuleBasedProcessor):
    """Rule-based post-processing for the Old French (fro) model.

    The fro dataset does not contain all punctuation signs: punctuation is
    removed from the model input and re-tagged automatically here. Tokens
    containing digits are forced to the ``ADJcar`` POS tag.
    """
    # Token made entirely of non-word characters, i.e. punctuation.
    PONCTU = re.compile(r"^\W+$")
    # Token containing at least one digit.
    NUMBER = re.compile(r"\d+")
    # Strong (sentence-delimiting) punctuation signs.
    PONFORT = [".", "...", "!", "?"]

    def rules(self, annotation: Dict[str, str]) -> Dict[str, str]:
        """Apply hard-coded tagging rules to a single token annotation.

        :param annotation: Token annotation; expects at least a "form" key.
        :return: A rewritten annotation for punctuation/number tokens.
            NOTE(review): falls through returning None for ordinary tokens —
            presumably the caller treats None as "keep unchanged"; verify
            against RuleBasedProcessor.
        """
        token = annotation["form"]
        if self.PONCTU.match(token):
            # Punctuation was stripped from the model input, so rebuild its
            # annotation entirely, distinguishing strong vs weak punctuation.
            if token in self.PONFORT:
                pos = "PONfrt"
            else:
                pos = "PONfbl"
            return {"form": token, "lemma": token, "POS": pos, "morph": "MORPH=empty", "treated": token}
        elif self.NUMBER.match(token):
            # BUG FIX: was annotation["pos"] (lowercase), which added a stray
            # key instead of overwriting the "POS" value used downstream
            # (FroGlueProcessor declares "POS" in OUTPUT_KEYS and maps
            # "pos" -> "POS"), so ADJcar was silently dropped from output.
            annotation["POS"] = "ADJcar"
            return annotation

    def __init__(self, *args, **kwargs):
        super(FroRulesProcessor, self).__init__(*args, **kwargs)


class FroGlueProcessor(GlueProcessor):
    """Glue the individual morphological feature columns of the fro model
    into a single "morph" column.
    """
    # Column order of each output annotation row.
    OUTPUT_KEYS = ["form", "lemma", "POS", "morph"]
    # "morph" is produced by gluing these feature columns together.
    GLUE = {"morph": ["MODE", "TEMPS", "PERS.", "NOMB.", "GENRE", "CAS", "DEGRE"]}
    # Rename raw model keys to their output spelling.
    MAP = {"pos": "POS", "NOMB": "NOMB.", "PERS": "PERS."}
    # Placeholder value when a feature is absent for a token.
    EMPTY_TAG: Dict[str, str] = {
        "CAS": "_",
        "NOMB.": "_",
        "DEGRE": "_",
        "MODE": "_",
        "TEMPS": "_",
        "GENRE": "_",
        "PERS.": "_",
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

0 comments on commit 94ca1fe

Please sign in to comment.