
Commit

(Not working) First attempt but not working. Need to debug. No text is sent apparently...
PonteIneptique committed Feb 6, 2020
1 parent 2f4ba51 commit b80ac11
Showing 2 changed files with 80 additions and 6 deletions.
84 changes: 79 additions & 5 deletions pie_extended/models/fro/classes.py
@@ -1,19 +1,77 @@
import regex as re
from typing import List
from ...pipeline.formatters.glue import GlueFormatter as SourceGlueFormatter
from pie_extended.pipeline.tokenizers.memorizing import MemorizingTokenizer as SourceMemorizingTokenizer
from pie_extended.pipeline.iterators.proto import DataIterator

# Uppercase regexp
-uppercase = re.compile("^[A-Z]$")
+_uppercase = re.compile("^[A-ZÉÈÀÂÊÎÔÛŶÄËÏÖÜŸ]$")

_Dots_except_apostrophe = r".?!\"“”\"«»…\[\]\(\)„“"
_Dots_collections = r"[" + _Dots_except_apostrophe + "‘’]"
_RomanNumber = r"(?:M{1,4}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})" \
r"(?:IX|IV|V?I{0,3})|M{0,4}(?:CM|C?D|D?C{1,3})" \
r"(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3})|M{0,4}" \
r"(?:CM|CD|D?C{0,3})(?:XC|X?L|L?X{1,3})" \
r"(?:IX|IV|V?I{0,3})|M{0,4}(?:CM|CD|D?C{0,3})" \
r"(?:XC|XL|L?X{0,3})(?:IX|I?V|V?I{1,3}))"


class MemorizingTokenizer(SourceMemorizingTokenizer):
    re_add_space_around_punct = re.compile(r"(\s*)([^\w\s\'’ʼ]+)(\s*)")
    re_add_space_after_apostrophe = re.compile(r"(\s*)([\'’ʼ])(\s*)")
    re_normalize_space = re.compile(r"(\s+)")
    re_sentence_tokenizer = re.compile(r"([_||[^\s\w]]+(?:[\s_||[\W]]+)?)", re.VERSION1)
    re_word_tokenizer = re.compile(r"[\s]+")
    _sentence_boundaries = re.compile(
        r"(?<!" + _RomanNumber + r"\.)(?<=" + _Dots_collections + r"+)(\B)(?!\." + _RomanNumber + ")"
    )
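    # Descriptive note (added for clarity): this pattern matches an empty
    # position right after strong punctuation (_Dots_collections), unless that
    # punctuation closes a Roman numeral (as in ".VII.") or is followed by a
    # dot plus a Roman numeral, so editorial numbering does not trigger a split.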

    def __init__(self):
        self.tokens = []

    @classmethod
    def _sentence_tokenizer(cls, string: str) -> List[str]:
        string = cls._sentence_boundaries.sub(r"\g<1><SPLIT>", string)
        return string.split("<SPLIT>")
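    # Illustrative behaviour (not part of the commit):
    #   _sentence_tokenizer("Il regna .VII. anz. Puis morut.")
    # should yield ["Il regna .VII. anz.", " Puis morut.", ""]: no split inside
    # ".VII.", plus a trailing empty string because the final "." also matches.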

    def word_tokenizer(self, data):
        # HERE, you need to tokenize yourself with your own function
        return data.split()

    def sentence_tokenizer(self, data):
        sentences = list()
        first_is_dot = False
        started_writing = False  # Avoids having to compute the length
        for sent in MemorizingTokenizer.re_sentence_tokenizer.split(data):
            sent = sent.strip()
            sentences.append(sent)

        yield from sentences

    def replacer(self, inp: str):
        # inp = inp.replace("U", "V").replace("v", "u").replace("J", "I").replace("j", "i").lower()
        inp = self.re_add_space_after_apostrophe.sub("", inp)
        return inp

    def normalizer(self, data: str):
        data = self.re_add_space_after_apostrophe.sub(
            r"\g<2> ",
            self.re_add_space_around_punct.sub(r" \g<2> ", data)
        )
        return data
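    # Illustrative behaviour (not part of the commit):
    #   normalizer("Si m'aït Dex!") should produce "Si m' aït Dex ! "
    # punctuation is padded with spaces and apostrophes get a trailing space.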


class GlueFormatter(SourceGlueFormatter):
-    HEADERS = ["form", "lemma", "POS", "morph"]
+    HEADERS = ["form", "lemma", "POS", "morph", "treated_token"]
    MORPH_PART = ["MODE", "TEMPS", "PERS.", "NOMB.", "GENRE", "CAS", "DEGRE"]

    PONCTU = re.compile(r"^\W+$")
    NUMBER = re.compile(r"\d+")
    PONFORT = [".", "...", "!", "?"]

-    def __init__(self, tasks):
+    def __init__(self, tasks: List[str], tokenizer_memory: MemorizingTokenizer):
+        super(GlueFormatter, self).__init__(tokenizer_memory=tokenizer_memory)
        self.tasks = tasks
        self.pos_tag = "POS"
        if "POS" not in self.tasks and "pos" in self.tasks:
@@ -31,12 +89,16 @@ def rule_based(cls, token):
    def format_line(self, token, tags, ignored=False):
        tags = list(tags)
        lemma = tags[self.tasks.index("lemma")]
        index, input_token, out_token = self.tokenizer_memory.tokens.pop(0)
        if token != out_token:
            raise Exception("The output token does not match our input: %s != %s" % (token, out_token))

        overwriten = self.rule_based(token)
        if overwriten:
            return overwriten

-        if type(self).NUMBER.match(token):
+        if type(self).NUMBER.match(token):  # This suggests sending the whole element
+            # to rule_based, and not only the token
            lemma = token
            tags[self.tasks.index(self.pos_tag)] = "ADJcar"
@@ -52,5 +114,17 @@ def format_line(self, token, tags, ignored=False):
                for morph_part in GlueFormatter.MORPH_PART
                if morph_part.replace(".", "") in self.tasks and
                tags[self.tasks.index(morph_part.replace(".", ""))] != "_"
-            ) or "MORPH=empty"
+            ) or "MORPH=empty",
+            out_token
        ]
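        # Illustrative output row (hypothetical values, not from the commit):
        #   ["oiez", "oir", "VERcjg", "MODE=imp|PERS.=2|NOMB.=p", "oiez"]
        # the extra last column is the treated token recorded by the tokenizer.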


def get_iterator_and_formatter():
    tokenizer = MemorizingTokenizer()
    formatter = GlueFormatter(tokenizer)
    iterator = DataIterator(
        tokenizer=tokenizer,
        remove_from_input=DataIterator.remove_punctuation
    )
    return iterator, formatter
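For context, a minimal sketch (not part of the commit) of how the new tokenizer could be exercised on its own; the sample text is invented and only the methods defined above are used:

from pie_extended.models.fro.classes import MemorizingTokenizer

tokenizer = MemorizingTokenizer()
text = "Si m'aït Dex! Il regna .VII. anz."
# normalizer() pads punctuation with spaces and adds a space after apostrophes
normalized = tokenizer.normalizer(text)
# sentence_tokenizer() yields one string per detected sentence
for sentence in tokenizer.sentence_tokenizer(normalized):
    # word_tokenizer() currently just splits on whitespace
    print(tokenizer.word_tokenizer(sentence))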

2 changes: 1 addition & 1 deletion pie_extended/pipeline/formatters/glue.py
@@ -13,7 +13,7 @@ class GlueFormatter(Formatter):

    def __init__(self, tokenizer_memory):
        super(GlueFormatter, self).__init__([])
-        self.tokenizer_memorytokenizer_memory = tokenizer_memory
+        self.tokenizer_memory = tokenizer_memory

    def __call__(self, tasks):
        super(GlueFormatter, self).__init__(tasks)
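As an illustration (not part of the diff): with this fix, the base formatter stores the tokenizer memory at construction time and only receives the task list later through __call__. Given a MemorizingTokenizer instance named tokenizer, that two-stage setup looks roughly like:

formatter = GlueFormatter(tokenizer_memory=tokenizer)
formatter(["lemma", "POS", "morph"])  # hypothetical task list; re-runs Formatter.__init__ with it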
