From b40030701c4368f2a9575da4ae3f115f2e46dbfd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Tue, 15 Sep 2020 16:41:33 +0200 Subject: [PATCH] (feature/capitalization) Add supports for capitalization transformation --- .gitignore | 1 + DOCUMENTATION.md | 37 +++ protogenie/configs.py | 4 +- protogenie/postprocessing.py | 234 ++++++++++++- protogenie/schema.rng | 93 +++++- protogenie/sentence_matchers.py | 43 +++ tests/test_config/capitalize.xml | 32 ++ tests/test_data/capitalize.tsv | 550 +++++++++++++++++++++++++++++++ tests/test_data/generate.py | 65 +++- tests/test_data/generic.tsv | 550 +++++++++++++++++++++++++++++++ tests/test_postprocessing.py | 187 ++++++++++- 11 files changed, 1757 insertions(+), 39 deletions(-) create mode 100644 protogenie/sentence_matchers.py create mode 100644 tests/test_config/capitalize.xml create mode 100644 tests/test_data/capitalize.tsv create mode 100644 tests/test_data/generic.tsv diff --git a/.gitignore b/.gitignore index 2091994..d766a01 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ memory.csv memory*.csv new.yaml tests/tests_output +tests/test_config/generated.xml # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/DOCUMENTATION.md b/DOCUMENTATION.md index fdee235..567ec12 100644 --- a/DOCUMENTATION.md +++ b/DOCUMENTATION.md @@ -361,3 +361,40 @@ It will produces the following output The glue token is not applied on token, the lemma value is transfered to the previous row and the POS is lost. `@glue_char` is used to concatenate columns such as `lemma` here, + +### Capitalization + +This post-processing function capitalizes (*i.e.* makes the first letter of words upper-case), randomly or always, the first +words of chunks (*i.e.* sentences) and random words inside them. It also provides an uppercase mask creation, where it +replaces uppercased letters with lowercase letters followed by the [Neutral Chess Queen UTF-8 character](https://www.compart.com/fr/unicode/U+1FA01). 
+ +The model is the following: + +```xml + + + + + + + + + + + + +``` + +1. column-token specifies the name of the column containing the raw form of the tokens +2. (Optional) column-lemma does the same thing for lemmas +3. caps-to-utf8-marker activates masking uppercased letters. +4. first-word is activated when `@when` is set to one of `always`, `random` or `ratio`. + 1. when="ratio" requires an additional `@ratio` attribute, which must be a float between .0 and 1.0 (basically a percentage) + 2. when="random" is a shortcut for when="ratio" with ratio="0.5" + 3. To identify sentences, you need to set up sentence-marker + 1. It can be name="empty_line", in which case chunks are separated by empty lines (default output) + 2. It can be name="regexp", in which case it takes a `@matchPattern` attribute (for the regular expression) + and the column that needs to be matched in `@source`, *i.e.* `<sentence-marker name="regexp" matchPattern="[.!?]" source="token"/>` +5. first-letters works with the same when/ratio attributes as first-word. It applies said capitalization + to random words inside chunks. 
+ \ No newline at end of file diff --git a/protogenie/configs.py b/protogenie/configs.py index 2e78bd9..44545e9 100644 --- a/protogenie/configs.py +++ b/protogenie/configs.py @@ -5,13 +5,13 @@ from .splitters import RegExpSplitter, LineSplitter, TokenWindowSplitter, FileSplitter, _SplitterPrototype from .reader import Reader -from .postprocessing import Disambiguation, ReplacementSet, Skip, PostProcessing, Clitic +from .postprocessing import Disambiguation, ReplacementSet, Skip, PostProcessing, Clitic, Capitalize from .toolbox import RomanNumeral import datetime from dataclasses import dataclass Splitter = Type[_SplitterPrototype] -PostProcessingClasses = [Disambiguation, ReplacementSet, Skip, RomanNumeral, Clitic] +PostProcessingClasses = [Disambiguation, ReplacementSet, Skip, RomanNumeral, Clitic, Capitalize] @dataclass diff --git a/protogenie/postprocessing.py b/protogenie/postprocessing.py index f967e63..fe09233 100644 --- a/protogenie/postprocessing.py +++ b/protogenie/postprocessing.py @@ -1,12 +1,18 @@ -if False: - from .configs import CorpusConfiguration import tempfile -import regex as re -from xml.etree.ElementTree import Element -import csv -from typing import List, ClassVar, Tuple, Dict +import math +import random from abc import ABC, abstractmethod from collections import namedtuple +from xml.etree.ElementTree import Element +import csv +from typing import List, ClassVar, Tuple, Dict, Optional, TYPE_CHECKING, Union + +import regex as re + +if TYPE_CHECKING: + from .configs import CorpusConfiguration +from .sentence_matchers import SentenceMatcherProto, SentenceRegexpMatcher +Numeric = Union[int, float] class PostProcessing(ABC): @@ -26,6 +32,74 @@ def match_config_node(cls, node: Element) -> bool: """ return node.tag == cls.NodeName + def _modify_line(self, header: List[str], values: Optional[List[str]], + file_path: str, config: "CorpusConfiguration"): + raise NotImplementedError + + def _stop_chunk(self, line: Optional[Dict[str, str]]) -> bool: + 
raise NotImplementedError + + def _chunk_modify_routine(self, file_path: str, config: "CorpusConfiguration"): + raise NotImplementedError + + def _scan_chunks(self, + file_path: str, config: "CorpusConfiguration", + sentence_matcher: Optional[SentenceMatcherProto]) -> Tuple[int, int]: + """ Analyzes the FILE for the number of chunks + + """ + chunks = 0 + tokens = 0 + with open(file_path) as file: + for nb_line, line in enumerate(file): + vals = line.strip().split(config.column_marker) + + if nb_line == 0: + header = vals + continue + + if len(header) == len(vals): + tokens += 1 + if sentence_matcher and sentence_matcher.match(header, vals): + chunks += 1 + elif sentence_matcher: + chunks += sentence_matcher.match(header, None) + + return chunks, tokens + + def _single_line_modify_routine(self, file_path: str, config: "CorpusConfiguration"): + header: List[str] = [] + temp = tempfile.TemporaryFile(mode="w+") # 2 + + try: + with open(file_path) as file: + for nb_line, line in enumerate(file): + + if not line.strip(): + temp.write(line) + self._modify_line(header, None, file_path, config) + continue + + vals = line.strip().split(config.column_marker) + + if nb_line == 0: + header = vals + temp.write(line) + continue + + modified = self._modify_line(header, vals, file_path=file_path, config=config) + temp.write( + config.column_marker.join( + [modified[head] for head in header] + ) + "\n" + ) + + with open(file_path, "w") as f: + temp.seek(0) + f.write(temp.read()) + finally: + temp.close() # 5 + class ApplyTo: def __init__(self, source: str, target: List[str]): @@ -62,7 +136,7 @@ def apply(self, file_path: str, config: "CorpusConfiguration"): header: List[str] = [] for nb_line, line in enumerate(csv_reader): # The file should already have been open if nb_line == 0: - temp.write(config.column_marker.join(line+[self.disambiguation_key])+"\n") + temp.write(config.column_marker.join(line + [self.disambiguation_key]) + "\n") header = line continue elif not line: @@ -79,7 
+153,7 @@ def apply(self, file_path: str, config: "CorpusConfiguration"): lines[self.lemma_key] = self.match_pattern.sub("", lines[self.lemma_key]) else: lines[self.disambiguation_key] = self.default_value - temp.write(config.column_marker.join(list(lines.values()))+"\n") + temp.write(config.column_marker.join(list(lines.values())) + "\n") with open(file_path, "w") as f: temp.seek(0) f.write(temp.read()) @@ -120,7 +194,7 @@ def apply(self, file_path: str, config: "CorpusConfiguration"): header: List[str] = [] for nb_line, line in enumerate(csv_reader): # The file should already have been open if nb_line == 0: - temp.write(config.column_marker.join(line)+"\n") + temp.write(config.column_marker.join(line) + "\n") header = line continue elif not line: @@ -140,7 +214,7 @@ def apply(self, file_path: str, config: "CorpusConfiguration"): else: # Otherwise, we just set the target value using this value lines[target] = self.replacement_pattern - temp.write(config.column_marker.join(list(lines.values()))+"\n") + temp.write(config.column_marker.join(list(lines.values())) + "\n") with open(file_path, "w") as f: temp.seek(0) f.write(temp.read()) @@ -162,7 +236,7 @@ class Skip(PostProcessing): NodeName = "skip" def __init__( - self, match_pattern: str, source: str + self, match_pattern: str, source: str ): super(Skip, self).__init__() self.match_pattern: re.Regex = re.compile(match_pattern) @@ -177,7 +251,7 @@ def apply(self, file_path: str, config: "CorpusConfiguration"): header: List[str] = [] for nb_line, line in enumerate(csv_reader): # The file should already have been open if nb_line == 0: - temp.write(config.column_marker.join(line)+"\n") + temp.write(config.column_marker.join(line) + "\n") header = line continue elif not line: @@ -190,7 +264,7 @@ def apply(self, file_path: str, config: "CorpusConfiguration"): if self.match_pattern.search(lines[self.source]): continue - temp.write(config.column_marker.join(list(lines.values()))+"\n") + 
temp.write(config.column_marker.join(list(lines.values())) + "\n") with open(file_path, "w") as f: temp.seek(0) @@ -214,7 +288,7 @@ class Clitic(PostProcessing): Transfer = namedtuple("Transfer", ["col", "glue"]) def __init__( - self, match_pattern: str, source: str, glue: str, transfers: List[Tuple[str, bool]] + self, match_pattern: str, source: str, glue: str, transfers: List[Tuple[str, bool]] ): super(Clitic, self).__init__() self.match_pattern: re.Regex = re.compile(match_pattern) @@ -239,7 +313,7 @@ def apply(self, file_path: str, config: "CorpusConfiguration"): modifications: List[Tuple[int, Dict[str, Tuple[str, str]]]] = [] for nb_line, line in enumerate(csv_reader): # The file should already have been open if nb_line == 0: - temp.write(config.column_marker.join(line)+"\n") + temp.write(config.column_marker.join(line) + "\n") header = line continue elif not line: @@ -253,7 +327,7 @@ def apply(self, file_path: str, config: "CorpusConfiguration"): temp.write("\n".join([ config.column_marker.join(list(l.values())) for l in sequence - ])+"\n") + ]) + "\n") sequence = [] modifications = [] continue @@ -264,7 +338,7 @@ def apply(self, file_path: str, config: "CorpusConfiguration"): if self.match_pattern.match(lines[self.source]): modifications.append( ( - len(sequence) - 1 -len(modifications), + len(sequence) - 1 - len(modifications), {key: (keep, lines[key]) for (key, keep) in self.transfers} ) ) @@ -293,3 +367,129 @@ def from_xml(cls, node: Element) -> "Clitic": for tr in node.findall("transfer") ] ) + + +class Capitalize(PostProcessing): + """ Applies capitalization strategies to content + """ + NodeName = "capitalize" + Marker: str = "🨁" # NEUTRAL CHESS QUEEN + RE_Upper: re.Regex = re.compile("(\p{Lu})") + + def __init__(self, first_word: Numeric, first_letters: Numeric, + column_token: str, + column_lemma: Optional[str] = None, + apply_unicode_marker: bool = False, + sentence_matcher: Optional[SentenceMatcherProto] = None): + + self.first_word: Numeric = 
first_word + self.first_letters: Numeric = first_letters + + self.column_token: str = column_token + self.column_lemma: Optional[str] = column_lemma + self.apply_unicode_marker: bool = apply_unicode_marker + self.sentence_matcher: Optional[SentenceMatcherProto] = sentence_matcher + self.first_word_state: bool = True # Variable representing the current status + # (True = next word is a first word) + + self._files_chunks: Dict[str, List[bool]] = {} + self._files_tokens: Dict[str, List[bool]] = {} + + @staticmethod + def parse_when(value: str, ratio: Optional[str]) -> Numeric: + if value == "always": + return 1 + elif value == "never": + return 0 + elif value == "random": + return 0.5 + elif ratio: + try: + if 1.0 > float(ratio) > .0: + return float(ratio) + except: + raise ValueError("Your ration value is probably wrong. They must be < 1.0 (Found: {})".format(ratio)) + raise ValueError("Invalid parameters for a ratio or an application") + + @classmethod + def parse_node_including_when(cls, node: Element, name: str) -> Tuple[Numeric, Optional[Element]]: + target = node.findall("./{name}".format(name=name)) + if target: + return cls.parse_when(target[0].attrib["when"], target[0].attrib.get("ratio")), target[0] + return 0, None + + def _modify_line(self, header: List[str], values: Optional[List[str]], + file_path: str, config: "CorpusConfiguration") -> Dict[str, str]: + if self.first_word and self.sentence_matcher.match(header, values): + self.first_word_state = True + return dict(zip(header, values)) + + if not values or len(header) != len(values): + return {} + + line = dict(zip(header, values)) + + # Sentence starts + if self.first_word > .0 and self.first_word_state and self._files_chunks[file_path].pop(): + line[self.column_token] = line[self.column_token].capitalize() + # Need to pop tokens as well + if self.first_letters: + self._files_tokens[file_path].pop() + elif self.first_letters > .0 and self._files_tokens[file_path].pop(): + line[self.column_token] = 
line[self.column_token].capitalize() + + line[self.column_token] = self.RE_Upper.sub(self._replace_caps, line[self.column_token]) + + self.first_word_state = False + return line + + def _replace_caps(self, value): + return value.group().lower()+self.Marker + + @staticmethod + def _transform_to_bool_list(count: int, ratio: Numeric) -> List[bool]: + if ratio == 1.0: + return [True] * count + elif ratio == .0: + return [False] * count + else: + positives = min(round(count * ratio), count) + negatives = count - positives + out = [True] * positives + [False] * negatives + random.shuffle(out) + return out + + def apply(self, file_path: str, config: "CorpusConfiguration"): + # We scan the files + chunks, tokens = self._scan_chunks(file_path, config, sentence_matcher=self.sentence_matcher) + + # We store the dispatch of booleans + if self.first_word > .0: + self._files_chunks[file_path] = self._transform_to_bool_list(chunks, self.first_word) + if self.first_letters > .0: + self._files_tokens[file_path] = self._transform_to_bool_list(tokens, self.first_letters) + + self._single_line_modify_routine(file_path=file_path, config=config) + + @classmethod + def from_xml(cls, node: Element) -> "Capitalize": + first_word, first_word_elem = cls.parse_node_including_when(node, "first-word") + first_letters, _ = cls.parse_node_including_when(node, "first-letters") + sentence_marker = None + if first_word != .0: + try: + sentence_marker = SentenceMatcherProto.from_xml( + first_word_elem.findall("./sentence-marker")[0] + ) + except IndexError: + print("You forgot to use a sentence marker.") + raise Exception + + return cls( + first_word=first_word, + first_letters=first_letters, + sentence_matcher=sentence_marker, + apply_unicode_marker=node.attrib.get("utf8-marker-for-caps", "true").lower() == "true", + column_token=node.attrib["column-token"], + column_lemma=node.attrib.get("column-lemma") + ) diff --git a/protogenie/schema.rng b/protogenie/schema.rng index 4b55717..de0ad78 100644 --- 
a/protogenie/schema.rng +++ b/protogenie/schema.rng @@ -69,9 +69,27 @@ + + + + + + + + + + NFD + NFKD + NFC + NFKC + + + + + Deals with enclitics @@ -102,6 +120,65 @@ + + + + + always + never + random + + + + + + ratio + + + + Needs to be smaller or equal to 1 + + + + + + + A node for setting sentence recognition + + + + + empty_line + + + + + + + Define capitalization strategies + + + + + + Convert capitals to lower-case + UTF8 marker. It inserts the 🨁 character after capitals (NEUTRAL CHESS QUEEN) + + + + + + + Capitalize first word + + + + + Capitalize first letter + + + + + Functions pre-added to Protegeneia @@ -273,6 +350,16 @@ + + Regexp choice that applies a matchPattern on given column + + + regexp + + + + + @@ -287,11 +374,7 @@ - - regexp - - - + Each empty line represent a new passage diff --git a/protogenie/sentence_matchers.py b/protogenie/sentence_matchers.py new file mode 100644 index 0000000..9cbd998 --- /dev/null +++ b/protogenie/sentence_matchers.py @@ -0,0 +1,43 @@ +__all__ = ["SentenceMatcherProto", "SentenceRegexpMatcher", "SentenceEmptyLineMatcher"] + +from abc import ABC +from xml.etree.ElementTree import Element +from typing import Optional, List + +import regex as re + + +class SentenceMatcherProto: + def match(self, headers: List[str], values: Optional[List[str]]): + raise NotImplementedError() + + @classmethod + def from_xml(cls, node: Element) -> "SentenceMatcherProto": + if node.attrib["name"] == "regexp": + return SentenceRegexpMatcher.from_xml(node.xpath("./sentence-marker")[0]) + return SentenceEmptyLineMatcher() + + +class SentenceRegexpMatcher(SentenceMatcherProto): + def __init__(self, regexp: re.Regex, column: str): + self.regexp: re.Regex = regexp + self.column: str = column + + def match(self, headers: List[str], values: Optional[List[str]]): + data = dict(zip(headers, values)) + return self.regexp.match(data[self.column]) is not None + + @classmethod + def from_xml(cls, node: Element) -> "SentenceRegexpMatcher": + return cls( + 
regexp=re.compile(node.attrib["regexp"]), + column=node.attrib["column"] + ) + + +class SentenceEmptyLineMatcher(SentenceMatcherProto): + def __init__(self): + pass + + def match(self, headers: List[str], values: Optional[List[str]]): + return not values or len(headers) != len(values) diff --git a/tests/test_config/capitalize.xml b/tests/test_config/capitalize.xml new file mode 100644 index 0000000..5955ca6 --- /dev/null +++ b/tests/test_config/capitalize.xml @@ -0,0 +1,32 @@ + + + + +
+ token + lemma + pos +
+
+ + + + + + + + + +
+ 2 + 0 + 1 +
+
+ + + +
+ + + \ No newline at end of file diff --git a/tests/test_data/capitalize.tsv b/tests/test_data/capitalize.tsv new file mode 100644 index 0000000..786b6ff --- /dev/null +++ b/tests/test_data/capitalize.tsv @@ -0,0 +1,550 @@ +lem_gfdqnu pos_nwuzjq tok_pdsbeb +lem_jtuhwc pos_qkvpdq tok_ebxvhz +lem_guxqgv pos_ehupak tok_jdswrg +lem_sjggqt pos_htayhs tok_oyrpif +lem_vhaocb pos_bnvexi tok_uwqjtj +lem_wljofl pos_uwlevn tok_xijnla +lem_pyzxcr pos_wcngiu tok_qqqzbb +lem_vmhdhe pos_injexe tok_kjthhu +lem_mzipht pos_wihweo tok_wmhaey +lem_uvxkzg pos_pufjla tok_ixrckf + +lem_jtphzk pos_pmcitq tok_dnkwur +lem_dzzgwa pos_gnginp tok_smqdlt +lem_dfvmoh pos_thsqcw tok_czqghu +lem_ylldjx pos_vpnzdz tok_mgqdog +lem_gdcymo pos_qizuyj tok_mpoyge +lem_blswmx pos_ofewxd tok_yrhazm +lem_vbmwsn pos_pcbrqn tok_hyndvw +lem_rsigog pos_hsbjaa tok_bncvto +lem_uizcsj pos_gxyohx tok_ckqjwy +lem_anreid pos_xofyih tok_ivekmd + +lem_mfjurv pos_iuvqwl tok_rtlxxu +lem_udnmaj pos_kxiyzl tok_wuzxxc +lem_clmsdl pos_rqplkq tok_welcvi +lem_vyseky pos_dcpbem tok_heyjym +lem_hxbufp pos_wxpsqy tok_xurumg +lem_zbwpxc pos_bmluia tok_ykqaom +lem_gfeafu pos_xhjhwu tok_hpbiur +lem_ommham pos_gzvjcb tok_boehfy +lem_lxvuzv pos_foadfp tok_xpfyxf +lem_eckpfg pos_rlttsv tok_yyauxh + +lem_ehejdy pos_fkefzu tok_cowfrs +lem_wzndfl pos_fwxmlr tok_fxjlax +lem_uirpeg pos_cubpto tok_gxppbk +lem_ldmmwy pos_mzfrsw tok_kceqfn +lem_oeehst pos_pzjxga tok_xcrmgg +lem_waehpw pos_hqwknv tok_iuhaqi +lem_yqmcym pos_welheb tok_ozaipr +lem_uizoap pos_wtsecz tok_xeklhd +lem_elqjaf pos_xizycu tok_yfwiys +lem_oxbpma pos_hvejep tok_xgmsfz + +lem_kvonsd pos_acwxgm tok_ssbbsk +lem_drxcvj pos_oxxoih tok_hqwmpn +lem_frvtls pos_yuvdbn tok_nvafrq +lem_ihzvwr pos_huxeyz tok_uavkty +lem_pukxrr pos_ngcjfm tok_kiezrp +lem_wjmgcn pos_nyvleq tok_wpdkuk +lem_ixakwb pos_qedopt tok_hqmzxa +lem_fyjaic pos_velfce tok_ljcyrl +lem_ithvcs pos_qyjmzo tok_lgpfek +lem_olqbjx pos_hhaooc tok_ixqaah + +lem_avfwrs pos_jecmuf tok_ezidmw +lem_zlaiff pos_wvbwgv 
tok_hpurdm +lem_wyqkuu pos_bwxveg tok_txoyqf +lem_zropgk pos_yrmmgv tok_hwhagm +lem_uueify pos_nnvfxt tok_zgzpjp +lem_fpwefq pos_uolyfd tok_bdoanb +lem_skyiuu pos_upgzkh tok_pmyggs +lem_ohjrbd pos_snnezp tok_rczdgg +lem_xkccmp pos_oefqfz tok_paxfjn +lem_njkemd pos_fslnyu tok_ykhung + +lem_qiozfx pos_aztlxz tok_zigmkf +lem_djxxdu pos_imssir tok_oaqwmu +lem_mvbuiq pos_wnltzf tok_apynxb +lem_zinbsr pos_guiiql tok_itjxly +lem_pppsal pos_fgmnxp tok_eouvcv +lem_zlwwzn pos_sbrkip tok_yairqa +lem_kxafmo pos_tahrqi tok_haamwe +lem_kuqyxs pos_nnxnis tok_joavzm +lem_bfdfkr pos_dkjomn tok_wmppol +lem_ozcdjl pos_livcam tok_jolxma + +lem_gcqjba pos_oexwyf tok_doqyhm +lem_fntvul pos_aaquyc tok_xwrhvp +lem_umdplq pos_ktbxir tok_ekqehw +lem_jlwmol pos_fwlyvv tok_twzgrz +lem_uxcskg pos_fpktiw tok_piuzzu +lem_hsexdj pos_iwfxgq tok_jvlwgf +lem_atoxnh pos_hbcazn tok_jsqqxs +lem_annswt pos_dzaaev tok_ajctrg +lem_ifofsa pos_mudsye tok_urkpiu +lem_xikciw pos_syvqqc tok_etbvum + +lem_prsbxb pos_ynrcao tok_zddvgs +lem_onskyy pos_lfzsnd tok_wvgtey +lem_xuxtls pos_tkedxb tok_hgrctv +lem_tpmfdm pos_lipmdq tok_xbgrey +lem_ktqkgr pos_nxluxn tok_bithif +lem_zkhxhp pos_mbsoji tok_amddgu +lem_kyfaws pos_eaftpm tok_bhwvlh +lem_wzyjdt pos_anpbgf tok_oynffv +lem_ydgmla pos_xgdoql tok_nnwcrg +lem_hmmkut pos_djuwct tok_qqjmhm + +lem_fjgkar pos_tqhsjt tok_iikqst +lem_wuwaqx pos_rhaxfn tok_ebpybx +lem_vijhlp pos_ezddcb tok_exkcff +lem_wpmesr pos_wyzeof tok_ouujmp +lem_rlsmrl pos_hrlhmg tok_bsmjps +lem_bawppm pos_tppuyt tok_ptdpzk +lem_ffvgll pos_qoyslz tok_evadlh +lem_xjhrsf pos_mmtjot tok_efiswk +lem_mznezl pos_ygixbi tok_epdico +lem_mwjnak pos_rqjenm tok_aaghdw + +lem_cfugrn pos_thkymx tok_gyagep +lem_cravnv pos_hixfyo tok_bhanmo +lem_rdplck pos_lmifvp tok_ngmgfa +lem_qtuhak pos_zchxwy tok_dhtcal +lem_qeqjwv pos_dkmfbt tok_rdfsym +lem_pjkmak pos_tftbhs tok_neukiv +lem_pcetew pos_pvulyn tok_hqfkyn +lem_ajoxth pos_zruhqw tok_ebsfpd +lem_osvhzh pos_scckey tok_dishno +lem_hyursx pos_rshgty tok_ysxzwq + 
+lem_lqpirp pos_jbpolq tok_vazafx +lem_rvdeuo pos_dzmccw tok_gegpks +lem_jwqffg pos_lnxodl tok_jgxgpz +lem_rjwuwy pos_cadats tok_yeqiow +lem_ueesqu pos_nwiiwv tok_pohhea +lem_emfuar pos_jvirac tok_abqqji +lem_ogfjjh pos_szsuza tok_tabgvm +lem_pndamg pos_rwgldl tok_tqudai +lem_iywpiq pos_djndzr tok_hxjklu +lem_vqcqqo pos_onxyxw tok_zugext + +lem_ajemad pos_abjkzf tok_rjevuf +lem_bxqhej pos_rtzgmx tok_irrktm +lem_druavn pos_nujegv tok_ectxkw +lem_lyasdg pos_mnutvg tok_fwvvrc +lem_sfznmk pos_lfxwgr tok_szaavf +lem_sbsjdd pos_dozpjr tok_zbzknd +lem_kvgles pos_fhogxy tok_taccsm +lem_fcqtpx pos_tmgcrb tok_qbkikd +lem_psvbfk pos_jqyskj tok_rduell +lem_fbgnsk pos_itvrlr tok_urfspk + +lem_zlnksr pos_qniadq tok_mnhifo +lem_gewmtg pos_iooisd tok_mkedwh +lem_vppxcu pos_mfhaax tok_cqjpij +lem_gvifwv pos_xkxkfm tok_hvbudb +lem_sladpx pos_qshzaw tok_ydnmrs +lem_bnopal pos_rxxfww tok_gmbium +lem_donmkk pos_vewkdf tok_nynzer +lem_yeemuc pos_nnurus tok_edvwtu +lem_xgvluo pos_mmkxck tok_ydmfge +lem_jutsbb pos_sqftce tok_vzawpp + +lem_jhqoqs pos_ihmyxb tok_yfyrea +lem_nkvlnx pos_aqvhio tok_nnelpw +lem_gxkkoc pos_daltlm tok_jorbqw +lem_ulrtdw pos_srnqjs tok_zfvikl +lem_uabzia pos_toagij tok_qxtcpy +lem_tzwyub pos_dtjufi tok_ixvpvt +lem_fusqdv pos_jtqthi tok_hzukad +lem_ndyycw pos_zvyagi tok_dpnsss +lem_fgoitw pos_wmaosb tok_sfkntx +lem_ujsciq pos_xshrpc tok_ebzffk + +lem_xutjil pos_adclgo tok_upezoo +lem_ixaanl pos_llfvjy tok_dtsfgg +lem_phwitw pos_pgioae tok_cyjwtq +lem_qmcwig pos_ecytfd tok_ufagep +lem_bbflme pos_iqjpts tok_czdsri +lem_deneph pos_igdlib tok_bmaizm +lem_ocnfid pos_saklbk tok_xokkhl +lem_geiuof pos_wstmda tok_nlcyec +lem_iwuxxw pos_dgzjoy tok_cdbnnm +lem_nvxuvz pos_uhodcc tok_dnvvwg + +lem_cqnsbo pos_pwadgr tok_kortuj +lem_anvqxk pos_yccvjf tok_ampwll +lem_nwvyas pos_tbydky tok_mrcayc +lem_cyptsq pos_iphsup tok_kztqgr +lem_fxsbex pos_npemmd tok_davhpt +lem_aekkky pos_hguizw tok_jpklwv +lem_nlprwl pos_pvinyd tok_gmdagq +lem_xxobjw pos_nkrsfd tok_bujtdl +lem_zxokri 
pos_uirync tok_rhnaim +lem_ehavud pos_unssnr tok_azrcug + +lem_laagva pos_wbmkga tok_plrqcy +lem_tnabku pos_tiindw tok_ogxljr +lem_cmbgsk pos_cdywvr tok_xcbwrw +lem_hnnnij pos_qrfymf tok_crwhuq +lem_iqzezq pos_ykndtn tok_irqwns +lem_skzyai pos_mxlyrs tok_wlurhe +lem_fnxwvk pos_bnnijw tok_bkkmji +lem_svgooe pos_krgptc tok_jrqtry +lem_assxwo pos_omdyzx tok_sgvsxv +lem_hvdgwz pos_vrgeow tok_tonrvq + +lem_ticbwy pos_snivqr tok_lyuygc +lem_dsmjsj pos_rbhzpc tok_wxixkc +lem_skclnj pos_wibgje tok_njlufj +lem_atelfh pos_gywost tok_jpbieo +lem_idetrp pos_mhvsob tok_abulej +lem_tcalax pos_nedduk tok_paoezt +lem_bncazo pos_amfnxh tok_jxdmpq +lem_nwdaon pos_zvwvdk tok_jtzspc +lem_kbfksu pos_gzgdoq tok_pkvgoy +lem_gmzvkk pos_irzvnj tok_cnseuw + +lem_tzlsch pos_bkjvqo tok_qqfszl +lem_fytfer pos_jlyhci tok_hljuwb +lem_fqzmxm pos_ucdwkn tok_bltjkt +lem_zlavqs pos_pnauav tok_fqliys +lem_upyjpk pos_ljgesp tok_ebbmgl +lem_goqobs pos_bmuenn tok_fncglz +lem_hudmem pos_kpbjpq tok_vimhsd +lem_fzifbh pos_aipmub tok_ntxgsc +lem_krzzbl pos_scfjmu tok_pbiemy +lem_ptegqc pos_doglkq tok_ipmpiq + +lem_klmyfi pos_occvyo tok_tshwre +lem_hikvdz pos_edbhdc tok_nnldcw +lem_ypdhrw pos_mtjrcc tok_mbcqcg +lem_fqksnl pos_jwfxnp tok_cypayv +lem_evcfsi pos_dttldp tok_webnbk +lem_ibyzvt pos_qewpgj tok_yrjlhg +lem_qkgrvm pos_ktptyv tok_kadqgf +lem_nxcqvc pos_wdlzbr tok_froxou +lem_nhvqen pos_kkfevq tok_survdh +lem_moouml pos_wzpyxo tok_bpqtpm + +lem_ojplcl pos_zyszct tok_zgmelo +lem_hgqjnp pos_ouflgq tok_fxjxpi +lem_djfjop pos_ziljar tok_xmpmcq +lem_jeoykt pos_xksutm tok_jfoyoq +lem_hequrp pos_objiod tok_xpcixd +lem_bjodxt pos_ccrnem tok_oqibha +lem_wbrifc pos_jlmhnq tok_xgyutd +lem_rdgoyk pos_cleaau tok_htrzdi +lem_ukwxbt pos_oxixux tok_fjwoar +lem_bffybk pos_qgiflp tok_orilxp + +lem_dzfpzx pos_mfzxlp tok_pehmff +lem_hfqgig pos_nkjvhi tok_ouofwp +lem_ssavei pos_yosrny tok_ptcybg +lem_peahtp pos_mzlfjg tok_pyiaej +lem_bthzlb pos_mtfqms tok_edqcrm +lem_tifmlv pos_svkguv tok_wnllfv +lem_iajvko pos_qihveq 
tok_zoeylb +lem_rnmfta pos_jqhmhr tok_qnfefw +lem_zbvoet pos_pyqtzr tok_jvvlfu +lem_vxiant pos_uollaq tok_hmlkab + +lem_pvzaap pos_kwjlav tok_hloelw +lem_ineheu pos_chtsfm tok_oriwno +lem_mhwkcz pos_wbjycg tok_ajjtyn +lem_dhkidv pos_mnrlqg tok_dydyzf +lem_uzerdx pos_uwtnpj tok_nfrasy +lem_kimtmg pos_dosyzo tok_ilmbes +lem_kfgynq pos_vrpcgt tok_hlybfg +lem_pnvdmh pos_fqinev tok_xfooor +lem_jsjzaj pos_risrfh tok_sdqucs +lem_jlaejs pos_psudfw tok_hycysr + +lem_qgqsct pos_ksspir tok_csbvjf +lem_rmnpfz pos_dyvrkq tok_vocljk +lem_qmappz pos_uhfxyg tok_wrldjy +lem_fdydmo pos_duwjng tok_esvrln +lem_tyifjh pos_ywyzqq tok_nnpkto +lem_rivwnz pos_ublpfh tok_vfvtpx +lem_qhxgdc pos_oewtgi tok_fugmmi +lem_ftfsgs pos_tfirgw tok_knunat +lem_chixfg pos_negxro tok_rdcjdx +lem_uzqrej pos_uvposr tok_uohnip + +lem_jxemra pos_rkbrbk tok_ifhxce +lem_olzhtv pos_jfrywh tok_yhvihd +lem_ydgohk pos_hszrqx tok_tlarpc +lem_zqglyf pos_ecedbz tok_lemvxl +lem_ypjaxo pos_zfqntt tok_rqbccq +lem_swxsyq pos_wlgmcg tok_gdkilr +lem_arpdhf pos_ghimin tok_logvyv +lem_xmzdmn pos_dmfrit tok_edstut +lem_uorbwq pos_sftqsm tok_sanfoa +lem_pczgzy pos_qoezvl tok_zaptlx + +lem_lfxiwb pos_djgaer tok_oivhxm +lem_gkojxc pos_emybwc tok_yvptil +lem_jkaufn pos_ohlrgi tok_izmquh +lem_bbwrkq pos_glsqyo tok_iuzqwl +lem_dnpbzc pos_qxptxb tok_qwyfem +lem_lewjlv pos_ywqhum tok_ixjyol +lem_mqqfli pos_wqsuny tok_etheob +lem_bmlwcn pos_gaxxdc tok_gxhrkp +lem_ogdatb pos_povwgi tok_bdhwqb +lem_ywllkd pos_reegdz tok_bhhlgo + +lem_nlllvc pos_ifwccy tok_hpofjd +lem_rvbtak pos_zkvflq tok_szpltj +lem_jxgesb pos_fiytll tok_mbrmdl +lem_otawdu pos_kjdjbp tok_pykwfn +lem_ijfzdv pos_ghayen tok_bwnkny +lem_zpvgcp pos_nlkqey tok_nwstwc +lem_snrvbm pos_yssbop tok_khlcdg +lem_tiesze pos_mtvjen tok_sdvzuv +lem_ctsgno pos_dawnws tok_pofuep +lem_hjnwef pos_fprfsp tok_lsjguh + +lem_fxbmid pos_cobxim tok_mekzbl +lem_ovmyis pos_mpqdzw tok_nijxgd +lem_ooscrj pos_bhnasz tok_efzveb +lem_jjrcaq pos_yrjlnz tok_pxoyxj +lem_zruscz pos_jqlsfz tok_xgkxog 
+lem_yetvik pos_havmzy tok_hpnpph +lem_viyswd pos_kjsboc tok_frehzz +lem_njrseq pos_nizvov tok_zwqtaw +lem_cyzlqz pos_iuvhcf tok_byybvp +lem_hatdhz pos_jxrypo tok_dcszlg + +lem_dzqrub pos_xcggki tok_qxdmyl +lem_jgsfzd pos_lcyjne tok_ymorme +lem_zrahry pos_ykjkgs tok_iuyyoz +lem_ujqjpc pos_mwkvez tok_zavudf +lem_ylyero pos_xkqvdv tok_kueygb +lem_akrtkf pos_upphpq tok_lozazk +lem_tynjxv pos_hnfisf tok_nlfpod +lem_aynhvz pos_jhfdez tok_vjwgrx +lem_dvrvoz pos_ghbton tok_eijeqe +lem_rwnckj pos_qxtbrp tok_hnymqy + +lem_yeezsf pos_edgtfl tok_huepha +lem_fnbmwa pos_xexppm tok_jnbhtt +lem_cdvath pos_neusbc tok_goeiev +lem_gzxbkl pos_wkwjrs tok_nrcybz +lem_aalrba pos_bziloj tok_vmhqob +lem_ubyqps pos_wgtscc tok_djigqo +lem_etcvry pos_viqkgx tok_ccwsez +lem_rwmldm pos_gpqhwa tok_fdodho +lem_chcwrl pos_srwglr tok_rvdfge +lem_hevyyu pos_kmrwse tok_jqykzl + +lem_dgrwgx pos_ekpepq tok_nbaaxm +lem_quehtr pos_pbtwtz tok_wtaeak +lem_xjacfo pos_jvycod tok_yklfvp +lem_tqpycj pos_ehfdln tok_exkbvj +lem_rqzwuj pos_cgcqdj tok_syzzow +lem_hdxrwc pos_nbhrgt tok_sxvjlo +lem_sfvquy pos_bevjtv tok_zetzen +lem_oexsez pos_sbasan tok_mdcsdi +lem_lnszds pos_hhfxuc tok_vlgtji +lem_miujhn pos_frqxep tok_gedpqp + +lem_saahbi pos_wxfkrp tok_tegsiq +lem_zzqnzm pos_vdemgo tok_prrcap +lem_aemczs pos_fhaeff tok_txgkqe +lem_kwhznc pos_syakvr tok_kogsme +lem_vwkozu pos_rijwtb tok_usowkw +lem_fnzqji pos_katfwf tok_kiuttf +lem_msucrn pos_lzhcej tok_jeiecc +lem_iheaua pos_retrne tok_iukohv +lem_hjrleb pos_eppbih tok_owrxoo +lem_juavaj pos_ukicbx tok_ycgtfl + +lem_prtjto pos_bzegei tok_uyblvv +lem_scseuz pos_titmtm tok_lhmzbp +lem_oixgqw pos_yyfmko tok_jhojkh +lem_uyeqjr pos_hljdgn tok_mljgaj +lem_ddxibu pos_akoluc tok_bovioq +lem_cehnmv pos_qwdqtn tok_dtvkvt +lem_xuxfji pos_zlsndz tok_qiogqv +lem_jnytve pos_gfkxlt tok_inlzat +lem_zfhbxe pos_xobyju tok_lcbacs +lem_sjlbas pos_ttfjul tok_sqkwlg + +lem_awjirn pos_ouhwax tok_mqsghi +lem_ofeegs pos_smxjgx tok_rfolii +lem_atswby pos_fenkvc tok_gqwikq +lem_pgxdiw 
pos_dsrwjc tok_tflanb +lem_qzhmel pos_rhztps tok_qluigd +lem_bvdbmp pos_zncjcd tok_vsuewx +lem_sgxcxc pos_epblbc tok_dcmpck +lem_xdvgiz pos_doaoyj tok_mpuqew +lem_odyrfg pos_nhhexe tok_pdzelw +lem_ylzdgt pos_csyilk tok_bppluv + +lem_ldleif pos_nqrjxg tok_llyips +lem_kymady pos_kcjbvp tok_xrcbnt +lem_kklwpq pos_wfievh tok_nyxktk +lem_yioeks pos_syomgn tok_nsymqf +lem_uopydm pos_ptjzva tok_giubnu +lem_hhyqgy pos_gvwdcy tok_kvkuks +lem_qrztso pos_clcsna tok_vaewey +lem_nhhwzj pos_ezgloi tok_eblrbl +lem_yyjamz pos_xercqz tok_jnosbg +lem_cojxgh pos_zingfl tok_wmpjhu + +lem_luxbuk pos_cxonar tok_cwplgr +lem_qatioz pos_dylzyf tok_pcapsv +lem_dzufzk pos_ytspme tok_gxditb +lem_yubxcy pos_uzwutg tok_sjajgt +lem_gcozwx pos_beyboh tok_kktcrs +lem_hanobv pos_qnsyiv tok_bsfxkz +lem_elkdde pos_wzrtnd tok_hoxyrh +lem_jzdimc pos_zkijnh tok_zlvbkr +lem_hmmmlv pos_lnlfea tok_jjbaaz +lem_tzqmkq pos_vjhhxt tok_ruslel + +lem_mjqeom pos_gebztq tok_aerazk +lem_upfusg pos_uymcnl tok_czdbnp +lem_hllsio pos_adizyc tok_qphapl +lem_awlgsj pos_fszhpv tok_sxsdzv +lem_xeuaam pos_ddjbrb tok_luddyr +lem_tpybvy pos_bbopwa tok_edwrrd +lem_vgoehd pos_xuypjw tok_efglsk +lem_zzfczt pos_vjatsq tok_emycex +lem_szcxpt pos_nojbqn tok_edhumk +lem_ispltv pos_tsgpnu tok_rpcvjq + +lem_mbkvhi pos_smppuu tok_pxndhj +lem_meivyh pos_gptann tok_sdjffp +lem_nxascw pos_thoscw tok_pzwxko +lem_xphiue pos_mubpmk tok_ddwxhy +lem_vkrwdz pos_eehypc tok_duejpg +lem_zjfyxc pos_zxizul tok_hdcxcl +lem_mulutt pos_oqcxhb tok_htymmg +lem_xjnxbv pos_hvvefp tok_waszok +lem_iiuvxl pos_ugokos tok_hajzte +lem_slueez pos_doxobb tok_scxheq + +lem_oqipqu pos_wirxjd tok_xoddxt +lem_wynqzb pos_gzuzum tok_yawpve +lem_lwighy pos_ghtfcr tok_hudyiu +lem_jrycuz pos_nlfuwx tok_mdjomx +lem_nzhlcs pos_vnbobw tok_lsiqpr +lem_djmiqw pos_dqnjft tok_ycwdbr +lem_qbiifh pos_zsejfw tok_iohtrz +lem_gquaos pos_vrrfeh tok_goglgo +lem_sljycv pos_ugcecq tok_mooblo +lem_ukztbi pos_fvqrwt tok_fvekhb + +lem_xyjecp pos_bngwhd tok_aahvsy +lem_gqtips pos_ujjrcw 
tok_aeeagn +lem_uzmhzh pos_sdrpev tok_yasdcw +lem_aifxsa pos_pgzaro tok_dvrxuz +lem_kdgtlg pos_nmbpkv tok_qcmfhq +lem_akitwl pos_iqemah tok_rupfks +lem_zrnmed pos_jrztme tok_wjvtlr +lem_vrbkpt pos_zrdrat tok_lyqree +lem_prncga pos_gprvez tok_hurmbi +lem_dlmgek pos_vkyrco tok_cxrptf + +lem_bxskvg pos_zmnhqk tok_tzdike +lem_lsnlte pos_coqune tok_zlbmnz +lem_wtdaxy pos_cmlzid tok_epvwwo +lem_qnvbib pos_qgrkky tok_fzesoz +lem_ccivwi pos_pmhcrq tok_wnrebr +lem_pykjmx pos_avqjes tok_qugjqm +lem_illjpl pos_ghcuqm tok_swogfp +lem_tdrfaa pos_sflkoy tok_kjkztg +lem_qwrgrd pos_nolwqs tok_jiihqr +lem_gfccmy pos_vuyicj tok_ocncea + +lem_tjgrnl pos_nqepwm tok_rksrmn +lem_jmtheh pos_ajlnmn tok_vhmszx +lem_byazwu pos_cduxqf tok_uwrpcg +lem_byikfk pos_gvgrag tok_juefzj +lem_kgrpzq pos_ahluev tok_stlvkd +lem_nyukqs pos_qyzolu tok_tstswn +lem_jomvhs pos_cdcarb tok_fxqfot +lem_unsenp pos_bflezv tok_wwyjkc +lem_zimsxz pos_xurwzl tok_rrjmkb +lem_qvjald pos_qdbrdt tok_cxzvzm + +lem_ewxsgx pos_ajgckx tok_beritv +lem_mpxaid pos_ylzbfl tok_aodlve +lem_fujsxz pos_ahphjs tok_bodsvu +lem_lifzal pos_nkzgmd tok_wqfhzu +lem_izvkug pos_cpczbr tok_eibvst +lem_ohggnv pos_ojhwiw tok_puhifc +lem_cyygzr pos_yumcfe tok_kdxdam +lem_gtbluo pos_qbvlct tok_djmsjc +lem_amjqcn pos_qsexly tok_rljvrs +lem_vshriq pos_kawwnk tok_sivtbf + +lem_dfqhkf pos_qjjksy tok_dcjjtj +lem_ivdeai pos_ablhhd tok_lbnksw +lem_eykqmj pos_achgqb tok_errdti +lem_cgfxgj pos_yeicru tok_gmcttt +lem_hfchhv pos_bbmgsg tok_ihxtuc +lem_pwcjmf pos_cktnuz tok_sgzbur +lem_vjrcwc pos_eliuud tok_opeihi +lem_vfeury pos_oabjwv tok_vtxnuk +lem_ojptmn pos_nmcckm tok_ieeqii +lem_zityox pos_plcaie tok_hjhqle + +lem_eicsff pos_uysimb tok_rhbfqq +lem_swawvt pos_zwcgjj tok_mpfzmc +lem_hxdyjy pos_uuklje tok_vzwcez +lem_qtugzm pos_rtagrp tok_wewuqn +lem_dgpeht pos_kowrmu tok_kdrmep +lem_ywgaxh pos_lrhasu tok_oyxara +lem_tjtpyv pos_qkftmv tok_rfocys +lem_yodrrt pos_kqfomy tok_hikxzm +lem_xqsoty pos_usrjwj tok_nzsrey +lem_vahsmi pos_jdhxzz tok_hzssny + 
+lem_sxqrgo pos_utfkbp tok_pgmuak +lem_upulmg pos_ogkzfn tok_vpwuna +lem_qxxzra pos_kzofah tok_mojdrd +lem_stteyb pos_whjryn tok_ahhjhx +lem_imlvws pos_xkjjyv tok_tpeqhp +lem_mkqbql pos_srhtfo tok_tjejdh +lem_srdtbv pos_chloqp tok_ssnijc +lem_uswkyg pos_vdicyb tok_xqjhmz +lem_krlhld pos_xgtbww tok_crcazr +lem_rgustp pos_hpmkfb tok_kbrimt + +lem_yztwhb pos_hxwygf tok_vmlqny +lem_qsljtl pos_okueau tok_uscadu +lem_sqcljt pos_yoeuuo tok_bufmmk +lem_vwdkvz pos_zvlxdc tok_iyfjxz +lem_fhyxld pos_iwjeqh tok_pxgswn +lem_nmyyde pos_rtgmic tok_rpznih +lem_nmryce pos_lqrnqi tok_oyzlal +lem_veooxl pos_dgbfni tok_xspltr +lem_yqdboa pos_wlbwhs tok_xykzxk +lem_egjpoj pos_poklar tok_iaongs + +lem_oqhmht pos_kokwdy tok_eysfod +lem_niayce pos_qtekzp tok_vzxvlo +lem_wpeowt pos_odrqay tok_jxcrtx +lem_ojjzcg pos_apfqio tok_izeatu +lem_hgumyt pos_fdqqzt tok_vjshjc +lem_qqxfci pos_tymmed tok_pluasb +lem_vjzegn pos_uljqtm tok_iaxkyv +lem_bsrgfs pos_wtqduj tok_krqfsu +lem_ioolrt pos_tafxue tok_egcrqb +lem_wlpoqm pos_mvcxyw tok_qhiusm + +lem_ijbaop pos_splybr tok_mqyixf +lem_laujzm pos_bzzmmf tok_faczbs +lem_wyagin pos_aztpls tok_jlxjra +lem_auhaiu pos_uiuvju tok_bkecxr +lem_dwkxfr pos_eqvcxg tok_ynnpma +lem_puxkvv pos_vxdutd tok_tjwtec +lem_ldvyty pos_msmwey tok_kmbshh +lem_iuqfhr pos_kumdhv tok_mlmqvt +lem_hbrwyy pos_zixmcl tok_zdgdce +lem_ilzezj pos_twxphc tok_yjnjwz + diff --git a/tests/test_data/generate.py b/tests/test_data/generate.py index 0047fc3..a79086e 100644 --- a/tests/test_data/generate.py +++ b/tests/test_data/generate.py @@ -1,6 +1,20 @@ import random import string +# Change one variable to regenerate a test file +generate_implicit = False +generate_clitics = False +generate_roman = False +generate_skip = False +generate_replacement = False +generate_disambiguation = False +generate_file = False +generate_empty_line = False +generate_sentence = False +generate_window = False +generate_capitalize = False +generate_generic = True + ROMAN_NUMERAL_TABLE = [ ("M", 1000), ("CM", 
900), ("D", 500), ("CD", 400), ("C", 100), ("XC", 90), @@ -27,9 +41,9 @@ def randomString(start: str = "", length: int = 10) -> str: letters = string.ascii_lowercase return start+''.join(random.choice(letters) for i in range(length-len(start))) -# I am using different numbers of sequence size to be sure tests are not being right on another corpus -if False: # Make false to remove running +# I am using different numbers of sequence size to be sure tests are not being right on another corpus +if generate_window: # Make false to remove running with open("window.tsv", "w") as f: f.write("lem\tpos\ttok\n") for lines in range(200): @@ -39,7 +53,7 @@ def randomString(start: str = "", length: int = 10) -> str: rand3=randomString(start="tok_", length=10) )) -if False: +if generate_sentence: with open("sentence.tsv", "w") as f: f.write("lem\tpos\ttok\n") for lines in range(190): @@ -52,7 +66,8 @@ def randomString(start: str = "", length: int = 10) -> str: rand3=randomString(start="tok_", length=10) )) -if False: + +if generate_empty_line: with open("empty_line.tsv", "w") as f: f.write("lem\tpos\ttok\n") for lines in range(180): @@ -64,7 +79,7 @@ def randomString(start: str = "", length: int = 10) -> str: if (lines + 1) % 18 == 0: f.write("\n") -if False: +if generate_file: with open("file.tsv", "w") as f: f.write("lem\tpos\ttok\n") for lines in range(170): @@ -74,7 +89,7 @@ def randomString(start: str = "", length: int = 10) -> str: rand3=randomString(start="tok_", length=10) )) -if False: +if generate_implicit: with open("implicit.tsv", "w") as f: for lines in range(160): f.write("{rand1}\t{rand2}\t{rand3}\n".format( @@ -83,7 +98,8 @@ def randomString(start: str = "", length: int = 10) -> str: rand3=randomString(start="tok_", length=10) )) -if False: + +if generate_disambiguation: with open("disambiguation.tsv", "w") as f: for lines in range(150): f.write("{rand1}\t{rand2}\t{rand3}\n".format( @@ -92,7 +108,7 @@ def randomString(start: str = "", length: int = 10) -> str: 
rand3=randomString(start="tok_", length=10)+str(random.randint(0, 10)) )) -if False: +if generate_replacement: at_least_one_zero = False at_least_one_one = False with open("replacement.tsv", "w") as f: @@ -118,7 +134,7 @@ def randomString(start: str = "", length: int = 10) -> str: rand3=randomString(start="tok_", length=10) )) -if False: +if generate_skip: at_least_one_zero = False at_least_one_one = False with open("skip.tsv", "w") as f: @@ -146,7 +162,7 @@ def randomString(start: str = "", length: int = 10) -> str: f.write("\n") -if False: +if generate_roman: at_least_one_zero = False at_least_one_one = False with open("roman_numbers.tsv", "w") as f: @@ -168,8 +184,7 @@ def randomString(start: str = "", length: int = 10) -> str: if (lines + 1) % 10 == 0: # Window of 10... f.write("\n") - -if True: +if generate_clitics: with open("clitics.tsv", "w") as f: for lines in range(300): if (lines + 1) % 5 == 0: @@ -181,5 +196,29 @@ def randomString(start: str = "", length: int = 10) -> str: rand3=randomString(start="tok_", length=10) )) - if (lines + 1) % 10 == 0: # Window of 10... + if (lines + 1) % 10 == 0: # Window of 10... + f.write("\n") + +if generate_capitalize: + with open("capitalize.tsv", "w") as f: + for lines in range(500): + f.write("{rand1}\t{rand2}\t{rand3}\n".format( + rand1=randomString(start="lem_", length=10), + rand2=randomString(start="pos_", length=10), + rand3=randomString(start="tok_", length=10) + )) + + if (lines + 1) % 10 == 0: # Window of 10... + f.write("\n") + +if generate_generic: + with open("generic.tsv", "w") as f: + for lines in range(500): + f.write("{rand1}\t{rand2}\t{rand3}\n".format( + rand1=randomString(start="lem_", length=10), + rand2=randomString(start="pos_", length=10), + rand3=randomString(start="tok_", length=10) + )) + + if (lines + 1) % 10 == 0: # Window of 10... 
f.write("\n") diff --git a/tests/test_data/generic.tsv b/tests/test_data/generic.tsv new file mode 100644 index 0000000..322562d --- /dev/null +++ b/tests/test_data/generic.tsv @@ -0,0 +1,550 @@ +lem_pnglqu pos_asmebe tok_psaspz +lem_mebgbd pos_zvlbfv tok_ezxvuu +lem_xzqkny pos_mavawc tok_bxenxg +lem_ghngfg pos_pxvlyl tok_lvodeu +lem_ylhvck pos_iltuul tok_mmqtsd +lem_wybloq pos_jcxlix tok_gdukqy +lem_bghzga pos_jcgdsc tok_bsormp +lem_fxthyj pos_nibtuj tok_lnvnfl +lem_axspcv pos_btxixe tok_ppdtve +lem_xewmht pos_kvybim tok_fuhesp + +lem_myzkgo pos_lxynbd tok_cpjqfb +lem_ojtejj pos_guuaor tok_fvrhab +lem_mmukhu pos_mqfvnc tok_fnwube +lem_tnsgmx pos_anxfmz tok_ktfmas +lem_tmluif pos_agcygo tok_gtksyi +lem_iiudev pos_wsuupd tok_fnftzx +lem_tsytjo pos_yecchj tok_gzaqyb +lem_kbrglb pos_jrsyty tok_lhonek +lem_dbvtjj pos_xuqywg tok_izhkrh +lem_ehymrj pos_jconao tok_gimfgp + +lem_qoxvdx pos_xelnad tok_fbzqzj +lem_yvdvaj pos_xfysrm tok_lfcgll +lem_ygqkqo pos_pmbbbm tok_tnxqzf +lem_lgcvjs pos_wkecbv tok_ecwrkb +lem_wlwibu pos_mxabba tok_howeal +lem_qezoyp pos_ehsvhq tok_aehlox +lem_gmgvbl pos_xjywci tok_ndleah +lem_dukmse pos_boqhzt tok_jyqpha +lem_yncpeu pos_qmdcur tok_luvxba +lem_wccpzf pos_dwuage tok_gsfpvu + +lem_zgaeuh pos_iebabk tok_vjpipo +lem_egiwli pos_pneaav tok_lwncsp +lem_etdddt pos_dfslho tok_tcvxup +lem_piujkq pos_jbtkzg tok_ocawyd +lem_nvowjp pos_zzflcu tok_mgwyig +lem_dzbokl pos_bgtmyq tok_btcxaa +lem_uvunqw pos_mjgnwg tok_rmjtta +lem_iiggtz pos_mpxicz tok_bziitq +lem_oqkxln pos_stzzex tok_epkfvy +lem_vaglxh pos_jqljvu tok_jhcyqp + +lem_zrdaut pos_azcbgd tok_eqspfr +lem_pahaia pos_islakf tok_vadzpo +lem_nrgboa pos_ljxzgm tok_dxbdmx +lem_rdipvv pos_mdnylf tok_cuubbc +lem_oshqgz pos_xgwkmi tok_gksedp +lem_aeylbl pos_vntgwy tok_ujfpcr +lem_kvzcce pos_cliawz tok_lsynrz +lem_jrwgfn pos_clhgqa tok_ilruyc +lem_mvjszq pos_agbbcd tok_clcciy +lem_aupvma pos_wwlmdh tok_xoalkn + +lem_xnlhfr pos_rhevie tok_zekaok +lem_dhehjw pos_aurleh tok_nmakyf +lem_ankaqj pos_esuowx 
tok_rjltim +lem_vyugto pos_ztvzxd tok_nkzhsc +lem_uetvbf pos_dixkyh tok_gpxwdy +lem_wgjqvl pos_fbijwc tok_uhpgmb +lem_lbmsla pos_qzipnp tok_qwqcyy +lem_rdenyx pos_smgvkc tok_oojuer +lem_hiwtug pos_osgiia tok_kgrbky +lem_xvlgbu pos_gbrncm tok_rhcbnf + +lem_rrijyv pos_hhwgaj tok_hglsmo +lem_mhfrtc pos_gseabj tok_gvbsrc +lem_uiljox pos_grhpem tok_fkrfzf +lem_jmrrud pos_naxznl tok_suyxme +lem_whypap pos_nobacl tok_cpbqgd +lem_msajlz pos_pyoxmm tok_wfjhwj +lem_ettsyw pos_gbmuzc tok_lapskd +lem_gujqny pos_hcrckt tok_gekwdi +lem_mpagjz pos_mbpjcl tok_dfwbdx +lem_ldreti pos_lkiypk tok_kvcurb + +lem_vnypun pos_gkjdef tok_zqopre +lem_yhzsnm pos_lowabu tok_ifurtd +lem_svgolo pos_yemjht tok_mozohp +lem_hnncpk pos_jnpizv tok_uujfmo +lem_pbxxgy pos_rwmksa tok_tbgelf +lem_alxytf pos_nmrber tok_trwajw +lem_rcmzct pos_dnbxuc tok_zjgbht +lem_slzeww pos_ueibpj tok_spnicz +lem_oatvra pos_jybjfq tok_anaayc +lem_zdpeyz pos_cebrvz tok_jthzal + +lem_bntofm pos_jngleg tok_qdpsqo +lem_fmrnyq pos_tveybk tok_qywgjg +lem_jfzdao pos_efkukb tok_terush +lem_msnztv pos_avckvf tok_rpdgdm +lem_vlqlqf pos_vcmecr tok_psjooy +lem_fjbpev pos_wtdmbr tok_duslbg +lem_wieira pos_ydinmw tok_gxpvdk +lem_cghfnf pos_nlgplz tok_yhqkbd +lem_rludli pos_awqmwk tok_rvkvog +lem_msfyup pos_niipwh tok_xysaww + +lem_xvlzpl pos_npdsdr tok_hozwpw +lem_ezwduk pos_bopbho tok_ojphac +lem_gyjwdv pos_bulihx tok_ybwnla +lem_fxfuvg pos_gwrsht tok_iufari +lem_hvhhvu pos_bfvbqu tok_wrugav +lem_virrik pos_bfusid tok_osubgh +lem_igtllm pos_zsmlgd tok_vrguwy +lem_yvxfyo pos_foicpg tok_omidwp +lem_onshyh pos_huzxvu tok_xoqpol +lem_twcwrd pos_xmmyii tok_apeqno + +lem_hwlcbf pos_wvxtpu tok_cwstqs +lem_idwhly pos_uxqvdq tok_wadnqq +lem_ghbdfe pos_tdbmiz tok_hcxois +lem_bfsrey pos_yatuxi tok_uoyzjy +lem_nexzlq pos_rnxyvz tok_xvruyn +lem_hobbrr pos_piiaix tok_uniyqj +lem_lpvloc pos_axnzeh tok_zatdex +lem_wczgrh pos_upuwir tok_nqbrli +lem_gosogu pos_kkbarj tok_xizwwg +lem_oeprnt pos_olivpj tok_clepiu + +lem_omqxtp pos_kehvtp tok_hrerte 
+lem_pnowgn pos_etrjvq tok_mlwjjy +lem_zshfka pos_bjqaoy tok_kjwvbg +lem_rjxsbn pos_pwonvp tok_ssslob +lem_fwxdnl pos_kujnyj tok_gcqhia +lem_rdaszr pos_psouih tok_gjshkw +lem_qczbdv pos_lyfkes tok_judfss +lem_usczau pos_buqpls tok_pkjdmx +lem_uytagi pos_uqntms tok_lrjnln +lem_rwgfjv pos_peezza tok_dukgtz + +lem_lyyhwq pos_iwhhxe tok_voswhx +lem_sewohc pos_eszmdr tok_mzgiqc +lem_fzvavh pos_xfnjdb tok_ytuboz +lem_ujyrdr pos_hmmeaw tok_emmmzw +lem_qfdkqh pos_oakuls tok_zjaurs +lem_iglmrn pos_moqchd tok_uebens +lem_grrils pos_xnvdit tok_kxkccr +lem_rahztc pos_kvfbjb tok_klpafn +lem_oujgyi pos_iskgln tok_vcxpps +lem_mdrbdm pos_icrxxn tok_astqym + +lem_zvymaz pos_nqchpv tok_bdxugf +lem_doafwx pos_eeensz tok_ijygwa +lem_ecnrtu pos_kdivkj tok_ewgxfw +lem_vlzbui pos_ykyihk tok_logkvy +lem_wljjnn pos_ejrqux tok_qbswct +lem_jqktac pos_mvpkvz tok_rfyqtq +lem_miucap pos_emmvel tok_blubai +lem_qkspdn pos_lagrww tok_mfacah +lem_pyyoag pos_yixzbw tok_wsqjlc +lem_fahftc pos_yebico tok_ohzfke + +lem_vbnhkb pos_mkdjjo tok_sifeia +lem_tfythp pos_wxxaqg tok_bhvhgg +lem_qhekoq pos_hnqqgd tok_lkfkvl +lem_juynha pos_zwyxbj tok_vmewqc +lem_xhvadu pos_tsubsg tok_ejaswn +lem_nwacfk pos_avqxhm tok_dxzihi +lem_mlkidd pos_xtnfei tok_tvurhr +lem_mjlcuz pos_fsxxuf tok_tvxcyq +lem_elurab pos_fkeuji tok_kodcqh +lem_jhtvfa pos_riwebj tok_wyfllm + +lem_ehyuab pos_azurxz tok_lzhzlr +lem_hesbez pos_cuhpuz tok_ovolhm +lem_vpwjcg pos_ejgsmo tok_yabopn +lem_bdcgse pos_bzuojo tok_qdxkdx +lem_sajlgi pos_uqxyji tok_ffmpgu +lem_atdhzb pos_renqzu tok_plhuik +lem_wycrgt pos_ptrfbb tok_fcyifc +lem_idvbho pos_ucnwzy tok_qpqnlw +lem_ghsmbz pos_fatumo tok_kbffal +lem_jcynpn pos_yltenm tok_kczrkz + +lem_dvweer pos_krdjep tok_yguwcv +lem_mudjnq pos_uviakt tok_murppk +lem_wjukub pos_yggtkn tok_zbdcjc +lem_znvoew pos_zgfwer tok_vqpnit +lem_gdaykn pos_abktqv tok_oytwun +lem_jukrnn pos_pkuqyu tok_jsyslm +lem_tzlegg pos_kmoxfo tok_jnqtpq +lem_ffpebz pos_cxabab tok_olouxz +lem_ljlbis pos_gvewxv tok_hvhgut +lem_yvhmck 
pos_bmtppc tok_upbyeu + +lem_gtpqpp pos_vgukly tok_bkxnlu +lem_wkpmqw pos_klxemb tok_cgkcqs +lem_mprrnf pos_zwwksf tok_lldawn +lem_gjstsp pos_lyktlo tok_eqdswb +lem_iawrhv pos_jnxifc tok_jcoyys +lem_gikbbb pos_hcfykc tok_jhbaaj +lem_rworir pos_menbpu tok_lpthvx +lem_idxscs pos_msnpth tok_hjydha +lem_gxkksx pos_hgwwvg tok_ambhzx +lem_wrurbc pos_deytsp tok_bsdcgw + +lem_ciifzj pos_rwklqg tok_annfjf +lem_hxlapr pos_ytijwd tok_uwfoew +lem_dakvmb pos_dueirt tok_lkiyjy +lem_orwffp pos_xvpdsd tok_ymjtmn +lem_jowzgo pos_fjxiec tok_mtndsk +lem_ccsjxk pos_mnynch tok_icfdkp +lem_bckheg pos_iuwwfb tok_gbsnmi +lem_qnrtpk pos_nkaibd tok_zptrjo +lem_ylyodz pos_qusyxo tok_rzoafu +lem_huscow pos_mkjxjo tok_hvpcub + +lem_isigco pos_abwiiz tok_zcjhzt +lem_gctpta pos_vlwfme tok_rciavv +lem_rhutpw pos_dqbkuz tok_epwnzx +lem_nokcra pos_bqhndh tok_gawuji +lem_excjus pos_vjznov tok_wfysra +lem_vztfoz pos_qnpmdl tok_darejq +lem_nuxexn pos_kiieeg tok_gklnsq +lem_tqzdhk pos_iynvos tok_tfvtlx +lem_opwkfs pos_rrpisf tok_nfvfru +lem_pyqpjx pos_txlaci tok_qbuzxu + +lem_jlhbyd pos_ngsqnp tok_zebvyu +lem_fskqwz pos_ueoqsk tok_lincdm +lem_degete pos_lpojkc tok_twqmod +lem_agmtcx pos_twimub tok_nfiavc +lem_cjwuzk pos_oxtsip tok_bbeimy +lem_fyawqu pos_kyewnd tok_zolwky +lem_bysedi pos_glratz tok_gwayzf +lem_gnlioz pos_iwuvdv tok_onsdpz +lem_cpydbn pos_mibnmt tok_ebxmia +lem_bcfyxt pos_pxcgsc tok_sfjfzn + +lem_uytpgs pos_yoikuk tok_ctkvjv +lem_iecopt pos_wjfxqb tok_egrvxp +lem_uuxivj pos_bzxynj tok_vmgvrq +lem_vwiknd pos_wcphdo tok_puzlfa +lem_sikxax pos_jbuzzz tok_lsomuh +lem_hpexss pos_noxjaa tok_ekffht +lem_cuvsiw pos_hfknjd tok_yeppee +lem_jzxaay pos_zhsbsm tok_ulcoos +lem_ktchac pos_ljzhrp tok_xsfahv +lem_nwbqio pos_kunfjr tok_twpklh + +lem_algfvf pos_ooxfml tok_ktfqvq +lem_qmdtds pos_eaautl tok_dbsdhv +lem_khowpj pos_jvoqzc tok_fzgxgz +lem_wkpwee pos_ykrfap tok_plwcmc +lem_vytmpx pos_jyysyv tok_thcmet +lem_uotpzs pos_ywwhld tok_mcfcnl +lem_eaofao pos_lakpdh tok_vorgrd +lem_rgcrnu pos_ppzwsq 
tok_fxqopo +lem_dipjtd pos_ncihjq tok_ltkzwj +lem_vjjtnl pos_yggydp tok_nqxhll + +lem_ydzust pos_nljkuc tok_amqnhf +lem_tyykgb pos_rqbuvy tok_zedoun +lem_sbanpt pos_ugeqel tok_ggerie +lem_fhrxzr pos_zvbfxl tok_nqhlhi +lem_uzmbex pos_pvonjh tok_opyqza +lem_brjisl pos_nzzgco tok_zntdob +lem_aitmgy pos_twvcov tok_zzlpky +lem_dftehu pos_maytuh tok_ewfbks +lem_iyipca pos_yjoihd tok_cizbri +lem_crwgij pos_kjcdrc tok_pcmcaf + +lem_zkgula pos_nqngve tok_qljpsw +lem_xgvola pos_ggnrjm tok_txuwre +lem_lmvodr pos_cateld tok_piyxeo +lem_hkbfmb pos_hbladr tok_kevjhg +lem_tpvral pos_apwgxt tok_tnpipv +lem_dplpey pos_labudw tok_jlxdil +lem_wsjnft pos_yrsptw tok_zmijcd +lem_guvspa pos_tkpzap tok_xkzfet +lem_udchdk pos_jxbyjy tok_jyubyl +lem_woubce pos_ygdkii tok_naxuga + +lem_jxhmdx pos_pecvlq tok_lctaho +lem_soswkd pos_xxzgwj tok_zxfrkj +lem_lmbzbz pos_woeygv tok_egvriu +lem_nnpdvu pos_tdzlzl tok_ihvkzw +lem_lzjtpd pos_cbwxri tok_ktupgh +lem_poxgkh pos_pgiuvq tok_jtdwrs +lem_tdtcgu pos_xrcwcn tok_imzzpy +lem_zsvhuq pos_tuxjki tok_onneiv +lem_cgawuf pos_aswqfa tok_hpdanq +lem_dkjtaw pos_bufaos tok_bmejgh + +lem_lndkha pos_bjxvvx tok_vjpywh +lem_cgwcqb pos_pqzsni tok_arhatp +lem_rxsvoc pos_eevidh tok_kasnfm +lem_iuiyhm pos_dehgbs tok_zvrwvr +lem_alnoro pos_xbiwuh tok_lmwuex +lem_ndpkrs pos_evinyq tok_gozthy +lem_cfloip pos_iykkfq tok_vfggug +lem_puwsps pos_cavuxb tok_ojlpen +lem_cvbazd pos_drtwjs tok_tdpfhx +lem_xaadjv pos_xkmxfh tok_knvudq + +lem_pttysa pos_xfmfja tok_erdylq +lem_tmhaob pos_eebvst tok_yocmuk +lem_iqwvzf pos_tbnxrw tok_cwnfsq +lem_ksgmdv pos_dphlbz tok_wgsina +lem_luhtpp pos_obecbu tok_alswtw +lem_avbqso pos_muzxbj tok_narjmi +lem_vtripz pos_rmwpdt tok_ccncei +lem_qiisvu pos_zobxzd tok_jvitkg +lem_thuxyf pos_ziagxk tok_kiwher +lem_yunaef pos_bkgvxi tok_uwdpvh + +lem_hkvyaf pos_ilupbo tok_ffepqv +lem_dtxwau pos_qmieqe tok_rpwihf +lem_zfqxvf pos_pfckzl tok_byiame +lem_nqhasq pos_wmncaz tok_eiyncu +lem_osdxsa pos_ivzmal tok_xakyhq +lem_nbhrce pos_annonw tok_lwivqn 
+lem_dtjuky pos_xwdjjx tok_ududim +lem_tglzqo pos_pidjzx tok_lvymnw +lem_yhcpqk pos_bbhgdh tok_blyybp +lem_gdkhkf pos_arsxhu tok_sprifv + +lem_udognq pos_ssvewy tok_pgnozm +lem_mkiuvw pos_mgpdhc tok_mmshwd +lem_fyubbp pos_johwvl tok_ndfvgs +lem_srkpmu pos_kciexw tok_ybiiof +lem_thuvck pos_disjgx tok_glmmva +lem_arpojd pos_lqhzix tok_mpwayz +lem_pajszf pos_plwzrp tok_mfvehu +lem_fiobxw pos_bmjtzg tok_dstmev +lem_hwrxka pos_pmfuin tok_ucysgx +lem_bvyhch pos_ueiehv tok_ubagoz + +lem_udgjbg pos_bppiow tok_sxvxcq +lem_umkvig pos_wqgkvj tok_zgudxp +lem_ahqwdc pos_jtxlag tok_pbahrd +lem_qphmfc pos_bugjkt tok_gssjyy +lem_svklvx pos_roqynz tok_tdwhnk +lem_qmzrjk pos_wsztno tok_apttqt +lem_deiapn pos_cjzjph tok_zvejtl +lem_fhwkce pos_suwufb tok_hxinbh +lem_imtorp pos_kuleta tok_icovji +lem_weovxt pos_vdxwzf tok_gtcoay + +lem_qyglsd pos_mdmrnr tok_hjhtdd +lem_jyygdn pos_ryvyfq tok_fupiof +lem_okdbuv pos_pqmugs tok_xrnwfs +lem_hwgsim pos_edglig tok_thojbh +lem_ctnxbc pos_ilplpu tok_okedji +lem_ldhlea pos_syjfsv tok_cssiqu +lem_czqjry pos_qoxlhi tok_glfpbn +lem_ibblvg pos_mlapuz tok_icmypi +lem_whofjg pos_qzxezo tok_bdtqyr +lem_ksvdnm pos_ianhmx tok_byqpnq + +lem_himgmt pos_vsfytf tok_gtoyzp +lem_yzbvgk pos_hactyl tok_mihsrx +lem_dxyxpo pos_vqrmcs tok_afdnmp +lem_zygnah pos_hguzgi tok_gybiwc +lem_orlilu pos_voyadg tok_bbahtj +lem_jihriy pos_zzjdfd tok_uhbjjl +lem_jxxbqe pos_xmwjpi tok_exipxk +lem_cwclce pos_ttlmmc tok_qahigp +lem_yqingd pos_khxetb tok_yseugj +lem_fdqudy pos_vtqqka tok_bltzio + +lem_ufvglp pos_usbhln tok_invhmt +lem_ucbcws pos_juxwyh tok_xvidto +lem_jalnrv pos_kehdve tok_xlrwvm +lem_wupgkx pos_ytjocr tok_mhcpcu +lem_ysaiwb pos_zrbkmf tok_dhbygu +lem_gnheia pos_sctxya tok_zvbkea +lem_llgqzk pos_xpprym tok_enloag +lem_rhztyg pos_rdgqdo tok_azjqdx +lem_ibddtk pos_nvaxik tok_lclhya +lem_tpvovd pos_yhzvwj tok_icyzhn + +lem_nswoot pos_bwvdze tok_jtzcuc +lem_tlvsms pos_anwnhu tok_gsfzrw +lem_knrvmq pos_nldxjb tok_tziphw +lem_khgyeh pos_oikzss tok_ayhvin +lem_mkauym 
pos_enawim tok_tsmsus +lem_oxtqxo pos_wlosqk tok_blffyh +lem_udbsej pos_alekho tok_axfaat +lem_kgjfxi pos_cbsgst tok_zflmoz +lem_ohwipy pos_navjcp tok_rodrfj +lem_miketf pos_tpkwtu tok_lwgjsx + +lem_jabgep pos_rdpekf tok_plrhxh +lem_epimnv pos_wjilvk tok_hpuobc +lem_anvpfz pos_srwwtk tok_otyntq +lem_gnoqea pos_msvort tok_ctwezz +lem_erqzii pos_enojdo tok_opfqdy +lem_tpwncs pos_paskoj tok_cesuln +lem_efzrwp pos_tosvwq tok_sqzwyc +lem_uqecmi pos_psgrbf tok_uytozr +lem_oecpmd pos_lopuif tok_grtwkh +lem_gpbnfh pos_rmsdkd tok_imwchk + +lem_hdhfho pos_qpkakd tok_qiijyh +lem_rgtuow pos_dmyxdb tok_mbpktj +lem_xwuxrs pos_qbdcig tok_lhzyau +lem_rdhbzp pos_upvmfn tok_bfzjnn +lem_thycqp pos_sxvtex tok_jgfpgb +lem_nwuipp pos_gojwjp tok_ojmdev +lem_pclorh pos_wycfox tok_tdgvwl +lem_hhrpjx pos_vpofjw tok_svboji +lem_zhiugr pos_ssbdgn tok_jzfhje +lem_dgxepp pos_aqvvkq tok_aophlm + +lem_djcgnc pos_umfujg tok_jvctze +lem_ivtuib pos_gccaun tok_fyvvsa +lem_xrldqk pos_tifgzp tok_qemkvl +lem_fgcwen pos_updhex tok_gqwrty +lem_jfjktp pos_smlgbr tok_supebj +lem_qkrnfh pos_klflpm tok_cgqbzr +lem_hqjwgr pos_cscpfc tok_tmcjwt +lem_zekjfs pos_xioike tok_jofgaj +lem_lbdjep pos_jkawyi tok_xbkbuv +lem_qccvpl pos_mqjgkg tok_prxlbr + +lem_lqbcxe pos_marmcp tok_mvmxoj +lem_hhfrfe pos_upowxv tok_btqcjb +lem_kxnbox pos_vlfamx tok_gezizk +lem_yjteio pos_hgdikt tok_cczjyb +lem_pyrppz pos_ziuyfg tok_kpyiia +lem_xxatad pos_abteyo tok_ahluuw +lem_rnqkiw pos_ilnrbf tok_mmghro +lem_btfstz pos_bisgck tok_oqlppr +lem_idxuwc pos_xdmidv tok_axlwpx +lem_bytmdu pos_nltcuk tok_scnzqo + +lem_epyhkc pos_iixjck tok_eqxaco +lem_nozaxl pos_uwxufj tok_nffdpa +lem_ajycum pos_cariev tok_ptdrop +lem_huejvt pos_ywpqdr tok_zsqdxa +lem_kdpogh pos_xiwzrj tok_cqvpdj +lem_qnmbsb pos_xykqzu tok_jxriow +lem_oendok pos_emkvie tok_spxunf +lem_xatswy pos_gvdzfv tok_paisus +lem_yhjped pos_dveoqj tok_lmogyx +lem_rjcuzx pos_cgjnsm tok_kkmfce + +lem_dpkafn pos_iisyjw tok_dbdgpw +lem_oeigbc pos_mhfybc tok_edswto +lem_cewjzn pos_cdympu 
tok_dydedt +lem_clclzd pos_ywtnbn tok_lwhddh +lem_zuxviz pos_xaexkk tok_oiekam +lem_yadubx pos_afjngs tok_keyojz +lem_qwrqfx pos_xfonvq tok_ajzkut +lem_rpdtbc pos_agjxmq tok_bbrfdk +lem_vyvsec pos_mygmil tok_hdllgn +lem_otqfxh pos_dqnkst tok_jkayri + +lem_tmfzdg pos_awwegn tok_hqgbnu +lem_ogmywj pos_tdzfkg tok_kabnzk +lem_sdehke pos_xlysst tok_zmzjes +lem_yggffr pos_aihjwe tok_bkyzyd +lem_lnpsgh pos_pukddo tok_taigpe +lem_tqpkkq pos_qobykg tok_ifpuye +lem_frawzk pos_qgztxb tok_wbbxgh +lem_qgmyqi pos_uaegvo tok_svjheg +lem_yljbao pos_cpnego tok_zjurjx +lem_nondda pos_ogapcg tok_uwnumi + +lem_izzxzv pos_ecnnyc tok_xoryrx +lem_xspbqn pos_duhumn tok_ilopbl +lem_eosiwr pos_udotyh tok_yxhhhj +lem_thzxnm pos_gkbxwa tok_tpfjrw +lem_tywvba pos_tekyht tok_jokgkd +lem_gpnqba pos_ndzpcl tok_bmxyks +lem_eyclfq pos_qstzwe tok_kwdged +lem_ryxiar pos_pfurva tok_vfwhix +lem_zbluyg pos_hqolyd tok_jdmtnh +lem_oimpxn pos_dououy tok_mqijzw + +lem_qfrejv pos_diuani tok_lifrov +lem_bwhido pos_wqnomk tok_aeewci +lem_ofwfmp pos_kagjkt tok_lkqklj +lem_tqyuzs pos_salzpj tok_xasuvv +lem_imqbao pos_hbjkmj tok_ncbrvm +lem_zyrhtv pos_hqonsn tok_tgaidu +lem_xfsksc pos_zaoeyb tok_jvemap +lem_cocsqq pos_iwxbbk tok_jowofp +lem_dcnzdm pos_spcvwy tok_uhjbnq +lem_zudmze pos_fypyww tok_umyuzz + +lem_ucfaba pos_fdjzot tok_zbhmdd +lem_kbmwda pos_ophkmd tok_tuiafv +lem_fyjlbt pos_vxjxbm tok_bedifk +lem_wubgiz pos_snuzfx tok_guwjbi +lem_fpduuj pos_zzgfkk tok_tqskbu +lem_xmxwno pos_svzizv tok_khdyci +lem_nfdqbr pos_ptzmhs tok_xphjhk +lem_lewrig pos_dfgtjx tok_vbtvjf +lem_ttyfwj pos_iwcucd tok_uybvvd +lem_dthqvz pos_mwlofb tok_fmwfis + +lem_pqbthq pos_rvtmlk tok_jvnsnk +lem_lzvnww pos_vpxejg tok_vukmwi +lem_dnllbe pos_kvgegz tok_qbsvin +lem_wnzluu pos_zqpjes tok_fzbrae +lem_mgggxv pos_jztwzk tok_pugngw +lem_kgkmwh pos_avytiv tok_vxxpqq +lem_ztqbjg pos_dfntss tok_vjnmmg +lem_vzrzhp pos_lssare tok_lfbuwl +lem_mjsuko pos_uyvybl tok_wdwwvp +lem_nawtkw pos_igxvzt tok_otwnbp + +lem_nlanfn pos_hlrigm tok_plhxfs 
+lem_gqwksn pos_zcnxuy tok_fuxjiy +lem_qzihsn pos_ejoxpr tok_whpuxl +lem_kdosgv pos_fvzwxx tok_rkgdfb +lem_yuiyxw pos_dmbjzv tok_vzrqqy +lem_okiere pos_aqjfpz tok_ssvyzp +lem_hjwthb pos_kpkaen tok_jpjemk +lem_zhfyyx pos_vgqepe tok_chiprc +lem_dnxibl pos_scliog tok_maaums +lem_slgmzy pos_kpfqri tok_vaxmde + +lem_saxbeh pos_lzupsf tok_enlbne +lem_ytebof pos_nzyxsk tok_seizwi +lem_axienz pos_ejwaro tok_yoaijm +lem_fyxwoy pos_jpovml tok_kqbuva +lem_cqoppk pos_frghuo tok_xoyduc +lem_dshhqe pos_zzolcy tok_bpbszd +lem_gyirrf pos_wfztqk tok_htkzmr +lem_kbiqrd pos_kssngk tok_ufdahd +lem_zqdegm pos_qvhoqu tok_avllty +lem_qlvfdy pos_ydszro tok_wsotit + +lem_xgtftk pos_yqdcsh tok_eytugv +lem_zejrgy pos_ojcikn tok_utwapj +lem_ykyiif pos_fhptrb tok_dkjucx +lem_quepzd pos_kqszqm tok_aqumhv +lem_zjgvew pos_xygxzf tok_nyzgtf +lem_crtmnw pos_ppctbd tok_ybroqp +lem_vdevvf pos_oufyms tok_ljnoye +lem_xbprju pos_pswwsd tok_tmvzzc +lem_fooalk pos_xdgvxe tok_taodtk +lem_fksxtf pos_mstpaq tok_xiudaf + +lem_zjoakx pos_dvmacz tok_nhkgol +lem_kgylhm pos_xuhevj tok_qwtmly +lem_lrvbkp pos_xaxdvl tok_kfacse +lem_sdzqrk pos_ftylmt tok_ikkucj +lem_xtocvu pos_jgzxuw tok_zigxwf +lem_ntpfcq pos_gagsxh tok_zlwyba +lem_jdvitr pos_uxnniv tok_hnjwqs +lem_vnnrfi pos_hnytho tok_afwlas +lem_syohyi pos_hbrlqj tok_gehdre +lem_axzlfm pos_qmvhli tok_llrjke + diff --git a/tests/test_postprocessing.py b/tests/test_postprocessing.py index 47e335c..3f6e38d 100644 --- a/tests/test_postprocessing.py +++ b/tests/test_postprocessing.py @@ -2,6 +2,37 @@ class TestPostProcessing(_TestHelper): + def _general_config_write(self, postprocessing: str) -> str: + with open("./tests/test_config/generated.xml", "w") as f: + f.write(""" + + + +
+ token + lemma + pos +
+
+ + {postprocessing} + + +
+ 2 + 0 + 1 +
+
+ + + +
+ + +""".format(postprocessing=postprocessing)) + return "./tests/test_config/generated.xml" + def test_disambiguation(self): self._dispatch( output_dir="./tests/tests_output/", @@ -142,10 +173,162 @@ def test_clitics(self): self.assertTrue(line["lemma"].endswith("η•Œne"), "Clitic has been passed to lemma with glue") self.assertFalse(line["token"].endswith("η•Œne"), "Clitic has been passed to token without glue") tokens += 1 - print(line) - print(tokens) self.assertEqual(tokens, 300*0.8 * 0.8, "There should be 80% of total tokens, and 20% of that should have been removed (2 clitics" "every 10 words)") self.assertEqual(clitics, 300*0.8*0.2, "There should be 2 clitics for 8 words") + + +class TestCapitalize(TestPostProcessing): + """ Check that capitalization are dealt with correctly""" + + def test_capitalize_base(self): + out, config = self._dispatch( + output_dir="./tests/tests_output/", + train=0.8, + dev=0.1, + test=0.1, + config="./tests/test_config/capitalize.xml" + ) + # Checking all corpora just to be sure + tokens = 0 + + sentences = [[]] + for line in self.read_file("train", "capitalize.tsv"): + if not line: + sentences.append([]) + else: + sentences[-1].append(line) + tokens += 1 + + sentences = [s for s in sentences if s] + self.assertEqual( + [True] * len(sentences), + [s[0]["token"][0].isupper() for s in sentences], + "The first word of every sentence should be capitalized" + ) + + self.assertEqual(tokens, 500 * 0.8, "There should be 80% of total tokens") + + # Test half + + def test_capitalize_random(self): + conf = self._general_config_write(""" + + + + + + """) + out, config = self._dispatch( + output_dir="./tests/tests_output/", + train=0.8, + dev=0.1, + test=0.1, + config=conf + ) + tokens = 0 + sentences = [[]] + for line in self.read_file("train", "generic.tsv"): + if not line: + sentences.append([]) + else: + sentences[-1].append(line) + tokens += 1 + + sentences = [s for s in sentences if s] + + half_toks = round(500 * 0.8 * 0.5) + half_chunks 
= round(half_toks / 10) + self.assertEqual( + sorted([True] * half_chunks + [False] * half_chunks), + sorted([s[0]["token"][0].isupper() for s in sentences]), + "The first word of every sentence should be capitalized" + ) + + self.assertEqual(tokens, 500 * 0.8, "There should be 80% of total tokens") + + def test_random_caps(self): + conf = self._general_config_write(""" + + + + + + """) + + out, config = self._dispatch( + output_dir="./tests/tests_output/", + train=0.8, + dev=0.1, + test=0.1, + config=conf + ) + + tokens = 0 + sentences = [[]] + for line in self.read_file("train", "generic.tsv"): + if not line: + sentences.append([]) + else: + sentences[-1].append(line) + tokens += 1 + + sentences = [s for s in sentences if s] + + nb_tokens = round(500 * 0.8) + nb_chunks = round(500 * 0.8 / 10) + + self.assertNotEqual( + sorted([True] * nb_chunks), + sorted([s[0]["token"][0].isupper() for s in sentences]), + "The first word of every sentence should not be capitalized. There is a very small chance that" + "this distribution happened. The test would fail in this case..." 
+ ) + self.assertEqual( + round(0.3*nb_tokens), + [t["token"][0].isupper() for s in sentences for t in s].count(True), + "30% of tokens should be Capitalized" + ) + + self.assertEqual(tokens, 500 * 0.8, "There should be 80% of total tokens") + + def test_capitalized_and_indicator(self): + """Ensure that replacement of caps by lowercase letter + SPECIAL CHAR is done""" + conf = self._general_config_write(""" + + + + + + """) + + out, config = self._dispatch( + output_dir="./tests/tests_output/", + train=0.8, + dev=0.1, + test=0.1, + config=conf + ) + + tokens = 0 + sentences = [[]] + for line in self.read_file("train", "generic.tsv"): + if not line: + sentences.append([]) + else: + sentences[-1].append(line) + tokens += 1 + + sentences = [s for s in sentences if s] + + nb_tokens = round(500 * 0.8 * 0.5) + + self.assertEqual( + nb_tokens, + [t["token"][0].islower() and t["token"][1] == "🨁" for s in sentences for t in s].count(True), + "30% of tokens should be Capitalized" + ) + + self.assertEqual(tokens, 500 * 0.8, "There should be 80% of total tokens")