diff --git a/.gitignore b/.gitignore
index 2091994..d766a01 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,7 @@ memory.csv
memory*.csv
new.yaml
tests/tests_output
+tests/test_config/generated.xml
# Byte-compiled / optimized / DLL files
__pycache__/
diff --git a/DOCUMENTATION.md b/DOCUMENTATION.md
index fdee235..567ec12 100644
--- a/DOCUMENTATION.md
+++ b/DOCUMENTATION.md
@@ -361,3 +361,40 @@ It will produces the following output
The glue token is not applied on token, the lemma value is transfered to the previous row and the POS is lost.
`@glue_char` is used to concatenate columns such as `lemma` here,
+
+### Capitalization
+
+This post-processing function capitalizes (*i.e.* makes the first letter upper-case) the first word of each chunk
+(*i.e.* sentence), either always or randomly, as well as random words inside chunks. It can also create an uppercase
+mask: each uppercase letter is replaced with its lowercase form followed by the [Neutral Chess Queen Unicode character](https://www.compart.com/fr/unicode/U+1FA01).
+
+The model is the following:
+
+```xml
+
+
+
+
+
+
+
+
+
+
+
+
+```
+
+1. column-token specifies the name of the column containing the raw form of the tokens
+2. (Optional) column-lemma does the same thing for lemma
+3. caps-to-utf8-marker activates masking uppercased letters.
+4. `first-word` is activated when its `@when` attribute is set to one of `always`, `random` or `ratio`.
+    1. `when="ratio"` requires a second attribute, `@ratio`, which must be a float between .0 and 1.0 (a proportion, basically)
+    2. `when="random"` is basically a shortcut for the latter with `ratio="0.5"`
+    3. To identify sentences, you need to set up a `sentence-marker` child element
+        1. It can be `name="empty_line"`, in which case chunks are separated by an empty line (default output)
+        2. It can be `name="regexp"`, in which case it takes a `@matchPattern` attribute (the regular expression)
+           and the name of the column to match against in `@source`, *i.e.* `<sentence-marker name="regexp" matchPattern="[.!?]" source="token"/>`
+5. `first-letters` works with the same `@when`/`@ratio` attributes as `first-word`. It applies said capitalization
+    to random words inside chunks.
+
\ No newline at end of file
diff --git a/protogenie/configs.py b/protogenie/configs.py
index 2e78bd9..44545e9 100644
--- a/protogenie/configs.py
+++ b/protogenie/configs.py
@@ -5,13 +5,13 @@
from .splitters import RegExpSplitter, LineSplitter, TokenWindowSplitter, FileSplitter, _SplitterPrototype
from .reader import Reader
-from .postprocessing import Disambiguation, ReplacementSet, Skip, PostProcessing, Clitic
+from .postprocessing import Disambiguation, ReplacementSet, Skip, PostProcessing, Clitic, Capitalize
from .toolbox import RomanNumeral
import datetime
from dataclasses import dataclass
Splitter = Type[_SplitterPrototype]
-PostProcessingClasses = [Disambiguation, ReplacementSet, Skip, RomanNumeral, Clitic]
+PostProcessingClasses = [Disambiguation, ReplacementSet, Skip, RomanNumeral, Clitic, Capitalize]
@dataclass
diff --git a/protogenie/postprocessing.py b/protogenie/postprocessing.py
index f967e63..fe09233 100644
--- a/protogenie/postprocessing.py
+++ b/protogenie/postprocessing.py
@@ -1,12 +1,18 @@
-if False:
- from .configs import CorpusConfiguration
import tempfile
-import regex as re
-from xml.etree.ElementTree import Element
-import csv
-from typing import List, ClassVar, Tuple, Dict
+import math
+import random
from abc import ABC, abstractmethod
from collections import namedtuple
+from xml.etree.ElementTree import Element
+import csv
+from typing import List, ClassVar, Tuple, Dict, Optional, TYPE_CHECKING, Union
+
+import regex as re
+
+if TYPE_CHECKING:
+ from .configs import CorpusConfiguration
+from .sentence_matchers import SentenceMatcherProto, SentenceRegexpMatcher
+Numeric = Union[int, float]
class PostProcessing(ABC):
@@ -26,6 +32,74 @@ def match_config_node(cls, node: Element) -> bool:
"""
return node.tag == cls.NodeName
+ def _modify_line(self, header: List[str], values: Optional[List[str]],
+ file_path: str, config: "CorpusConfiguration"):
+ """ Per-line hook used by `_single_line_modify_routine`; subclasses relying on that routine must override it
+
+ :param header: Column names read from the first line of the file
+ :param values: Cells of the current line, or None when the routine meets a blank line
+ :param file_path: Path of the file being rewritten
+ :param config: Corpus configuration
+ :returns: For non-blank lines, a mapping indexed by the names in `header` (the routine reads one value per
+ header name); the value returned for blank lines is ignored by the routine
+ :raises NotImplementedError: Always, in this base implementation
+ """
+ raise NotImplementedError
+
+ def _stop_chunk(self, line: Optional[Dict[str, str]]) -> bool:
+ """ Placeholder hook, not implemented in the base class
+
+ NOTE(review): no caller is visible in this part of the file; presumably it tells whether `line` closes the
+ current chunk -- confirm before relying on it.
+
+ :raises NotImplementedError: Always, in this base implementation
+ """
+ raise NotImplementedError
+
+ def _chunk_modify_routine(self, file_path: str, config: "CorpusConfiguration"):
+ """ Placeholder for a chunk-level rewriting routine, not implemented in the base class
+
+ NOTE(review): presumably the chunk-wise counterpart of `_single_line_modify_routine` -- confirm.
+
+ :raises NotImplementedError: Always, in this base implementation
+ """
+ raise NotImplementedError
+
+    def _scan_chunks(self,
+                     file_path: str, config: "CorpusConfiguration",
+                     sentence_matcher: Optional[SentenceMatcherProto]) -> Tuple[int, int]:
+        """ Counts chunk boundaries and token rows in a file
+
+        The first line is taken as the header. Every following line with as many cells as the header counts as
+        a token; every other line is handed to the matcher as `None`.
+
+        :param file_path: File to scan
+        :param config: Corpus configuration, used for its `column_marker`
+        :param sentence_matcher: Boundary detector; when missing, no chunk is counted
+        :returns: Tuple (number of chunks, number of tokens)
+        """
+        nb_chunks, nb_tokens = 0, 0
+        with open(file_path) as handle:
+            rows = (raw.strip().split(config.column_marker) for raw in handle)
+            header = next(rows, None)  # Stays None only when the file is empty, and then nothing is iterated
+            for cells in rows:
+                if len(cells) != len(header):
+                    # Not a regular row (typically a blank line): let the matcher decide on `None`
+                    if sentence_matcher:
+                        nb_chunks += sentence_matcher.match(header, None)
+                    continue
+
+                nb_tokens += 1
+                if sentence_matcher and sentence_matcher.match(header, cells):
+                    nb_chunks += 1
+
+        return nb_chunks, nb_tokens
+
+    def _single_line_modify_routine(self, file_path: str, config: "CorpusConfiguration"):
+        """ Rewrites a file in place, passing each line through `_modify_line`
+
+        Blank lines are copied verbatim (the hook is still called, with `None` as values), a non-blank first
+        line is kept as the header, and every other line is replaced by the hook's output, serialized in
+        header order.
+
+        :param file_path: File to rewrite
+        :param config: Corpus configuration, used for its `column_marker`
+        """
+        columns: List[str] = []
+
+        # The result is buffered in a temporary file and copied back once the source has been fully read
+        with tempfile.TemporaryFile(mode="w+") as buffer:
+            with open(file_path) as source:
+                for index, raw in enumerate(source):
+                    stripped = raw.strip()
+
+                    if not stripped:
+                        buffer.write(raw)
+                        self._modify_line(columns, None, file_path, config)
+                    elif index == 0:
+                        columns = stripped.split(config.column_marker)
+                        buffer.write(raw)
+                    else:
+                        cells = stripped.split(config.column_marker)
+                        row = self._modify_line(columns, cells, file_path=file_path, config=config)
+                        buffer.write(
+                            config.column_marker.join(row[name] for name in columns) + "\n"
+                        )
+
+            with open(file_path, "w") as target:
+                buffer.seek(0)
+                target.write(buffer.read())
+
class ApplyTo:
def __init__(self, source: str, target: List[str]):
@@ -62,7 +136,7 @@ def apply(self, file_path: str, config: "CorpusConfiguration"):
header: List[str] = []
for nb_line, line in enumerate(csv_reader): # The file should already have been open
if nb_line == 0:
- temp.write(config.column_marker.join(line+[self.disambiguation_key])+"\n")
+ temp.write(config.column_marker.join(line + [self.disambiguation_key]) + "\n")
header = line
continue
elif not line:
@@ -79,7 +153,7 @@ def apply(self, file_path: str, config: "CorpusConfiguration"):
lines[self.lemma_key] = self.match_pattern.sub("", lines[self.lemma_key])
else:
lines[self.disambiguation_key] = self.default_value
- temp.write(config.column_marker.join(list(lines.values()))+"\n")
+ temp.write(config.column_marker.join(list(lines.values())) + "\n")
with open(file_path, "w") as f:
temp.seek(0)
f.write(temp.read())
@@ -120,7 +194,7 @@ def apply(self, file_path: str, config: "CorpusConfiguration"):
header: List[str] = []
for nb_line, line in enumerate(csv_reader): # The file should already have been open
if nb_line == 0:
- temp.write(config.column_marker.join(line)+"\n")
+ temp.write(config.column_marker.join(line) + "\n")
header = line
continue
elif not line:
@@ -140,7 +214,7 @@ def apply(self, file_path: str, config: "CorpusConfiguration"):
else: # Otherwise, we just set the target value using this value
lines[target] = self.replacement_pattern
- temp.write(config.column_marker.join(list(lines.values()))+"\n")
+ temp.write(config.column_marker.join(list(lines.values())) + "\n")
with open(file_path, "w") as f:
temp.seek(0)
f.write(temp.read())
@@ -162,7 +236,7 @@ class Skip(PostProcessing):
NodeName = "skip"
def __init__(
- self, match_pattern: str, source: str
+ self, match_pattern: str, source: str
):
super(Skip, self).__init__()
self.match_pattern: re.Regex = re.compile(match_pattern)
@@ -177,7 +251,7 @@ def apply(self, file_path: str, config: "CorpusConfiguration"):
header: List[str] = []
for nb_line, line in enumerate(csv_reader): # The file should already have been open
if nb_line == 0:
- temp.write(config.column_marker.join(line)+"\n")
+ temp.write(config.column_marker.join(line) + "\n")
header = line
continue
elif not line:
@@ -190,7 +264,7 @@ def apply(self, file_path: str, config: "CorpusConfiguration"):
if self.match_pattern.search(lines[self.source]):
continue
- temp.write(config.column_marker.join(list(lines.values()))+"\n")
+ temp.write(config.column_marker.join(list(lines.values())) + "\n")
with open(file_path, "w") as f:
temp.seek(0)
@@ -214,7 +288,7 @@ class Clitic(PostProcessing):
Transfer = namedtuple("Transfer", ["col", "glue"])
def __init__(
- self, match_pattern: str, source: str, glue: str, transfers: List[Tuple[str, bool]]
+ self, match_pattern: str, source: str, glue: str, transfers: List[Tuple[str, bool]]
):
super(Clitic, self).__init__()
self.match_pattern: re.Regex = re.compile(match_pattern)
@@ -239,7 +313,7 @@ def apply(self, file_path: str, config: "CorpusConfiguration"):
modifications: List[Tuple[int, Dict[str, Tuple[str, str]]]] = []
for nb_line, line in enumerate(csv_reader): # The file should already have been open
if nb_line == 0:
- temp.write(config.column_marker.join(line)+"\n")
+ temp.write(config.column_marker.join(line) + "\n")
header = line
continue
elif not line:
@@ -253,7 +327,7 @@ def apply(self, file_path: str, config: "CorpusConfiguration"):
temp.write("\n".join([
config.column_marker.join(list(l.values()))
for l in sequence
- ])+"\n")
+ ]) + "\n")
sequence = []
modifications = []
continue
@@ -264,7 +338,7 @@ def apply(self, file_path: str, config: "CorpusConfiguration"):
if self.match_pattern.match(lines[self.source]):
modifications.append(
(
- len(sequence) - 1 -len(modifications),
+ len(sequence) - 1 - len(modifications),
{key: (keep, lines[key]) for (key, keep) in self.transfers}
)
)
@@ -293,3 +367,129 @@ def from_xml(cls, node: Element) -> "Clitic":
for tr in node.findall("transfer")
]
)
+
+
+class Capitalize(PostProcessing):
+    """ Applies capitalization strategies to content
+
+    Depending on its configuration, it capitalizes the first token of chunks and/or random tokens, and can
+    replace every uppercase letter with its lowercase form followed by `Marker`.
+    """
+    NodeName = "capitalize"
+    # Written as an escape sequence so the character survives any re-encoding of this file
+    Marker: str = "\U0001FA01"  # NEUTRAL CHESS QUEEN
+    RE_Upper: re.Regex = re.compile(r"(\p{Lu})")
+
+    def __init__(self, first_word: Numeric, first_letters: Numeric,
+                 column_token: str,
+                 column_lemma: Optional[str] = None,
+                 apply_unicode_marker: bool = False,
+                 sentence_matcher: Optional[SentenceMatcherProto] = None):
+        """
+        :param first_word: Proportion (0 to 1) of chunk-initial tokens to capitalize
+        :param first_letters: Proportion (0 to 1) of other tokens to capitalize
+        :param column_token: Name of the column holding the token form
+        :param column_lemma: Name of the column holding the lemma (stored, not used by this class yet)
+        :param apply_unicode_marker: Whether uppercase letters are replaced by lowercase + `Marker`
+        :param sentence_matcher: Detector of chunk boundaries, needed when `first_word` is not 0
+        """
+        super(Capitalize, self).__init__()
+        self.first_word: Numeric = first_word
+        self.first_letters: Numeric = first_letters
+
+        self.column_token: str = column_token
+        self.column_lemma: Optional[str] = column_lemma
+        self.apply_unicode_marker: bool = apply_unicode_marker
+        self.sentence_matcher: Optional[SentenceMatcherProto] = sentence_matcher
+        self.first_word_state: bool = True  # Variable representing the current status
+        #                                     (True = next word is a first word)
+
+        # Pre-shuffled decisions, indexed by file path, consumed one at a time while rewriting
+        self._files_chunks: Dict[str, List[bool]] = {}
+        self._files_tokens: Dict[str, List[bool]] = {}
+
+    @staticmethod
+    def parse_when(value: str, ratio: Optional[str]) -> Numeric:
+        """ Converts the `when`/`ratio` attribute pair into a proportion
+
+        :param value: Content of the `when` attribute
+        :param ratio: Content of the `ratio` attribute, if any
+        :returns: 1 for "always", 0 for "never", 0.5 for "random", otherwise the parsed ratio
+        :raises ValueError: If the ratio is missing, not a number, or not strictly between 0 and 1
+        """
+        if value == "always":
+            return 1
+        elif value == "never":
+            return 0
+        elif value == "random":
+            return 0.5
+        elif ratio:
+            try:
+                parsed = float(ratio)
+            except (TypeError, ValueError):
+                raise ValueError(
+                    "Your ratio value is probably wrong. It must be a float with 0.0 < ratio < 1.0 "
+                    "(Found: {})".format(ratio)
+                ) from None
+            if 1.0 > parsed > .0:
+                return parsed
+            raise ValueError(
+                "Your ratio value is out of range. It must be a float with 0.0 < ratio < 1.0 "
+                "(Found: {})".format(ratio)
+            )
+        raise ValueError("Invalid parameters for a ratio or an application")
+
+    @classmethod
+    def parse_node_including_when(cls, node: Element, name: str) -> Tuple[Numeric, Optional[Element]]:
+        """ Reads the proportion configured on the first child `name` of `node`
+
+        :returns: Tuple (proportion, child element), or (0, None) when there is no such child
+        """
+        target = node.findall("./{name}".format(name=name))
+        if target:
+            return cls.parse_when(target[0].attrib["when"], target[0].attrib.get("ratio")), target[0]
+        return 0, None
+
+    @staticmethod
+    def _draw(pool: List[bool], ratio: Numeric) -> bool:
+        """ Takes the next pre-computed decision, or draws a fresh one when the pool is exhausted
+
+        The pool of chunk decisions is sized on the number of detected boundaries, which is one short when
+        the file does not end on a boundary: falling back on a random draw avoids popping an empty list.
+        """
+        if pool:
+            return pool.pop()
+        return random.random() < ratio
+
+    def _modify_line(self, header: List[str], values: Optional[List[str]],
+                     file_path: str, config: "CorpusConfiguration") -> Dict[str, str]:
+        """ Applies the capitalization strategies to a single line
+
+        :param header: Column names
+        :param values: Cells of the line, None for a blank line
+        :param file_path: File being rewritten, used to find the decision pools filled by `apply`
+        :param config: Corpus configuration (unused here)
+        :returns: Column -> value mapping; empty for blank or malformed lines
+        """
+        if self.first_word and self.sentence_matcher is not None and self.sentence_matcher.match(header, values):
+            # Boundary lines are left untouched, the next token starts a chunk.
+            self.first_word_state = True
+            # Blank separators come in as None and cannot be zipped with the header
+            return dict(zip(header, values)) if values else {}
+
+        if not values or len(header) != len(values):
+            return {}
+
+        line = dict(zip(header, values))
+        token = line[self.column_token]
+
+        # Sentence starts
+        if self.first_word > .0 and self.first_word_state \
+                and self._draw(self._files_chunks[file_path], self.first_word):
+            token = token.capitalize()
+            # The token pool has one decision per token row: consume this row's decision as well
+            if self.first_letters:
+                self._draw(self._files_tokens[file_path], self.first_letters)
+        elif self.first_letters > .0 and self._draw(self._files_tokens[file_path], self.first_letters):
+            token = token.capitalize()
+
+        if self.apply_unicode_marker:
+            token = self.RE_Upper.sub(self._replace_caps, token)
+
+        line[self.column_token] = token
+        self.first_word_state = False
+        return line
+
+    def _replace_caps(self, value):
+        """ Regexp callback: lowercases the matched uppercase letter and appends `Marker` """
+        return value.group().lower() + self.Marker
+
+    @staticmethod
+    def _transform_to_bool_list(count: int, ratio: Numeric) -> List[bool]:
+        """ Builds a shuffled list of `count` decisions, of which a `ratio` proportion is True """
+        if ratio == 1.0:
+            return [True] * count
+        elif ratio == .0:
+            return [False] * count
+        else:
+            positives = min(round(count * ratio), count)
+            negatives = count - positives
+            out = [True] * positives + [False] * negatives
+            random.shuffle(out)
+            return out
+
+    def apply(self, file_path: str, config: "CorpusConfiguration"):
+        """ Capitalizes the content of `file_path` in place
+
+        :param file_path: File to rewrite
+        :param config: Corpus configuration
+        """
+        # We scan the files
+        chunks, tokens = self._scan_chunks(file_path, config, sentence_matcher=self.sentence_matcher)
+
+        # We store the dispatch of booleans
+        if self.first_word > .0:
+            self._files_chunks[file_path] = self._transform_to_bool_list(chunks, self.first_word)
+        if self.first_letters > .0:
+            self._files_tokens[file_path] = self._transform_to_bool_list(tokens, self.first_letters)
+
+        # Each file starts a new chunk, whatever state the previous file left behind
+        self.first_word_state = True
+
+        self._single_line_modify_routine(file_path=file_path, config=config)
+
+    @classmethod
+    def from_xml(cls, node: Element) -> "Capitalize":
+        """ Builds the post-processing from its XML configuration node
+
+        :raises ValueError: If `first-word` is active but holds no `sentence-marker` child
+        """
+        first_word, first_word_elem = cls.parse_node_including_when(node, "first-word")
+        first_letters, _ = cls.parse_node_including_when(node, "first-letters")
+        sentence_marker = None
+        if first_word != .0:
+            marker_node = first_word_elem.find("./sentence-marker")
+            if marker_node is None:
+                raise ValueError("You forgot to use a sentence marker.")
+            sentence_marker = SentenceMatcherProto.from_xml(marker_node)
+
+        # The documentation names this attribute `caps-to-utf8-marker`: both spellings are accepted
+        use_marker = node.attrib.get(
+            "utf8-marker-for-caps",
+            node.attrib.get("caps-to-utf8-marker", "true")
+        )
+
+        return cls(
+            first_word=first_word,
+            first_letters=first_letters,
+            sentence_matcher=sentence_marker,
+            apply_unicode_marker=use_marker.lower() == "true",
+            column_token=node.attrib["column-token"],
+            column_lemma=node.attrib.get("column-lemma")
+        )
diff --git a/protogenie/schema.rng b/protogenie/schema.rng
index 4b55717..de0ad78 100644
--- a/protogenie/schema.rng
+++ b/protogenie/schema.rng
@@ -69,9 +69,27 @@
+
+
+
+
+
+
+
+
+
+ NFD
+ NFKD
+ NFC
+ NFKC
+
+
+
+
+
Deals with enclitics
@@ -102,6 +120,65 @@
+
+
+
+
+ always
+ never
+ random
+
+
+
+
+
+ ratio
+
+
+
+ Needs to be smaller or equal to 1
+
+
+
+
+
+
+ A node for setting sentence recognition
+
+
+
+
+ empty_line
+
+
+
+
+
+
+ Define capitalization strategies
+
+
+
+
+
+ Convert capitals to lower-case + UTF8 marker. It inserts the 🨁 character (U+1FA01, NEUTRAL CHESS QUEEN) after each lower-cased capital
+
+
+
+
+
+
+ Capitalize first word
+
+
+
+
+ Capitalize first letter
+
+
+
+
+
Functions pre-added to Protegeneia
@@ -273,6 +350,16 @@
+
+ Regexp choice that applies a matchPattern on given column
+
+
+ regexp
+
+
+
+
+
@@ -287,11 +374,7 @@
-
- regexp
-
-
-
+
Each empty line represent a new passage
diff --git a/protogenie/sentence_matchers.py b/protogenie/sentence_matchers.py
new file mode 100644
index 0000000..9cbd998
--- /dev/null
+++ b/protogenie/sentence_matchers.py
@@ -0,0 +1,43 @@
+__all__ = ["SentenceMatcherProto", "SentenceRegexpMatcher", "SentenceEmptyLineMatcher"]
+
+from abc import ABC
+from xml.etree.ElementTree import Element
+from typing import Optional, List
+
+import regex as re
+
+
+class SentenceMatcherProto:
+    """ Interface of the objects telling whether a line marks a sentence (chunk) boundary """
+
+    def match(self, headers: List[str], values: Optional[List[str]]):
+        """ Tells whether the line is a boundary
+
+        :param headers: Column names
+        :param values: Cells of the line, None for a blank line
+        :raises NotImplementedError: Always, subclasses provide the behaviour
+        """
+        raise NotImplementedError()
+
+    @classmethod
+    def from_xml(cls, node: Element) -> "SentenceMatcherProto":
+        """ Builds the matcher described by a `sentence-marker` node
+
+        :param node: The `sentence-marker` element itself
+        :returns: A regexp matcher when `@name` is "regexp", an empty-line matcher otherwise
+        """
+        if node.attrib.get("name") == "regexp":
+            # `node` already is the sentence-marker element: it is handed over as is instead of being
+            # searched for a nested sentence-marker child, which never exists
+            return SentenceRegexpMatcher.from_xml(node)
+        return SentenceEmptyLineMatcher()
+
+
+class SentenceRegexpMatcher(SentenceMatcherProto):
+    """ Boundary detector matching a regular expression against one column of the line """
+
+    def __init__(self, regexp: re.Regex, column: str):
+        """
+        :param regexp: Compiled pattern, applied with `match` (anchored at the start of the value)
+        :param column: Name of the column the pattern is applied to
+        """
+        self.regexp: re.Regex = regexp
+        self.column: str = column
+
+    def match(self, headers: List[str], values: Optional[List[str]]):
+        """ Tells whether the configured column of the line matches the pattern
+
+        :param headers: Column names
+        :param values: Cells of the line, None for a blank line
+        :returns: False for blank lines, otherwise whether the pattern matches
+        """
+        if not values:
+            # Blank lines are passed as None: there is nothing to match against
+            return False
+        data = dict(zip(headers, values))
+        return self.regexp.match(data[self.column]) is not None
+
+    @classmethod
+    def from_xml(cls, node: Element) -> "SentenceRegexpMatcher":
+        """ Builds the matcher from a `sentence-marker` node
+
+        The documented attributes are `@matchPattern` and `@source`; the previous spellings `@regexp` and
+        `@column` are still read when the documented ones are absent.
+
+        :raises KeyError: If neither spelling of an attribute is present
+        """
+        def read(documented: str, legacy: str) -> str:
+            for key in (documented, legacy):
+                if key in node.attrib:
+                    return node.attrib[key]
+            raise KeyError(documented)
+
+        return cls(
+            regexp=re.compile(read("matchPattern", "regexp")),
+            column=read("source", "column")
+        )
+
+
+class SentenceEmptyLineMatcher(SentenceMatcherProto):
+    """ Boundary detector treating blank lines, and lines whose size differs from the header, as boundaries """
+
+    def __init__(self):
+        # Stateless matcher: nothing to configure
+        pass
+
+    def match(self, headers: List[str], values: Optional[List[str]]):
+        """ Returns True when `values` is missing/empty or does not have one cell per header """
+        if values:
+            return len(values) != len(headers)
+        return True
diff --git a/tests/test_config/capitalize.xml b/tests/test_config/capitalize.xml
new file mode 100644
index 0000000..5955ca6
--- /dev/null
+++ b/tests/test_config/capitalize.xml
@@ -0,0 +1,32 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/tests/test_data/capitalize.tsv b/tests/test_data/capitalize.tsv
new file mode 100644
index 0000000..786b6ff
--- /dev/null
+++ b/tests/test_data/capitalize.tsv
@@ -0,0 +1,550 @@
+lem_gfdqnu pos_nwuzjq tok_pdsbeb
+lem_jtuhwc pos_qkvpdq tok_ebxvhz
+lem_guxqgv pos_ehupak tok_jdswrg
+lem_sjggqt pos_htayhs tok_oyrpif
+lem_vhaocb pos_bnvexi tok_uwqjtj
+lem_wljofl pos_uwlevn tok_xijnla
+lem_pyzxcr pos_wcngiu tok_qqqzbb
+lem_vmhdhe pos_injexe tok_kjthhu
+lem_mzipht pos_wihweo tok_wmhaey
+lem_uvxkzg pos_pufjla tok_ixrckf
+
+lem_jtphzk pos_pmcitq tok_dnkwur
+lem_dzzgwa pos_gnginp tok_smqdlt
+lem_dfvmoh pos_thsqcw tok_czqghu
+lem_ylldjx pos_vpnzdz tok_mgqdog
+lem_gdcymo pos_qizuyj tok_mpoyge
+lem_blswmx pos_ofewxd tok_yrhazm
+lem_vbmwsn pos_pcbrqn tok_hyndvw
+lem_rsigog pos_hsbjaa tok_bncvto
+lem_uizcsj pos_gxyohx tok_ckqjwy
+lem_anreid pos_xofyih tok_ivekmd
+
+lem_mfjurv pos_iuvqwl tok_rtlxxu
+lem_udnmaj pos_kxiyzl tok_wuzxxc
+lem_clmsdl pos_rqplkq tok_welcvi
+lem_vyseky pos_dcpbem tok_heyjym
+lem_hxbufp pos_wxpsqy tok_xurumg
+lem_zbwpxc pos_bmluia tok_ykqaom
+lem_gfeafu pos_xhjhwu tok_hpbiur
+lem_ommham pos_gzvjcb tok_boehfy
+lem_lxvuzv pos_foadfp tok_xpfyxf
+lem_eckpfg pos_rlttsv tok_yyauxh
+
+lem_ehejdy pos_fkefzu tok_cowfrs
+lem_wzndfl pos_fwxmlr tok_fxjlax
+lem_uirpeg pos_cubpto tok_gxppbk
+lem_ldmmwy pos_mzfrsw tok_kceqfn
+lem_oeehst pos_pzjxga tok_xcrmgg
+lem_waehpw pos_hqwknv tok_iuhaqi
+lem_yqmcym pos_welheb tok_ozaipr
+lem_uizoap pos_wtsecz tok_xeklhd
+lem_elqjaf pos_xizycu tok_yfwiys
+lem_oxbpma pos_hvejep tok_xgmsfz
+
+lem_kvonsd pos_acwxgm tok_ssbbsk
+lem_drxcvj pos_oxxoih tok_hqwmpn
+lem_frvtls pos_yuvdbn tok_nvafrq
+lem_ihzvwr pos_huxeyz tok_uavkty
+lem_pukxrr pos_ngcjfm tok_kiezrp
+lem_wjmgcn pos_nyvleq tok_wpdkuk
+lem_ixakwb pos_qedopt tok_hqmzxa
+lem_fyjaic pos_velfce tok_ljcyrl
+lem_ithvcs pos_qyjmzo tok_lgpfek
+lem_olqbjx pos_hhaooc tok_ixqaah
+
+lem_avfwrs pos_jecmuf tok_ezidmw
+lem_zlaiff pos_wvbwgv tok_hpurdm
+lem_wyqkuu pos_bwxveg tok_txoyqf
+lem_zropgk pos_yrmmgv tok_hwhagm
+lem_uueify pos_nnvfxt tok_zgzpjp
+lem_fpwefq pos_uolyfd tok_bdoanb
+lem_skyiuu pos_upgzkh tok_pmyggs
+lem_ohjrbd pos_snnezp tok_rczdgg
+lem_xkccmp pos_oefqfz tok_paxfjn
+lem_njkemd pos_fslnyu tok_ykhung
+
+lem_qiozfx pos_aztlxz tok_zigmkf
+lem_djxxdu pos_imssir tok_oaqwmu
+lem_mvbuiq pos_wnltzf tok_apynxb
+lem_zinbsr pos_guiiql tok_itjxly
+lem_pppsal pos_fgmnxp tok_eouvcv
+lem_zlwwzn pos_sbrkip tok_yairqa
+lem_kxafmo pos_tahrqi tok_haamwe
+lem_kuqyxs pos_nnxnis tok_joavzm
+lem_bfdfkr pos_dkjomn tok_wmppol
+lem_ozcdjl pos_livcam tok_jolxma
+
+lem_gcqjba pos_oexwyf tok_doqyhm
+lem_fntvul pos_aaquyc tok_xwrhvp
+lem_umdplq pos_ktbxir tok_ekqehw
+lem_jlwmol pos_fwlyvv tok_twzgrz
+lem_uxcskg pos_fpktiw tok_piuzzu
+lem_hsexdj pos_iwfxgq tok_jvlwgf
+lem_atoxnh pos_hbcazn tok_jsqqxs
+lem_annswt pos_dzaaev tok_ajctrg
+lem_ifofsa pos_mudsye tok_urkpiu
+lem_xikciw pos_syvqqc tok_etbvum
+
+lem_prsbxb pos_ynrcao tok_zddvgs
+lem_onskyy pos_lfzsnd tok_wvgtey
+lem_xuxtls pos_tkedxb tok_hgrctv
+lem_tpmfdm pos_lipmdq tok_xbgrey
+lem_ktqkgr pos_nxluxn tok_bithif
+lem_zkhxhp pos_mbsoji tok_amddgu
+lem_kyfaws pos_eaftpm tok_bhwvlh
+lem_wzyjdt pos_anpbgf tok_oynffv
+lem_ydgmla pos_xgdoql tok_nnwcrg
+lem_hmmkut pos_djuwct tok_qqjmhm
+
+lem_fjgkar pos_tqhsjt tok_iikqst
+lem_wuwaqx pos_rhaxfn tok_ebpybx
+lem_vijhlp pos_ezddcb tok_exkcff
+lem_wpmesr pos_wyzeof tok_ouujmp
+lem_rlsmrl pos_hrlhmg tok_bsmjps
+lem_bawppm pos_tppuyt tok_ptdpzk
+lem_ffvgll pos_qoyslz tok_evadlh
+lem_xjhrsf pos_mmtjot tok_efiswk
+lem_mznezl pos_ygixbi tok_epdico
+lem_mwjnak pos_rqjenm tok_aaghdw
+
+lem_cfugrn pos_thkymx tok_gyagep
+lem_cravnv pos_hixfyo tok_bhanmo
+lem_rdplck pos_lmifvp tok_ngmgfa
+lem_qtuhak pos_zchxwy tok_dhtcal
+lem_qeqjwv pos_dkmfbt tok_rdfsym
+lem_pjkmak pos_tftbhs tok_neukiv
+lem_pcetew pos_pvulyn tok_hqfkyn
+lem_ajoxth pos_zruhqw tok_ebsfpd
+lem_osvhzh pos_scckey tok_dishno
+lem_hyursx pos_rshgty tok_ysxzwq
+
+lem_lqpirp pos_jbpolq tok_vazafx
+lem_rvdeuo pos_dzmccw tok_gegpks
+lem_jwqffg pos_lnxodl tok_jgxgpz
+lem_rjwuwy pos_cadats tok_yeqiow
+lem_ueesqu pos_nwiiwv tok_pohhea
+lem_emfuar pos_jvirac tok_abqqji
+lem_ogfjjh pos_szsuza tok_tabgvm
+lem_pndamg pos_rwgldl tok_tqudai
+lem_iywpiq pos_djndzr tok_hxjklu
+lem_vqcqqo pos_onxyxw tok_zugext
+
+lem_ajemad pos_abjkzf tok_rjevuf
+lem_bxqhej pos_rtzgmx tok_irrktm
+lem_druavn pos_nujegv tok_ectxkw
+lem_lyasdg pos_mnutvg tok_fwvvrc
+lem_sfznmk pos_lfxwgr tok_szaavf
+lem_sbsjdd pos_dozpjr tok_zbzknd
+lem_kvgles pos_fhogxy tok_taccsm
+lem_fcqtpx pos_tmgcrb tok_qbkikd
+lem_psvbfk pos_jqyskj tok_rduell
+lem_fbgnsk pos_itvrlr tok_urfspk
+
+lem_zlnksr pos_qniadq tok_mnhifo
+lem_gewmtg pos_iooisd tok_mkedwh
+lem_vppxcu pos_mfhaax tok_cqjpij
+lem_gvifwv pos_xkxkfm tok_hvbudb
+lem_sladpx pos_qshzaw tok_ydnmrs
+lem_bnopal pos_rxxfww tok_gmbium
+lem_donmkk pos_vewkdf tok_nynzer
+lem_yeemuc pos_nnurus tok_edvwtu
+lem_xgvluo pos_mmkxck tok_ydmfge
+lem_jutsbb pos_sqftce tok_vzawpp
+
+lem_jhqoqs pos_ihmyxb tok_yfyrea
+lem_nkvlnx pos_aqvhio tok_nnelpw
+lem_gxkkoc pos_daltlm tok_jorbqw
+lem_ulrtdw pos_srnqjs tok_zfvikl
+lem_uabzia pos_toagij tok_qxtcpy
+lem_tzwyub pos_dtjufi tok_ixvpvt
+lem_fusqdv pos_jtqthi tok_hzukad
+lem_ndyycw pos_zvyagi tok_dpnsss
+lem_fgoitw pos_wmaosb tok_sfkntx
+lem_ujsciq pos_xshrpc tok_ebzffk
+
+lem_xutjil pos_adclgo tok_upezoo
+lem_ixaanl pos_llfvjy tok_dtsfgg
+lem_phwitw pos_pgioae tok_cyjwtq
+lem_qmcwig pos_ecytfd tok_ufagep
+lem_bbflme pos_iqjpts tok_czdsri
+lem_deneph pos_igdlib tok_bmaizm
+lem_ocnfid pos_saklbk tok_xokkhl
+lem_geiuof pos_wstmda tok_nlcyec
+lem_iwuxxw pos_dgzjoy tok_cdbnnm
+lem_nvxuvz pos_uhodcc tok_dnvvwg
+
+lem_cqnsbo pos_pwadgr tok_kortuj
+lem_anvqxk pos_yccvjf tok_ampwll
+lem_nwvyas pos_tbydky tok_mrcayc
+lem_cyptsq pos_iphsup tok_kztqgr
+lem_fxsbex pos_npemmd tok_davhpt
+lem_aekkky pos_hguizw tok_jpklwv
+lem_nlprwl pos_pvinyd tok_gmdagq
+lem_xxobjw pos_nkrsfd tok_bujtdl
+lem_zxokri pos_uirync tok_rhnaim
+lem_ehavud pos_unssnr tok_azrcug
+
+lem_laagva pos_wbmkga tok_plrqcy
+lem_tnabku pos_tiindw tok_ogxljr
+lem_cmbgsk pos_cdywvr tok_xcbwrw
+lem_hnnnij pos_qrfymf tok_crwhuq
+lem_iqzezq pos_ykndtn tok_irqwns
+lem_skzyai pos_mxlyrs tok_wlurhe
+lem_fnxwvk pos_bnnijw tok_bkkmji
+lem_svgooe pos_krgptc tok_jrqtry
+lem_assxwo pos_omdyzx tok_sgvsxv
+lem_hvdgwz pos_vrgeow tok_tonrvq
+
+lem_ticbwy pos_snivqr tok_lyuygc
+lem_dsmjsj pos_rbhzpc tok_wxixkc
+lem_skclnj pos_wibgje tok_njlufj
+lem_atelfh pos_gywost tok_jpbieo
+lem_idetrp pos_mhvsob tok_abulej
+lem_tcalax pos_nedduk tok_paoezt
+lem_bncazo pos_amfnxh tok_jxdmpq
+lem_nwdaon pos_zvwvdk tok_jtzspc
+lem_kbfksu pos_gzgdoq tok_pkvgoy
+lem_gmzvkk pos_irzvnj tok_cnseuw
+
+lem_tzlsch pos_bkjvqo tok_qqfszl
+lem_fytfer pos_jlyhci tok_hljuwb
+lem_fqzmxm pos_ucdwkn tok_bltjkt
+lem_zlavqs pos_pnauav tok_fqliys
+lem_upyjpk pos_ljgesp tok_ebbmgl
+lem_goqobs pos_bmuenn tok_fncglz
+lem_hudmem pos_kpbjpq tok_vimhsd
+lem_fzifbh pos_aipmub tok_ntxgsc
+lem_krzzbl pos_scfjmu tok_pbiemy
+lem_ptegqc pos_doglkq tok_ipmpiq
+
+lem_klmyfi pos_occvyo tok_tshwre
+lem_hikvdz pos_edbhdc tok_nnldcw
+lem_ypdhrw pos_mtjrcc tok_mbcqcg
+lem_fqksnl pos_jwfxnp tok_cypayv
+lem_evcfsi pos_dttldp tok_webnbk
+lem_ibyzvt pos_qewpgj tok_yrjlhg
+lem_qkgrvm pos_ktptyv tok_kadqgf
+lem_nxcqvc pos_wdlzbr tok_froxou
+lem_nhvqen pos_kkfevq tok_survdh
+lem_moouml pos_wzpyxo tok_bpqtpm
+
+lem_ojplcl pos_zyszct tok_zgmelo
+lem_hgqjnp pos_ouflgq tok_fxjxpi
+lem_djfjop pos_ziljar tok_xmpmcq
+lem_jeoykt pos_xksutm tok_jfoyoq
+lem_hequrp pos_objiod tok_xpcixd
+lem_bjodxt pos_ccrnem tok_oqibha
+lem_wbrifc pos_jlmhnq tok_xgyutd
+lem_rdgoyk pos_cleaau tok_htrzdi
+lem_ukwxbt pos_oxixux tok_fjwoar
+lem_bffybk pos_qgiflp tok_orilxp
+
+lem_dzfpzx pos_mfzxlp tok_pehmff
+lem_hfqgig pos_nkjvhi tok_ouofwp
+lem_ssavei pos_yosrny tok_ptcybg
+lem_peahtp pos_mzlfjg tok_pyiaej
+lem_bthzlb pos_mtfqms tok_edqcrm
+lem_tifmlv pos_svkguv tok_wnllfv
+lem_iajvko pos_qihveq tok_zoeylb
+lem_rnmfta pos_jqhmhr tok_qnfefw
+lem_zbvoet pos_pyqtzr tok_jvvlfu
+lem_vxiant pos_uollaq tok_hmlkab
+
+lem_pvzaap pos_kwjlav tok_hloelw
+lem_ineheu pos_chtsfm tok_oriwno
+lem_mhwkcz pos_wbjycg tok_ajjtyn
+lem_dhkidv pos_mnrlqg tok_dydyzf
+lem_uzerdx pos_uwtnpj tok_nfrasy
+lem_kimtmg pos_dosyzo tok_ilmbes
+lem_kfgynq pos_vrpcgt tok_hlybfg
+lem_pnvdmh pos_fqinev tok_xfooor
+lem_jsjzaj pos_risrfh tok_sdqucs
+lem_jlaejs pos_psudfw tok_hycysr
+
+lem_qgqsct pos_ksspir tok_csbvjf
+lem_rmnpfz pos_dyvrkq tok_vocljk
+lem_qmappz pos_uhfxyg tok_wrldjy
+lem_fdydmo pos_duwjng tok_esvrln
+lem_tyifjh pos_ywyzqq tok_nnpkto
+lem_rivwnz pos_ublpfh tok_vfvtpx
+lem_qhxgdc pos_oewtgi tok_fugmmi
+lem_ftfsgs pos_tfirgw tok_knunat
+lem_chixfg pos_negxro tok_rdcjdx
+lem_uzqrej pos_uvposr tok_uohnip
+
+lem_jxemra pos_rkbrbk tok_ifhxce
+lem_olzhtv pos_jfrywh tok_yhvihd
+lem_ydgohk pos_hszrqx tok_tlarpc
+lem_zqglyf pos_ecedbz tok_lemvxl
+lem_ypjaxo pos_zfqntt tok_rqbccq
+lem_swxsyq pos_wlgmcg tok_gdkilr
+lem_arpdhf pos_ghimin tok_logvyv
+lem_xmzdmn pos_dmfrit tok_edstut
+lem_uorbwq pos_sftqsm tok_sanfoa
+lem_pczgzy pos_qoezvl tok_zaptlx
+
+lem_lfxiwb pos_djgaer tok_oivhxm
+lem_gkojxc pos_emybwc tok_yvptil
+lem_jkaufn pos_ohlrgi tok_izmquh
+lem_bbwrkq pos_glsqyo tok_iuzqwl
+lem_dnpbzc pos_qxptxb tok_qwyfem
+lem_lewjlv pos_ywqhum tok_ixjyol
+lem_mqqfli pos_wqsuny tok_etheob
+lem_bmlwcn pos_gaxxdc tok_gxhrkp
+lem_ogdatb pos_povwgi tok_bdhwqb
+lem_ywllkd pos_reegdz tok_bhhlgo
+
+lem_nlllvc pos_ifwccy tok_hpofjd
+lem_rvbtak pos_zkvflq tok_szpltj
+lem_jxgesb pos_fiytll tok_mbrmdl
+lem_otawdu pos_kjdjbp tok_pykwfn
+lem_ijfzdv pos_ghayen tok_bwnkny
+lem_zpvgcp pos_nlkqey tok_nwstwc
+lem_snrvbm pos_yssbop tok_khlcdg
+lem_tiesze pos_mtvjen tok_sdvzuv
+lem_ctsgno pos_dawnws tok_pofuep
+lem_hjnwef pos_fprfsp tok_lsjguh
+
+lem_fxbmid pos_cobxim tok_mekzbl
+lem_ovmyis pos_mpqdzw tok_nijxgd
+lem_ooscrj pos_bhnasz tok_efzveb
+lem_jjrcaq pos_yrjlnz tok_pxoyxj
+lem_zruscz pos_jqlsfz tok_xgkxog
+lem_yetvik pos_havmzy tok_hpnpph
+lem_viyswd pos_kjsboc tok_frehzz
+lem_njrseq pos_nizvov tok_zwqtaw
+lem_cyzlqz pos_iuvhcf tok_byybvp
+lem_hatdhz pos_jxrypo tok_dcszlg
+
+lem_dzqrub pos_xcggki tok_qxdmyl
+lem_jgsfzd pos_lcyjne tok_ymorme
+lem_zrahry pos_ykjkgs tok_iuyyoz
+lem_ujqjpc pos_mwkvez tok_zavudf
+lem_ylyero pos_xkqvdv tok_kueygb
+lem_akrtkf pos_upphpq tok_lozazk
+lem_tynjxv pos_hnfisf tok_nlfpod
+lem_aynhvz pos_jhfdez tok_vjwgrx
+lem_dvrvoz pos_ghbton tok_eijeqe
+lem_rwnckj pos_qxtbrp tok_hnymqy
+
+lem_yeezsf pos_edgtfl tok_huepha
+lem_fnbmwa pos_xexppm tok_jnbhtt
+lem_cdvath pos_neusbc tok_goeiev
+lem_gzxbkl pos_wkwjrs tok_nrcybz
+lem_aalrba pos_bziloj tok_vmhqob
+lem_ubyqps pos_wgtscc tok_djigqo
+lem_etcvry pos_viqkgx tok_ccwsez
+lem_rwmldm pos_gpqhwa tok_fdodho
+lem_chcwrl pos_srwglr tok_rvdfge
+lem_hevyyu pos_kmrwse tok_jqykzl
+
+lem_dgrwgx pos_ekpepq tok_nbaaxm
+lem_quehtr pos_pbtwtz tok_wtaeak
+lem_xjacfo pos_jvycod tok_yklfvp
+lem_tqpycj pos_ehfdln tok_exkbvj
+lem_rqzwuj pos_cgcqdj tok_syzzow
+lem_hdxrwc pos_nbhrgt tok_sxvjlo
+lem_sfvquy pos_bevjtv tok_zetzen
+lem_oexsez pos_sbasan tok_mdcsdi
+lem_lnszds pos_hhfxuc tok_vlgtji
+lem_miujhn pos_frqxep tok_gedpqp
+
+lem_saahbi pos_wxfkrp tok_tegsiq
+lem_zzqnzm pos_vdemgo tok_prrcap
+lem_aemczs pos_fhaeff tok_txgkqe
+lem_kwhznc pos_syakvr tok_kogsme
+lem_vwkozu pos_rijwtb tok_usowkw
+lem_fnzqji pos_katfwf tok_kiuttf
+lem_msucrn pos_lzhcej tok_jeiecc
+lem_iheaua pos_retrne tok_iukohv
+lem_hjrleb pos_eppbih tok_owrxoo
+lem_juavaj pos_ukicbx tok_ycgtfl
+
+lem_prtjto pos_bzegei tok_uyblvv
+lem_scseuz pos_titmtm tok_lhmzbp
+lem_oixgqw pos_yyfmko tok_jhojkh
+lem_uyeqjr pos_hljdgn tok_mljgaj
+lem_ddxibu pos_akoluc tok_bovioq
+lem_cehnmv pos_qwdqtn tok_dtvkvt
+lem_xuxfji pos_zlsndz tok_qiogqv
+lem_jnytve pos_gfkxlt tok_inlzat
+lem_zfhbxe pos_xobyju tok_lcbacs
+lem_sjlbas pos_ttfjul tok_sqkwlg
+
+lem_awjirn pos_ouhwax tok_mqsghi
+lem_ofeegs pos_smxjgx tok_rfolii
+lem_atswby pos_fenkvc tok_gqwikq
+lem_pgxdiw pos_dsrwjc tok_tflanb
+lem_qzhmel pos_rhztps tok_qluigd
+lem_bvdbmp pos_zncjcd tok_vsuewx
+lem_sgxcxc pos_epblbc tok_dcmpck
+lem_xdvgiz pos_doaoyj tok_mpuqew
+lem_odyrfg pos_nhhexe tok_pdzelw
+lem_ylzdgt pos_csyilk tok_bppluv
+
+lem_ldleif pos_nqrjxg tok_llyips
+lem_kymady pos_kcjbvp tok_xrcbnt
+lem_kklwpq pos_wfievh tok_nyxktk
+lem_yioeks pos_syomgn tok_nsymqf
+lem_uopydm pos_ptjzva tok_giubnu
+lem_hhyqgy pos_gvwdcy tok_kvkuks
+lem_qrztso pos_clcsna tok_vaewey
+lem_nhhwzj pos_ezgloi tok_eblrbl
+lem_yyjamz pos_xercqz tok_jnosbg
+lem_cojxgh pos_zingfl tok_wmpjhu
+
+lem_luxbuk pos_cxonar tok_cwplgr
+lem_qatioz pos_dylzyf tok_pcapsv
+lem_dzufzk pos_ytspme tok_gxditb
+lem_yubxcy pos_uzwutg tok_sjajgt
+lem_gcozwx pos_beyboh tok_kktcrs
+lem_hanobv pos_qnsyiv tok_bsfxkz
+lem_elkdde pos_wzrtnd tok_hoxyrh
+lem_jzdimc pos_zkijnh tok_zlvbkr
+lem_hmmmlv pos_lnlfea tok_jjbaaz
+lem_tzqmkq pos_vjhhxt tok_ruslel
+
+lem_mjqeom pos_gebztq tok_aerazk
+lem_upfusg pos_uymcnl tok_czdbnp
+lem_hllsio pos_adizyc tok_qphapl
+lem_awlgsj pos_fszhpv tok_sxsdzv
+lem_xeuaam pos_ddjbrb tok_luddyr
+lem_tpybvy pos_bbopwa tok_edwrrd
+lem_vgoehd pos_xuypjw tok_efglsk
+lem_zzfczt pos_vjatsq tok_emycex
+lem_szcxpt pos_nojbqn tok_edhumk
+lem_ispltv pos_tsgpnu tok_rpcvjq
+
+lem_mbkvhi pos_smppuu tok_pxndhj
+lem_meivyh pos_gptann tok_sdjffp
+lem_nxascw pos_thoscw tok_pzwxko
+lem_xphiue pos_mubpmk tok_ddwxhy
+lem_vkrwdz pos_eehypc tok_duejpg
+lem_zjfyxc pos_zxizul tok_hdcxcl
+lem_mulutt pos_oqcxhb tok_htymmg
+lem_xjnxbv pos_hvvefp tok_waszok
+lem_iiuvxl pos_ugokos tok_hajzte
+lem_slueez pos_doxobb tok_scxheq
+
+lem_oqipqu pos_wirxjd tok_xoddxt
+lem_wynqzb pos_gzuzum tok_yawpve
+lem_lwighy pos_ghtfcr tok_hudyiu
+lem_jrycuz pos_nlfuwx tok_mdjomx
+lem_nzhlcs pos_vnbobw tok_lsiqpr
+lem_djmiqw pos_dqnjft tok_ycwdbr
+lem_qbiifh pos_zsejfw tok_iohtrz
+lem_gquaos pos_vrrfeh tok_goglgo
+lem_sljycv pos_ugcecq tok_mooblo
+lem_ukztbi pos_fvqrwt tok_fvekhb
+
+lem_xyjecp pos_bngwhd tok_aahvsy
+lem_gqtips pos_ujjrcw tok_aeeagn
+lem_uzmhzh pos_sdrpev tok_yasdcw
+lem_aifxsa pos_pgzaro tok_dvrxuz
+lem_kdgtlg pos_nmbpkv tok_qcmfhq
+lem_akitwl pos_iqemah tok_rupfks
+lem_zrnmed pos_jrztme tok_wjvtlr
+lem_vrbkpt pos_zrdrat tok_lyqree
+lem_prncga pos_gprvez tok_hurmbi
+lem_dlmgek pos_vkyrco tok_cxrptf
+
+lem_bxskvg pos_zmnhqk tok_tzdike
+lem_lsnlte pos_coqune tok_zlbmnz
+lem_wtdaxy pos_cmlzid tok_epvwwo
+lem_qnvbib pos_qgrkky tok_fzesoz
+lem_ccivwi pos_pmhcrq tok_wnrebr
+lem_pykjmx pos_avqjes tok_qugjqm
+lem_illjpl pos_ghcuqm tok_swogfp
+lem_tdrfaa pos_sflkoy tok_kjkztg
+lem_qwrgrd pos_nolwqs tok_jiihqr
+lem_gfccmy pos_vuyicj tok_ocncea
+
+lem_tjgrnl pos_nqepwm tok_rksrmn
+lem_jmtheh pos_ajlnmn tok_vhmszx
+lem_byazwu pos_cduxqf tok_uwrpcg
+lem_byikfk pos_gvgrag tok_juefzj
+lem_kgrpzq pos_ahluev tok_stlvkd
+lem_nyukqs pos_qyzolu tok_tstswn
+lem_jomvhs pos_cdcarb tok_fxqfot
+lem_unsenp pos_bflezv tok_wwyjkc
+lem_zimsxz pos_xurwzl tok_rrjmkb
+lem_qvjald pos_qdbrdt tok_cxzvzm
+
+lem_ewxsgx pos_ajgckx tok_beritv
+lem_mpxaid pos_ylzbfl tok_aodlve
+lem_fujsxz pos_ahphjs tok_bodsvu
+lem_lifzal pos_nkzgmd tok_wqfhzu
+lem_izvkug pos_cpczbr tok_eibvst
+lem_ohggnv pos_ojhwiw tok_puhifc
+lem_cyygzr pos_yumcfe tok_kdxdam
+lem_gtbluo pos_qbvlct tok_djmsjc
+lem_amjqcn pos_qsexly tok_rljvrs
+lem_vshriq pos_kawwnk tok_sivtbf
+
+lem_dfqhkf pos_qjjksy tok_dcjjtj
+lem_ivdeai pos_ablhhd tok_lbnksw
+lem_eykqmj pos_achgqb tok_errdti
+lem_cgfxgj pos_yeicru tok_gmcttt
+lem_hfchhv pos_bbmgsg tok_ihxtuc
+lem_pwcjmf pos_cktnuz tok_sgzbur
+lem_vjrcwc pos_eliuud tok_opeihi
+lem_vfeury pos_oabjwv tok_vtxnuk
+lem_ojptmn pos_nmcckm tok_ieeqii
+lem_zityox pos_plcaie tok_hjhqle
+
+lem_eicsff pos_uysimb tok_rhbfqq
+lem_swawvt pos_zwcgjj tok_mpfzmc
+lem_hxdyjy pos_uuklje tok_vzwcez
+lem_qtugzm pos_rtagrp tok_wewuqn
+lem_dgpeht pos_kowrmu tok_kdrmep
+lem_ywgaxh pos_lrhasu tok_oyxara
+lem_tjtpyv pos_qkftmv tok_rfocys
+lem_yodrrt pos_kqfomy tok_hikxzm
+lem_xqsoty pos_usrjwj tok_nzsrey
+lem_vahsmi pos_jdhxzz tok_hzssny
+
+lem_sxqrgo pos_utfkbp tok_pgmuak
+lem_upulmg pos_ogkzfn tok_vpwuna
+lem_qxxzra pos_kzofah tok_mojdrd
+lem_stteyb pos_whjryn tok_ahhjhx
+lem_imlvws pos_xkjjyv tok_tpeqhp
+lem_mkqbql pos_srhtfo tok_tjejdh
+lem_srdtbv pos_chloqp tok_ssnijc
+lem_uswkyg pos_vdicyb tok_xqjhmz
+lem_krlhld pos_xgtbww tok_crcazr
+lem_rgustp pos_hpmkfb tok_kbrimt
+
+lem_yztwhb pos_hxwygf tok_vmlqny
+lem_qsljtl pos_okueau tok_uscadu
+lem_sqcljt pos_yoeuuo tok_bufmmk
+lem_vwdkvz pos_zvlxdc tok_iyfjxz
+lem_fhyxld pos_iwjeqh tok_pxgswn
+lem_nmyyde pos_rtgmic tok_rpznih
+lem_nmryce pos_lqrnqi tok_oyzlal
+lem_veooxl pos_dgbfni tok_xspltr
+lem_yqdboa pos_wlbwhs tok_xykzxk
+lem_egjpoj pos_poklar tok_iaongs
+
+lem_oqhmht pos_kokwdy tok_eysfod
+lem_niayce pos_qtekzp tok_vzxvlo
+lem_wpeowt pos_odrqay tok_jxcrtx
+lem_ojjzcg pos_apfqio tok_izeatu
+lem_hgumyt pos_fdqqzt tok_vjshjc
+lem_qqxfci pos_tymmed tok_pluasb
+lem_vjzegn pos_uljqtm tok_iaxkyv
+lem_bsrgfs pos_wtqduj tok_krqfsu
+lem_ioolrt pos_tafxue tok_egcrqb
+lem_wlpoqm pos_mvcxyw tok_qhiusm
+
+lem_ijbaop pos_splybr tok_mqyixf
+lem_laujzm pos_bzzmmf tok_faczbs
+lem_wyagin pos_aztpls tok_jlxjra
+lem_auhaiu pos_uiuvju tok_bkecxr
+lem_dwkxfr pos_eqvcxg tok_ynnpma
+lem_puxkvv pos_vxdutd tok_tjwtec
+lem_ldvyty pos_msmwey tok_kmbshh
+lem_iuqfhr pos_kumdhv tok_mlmqvt
+lem_hbrwyy pos_zixmcl tok_zdgdce
+lem_ilzezj pos_twxphc tok_yjnjwz
+
diff --git a/tests/test_data/generate.py b/tests/test_data/generate.py
index 0047fc3..a79086e 100644
--- a/tests/test_data/generate.py
+++ b/tests/test_data/generate.py
@@ -1,6 +1,20 @@
import random
import string
+# Change one variable to regenerate a test file
+generate_implicit = False
+generate_clitics = False
+generate_roman = False
+generate_skip = False
+generate_replacement = False
+generate_disambiguation = False
+generate_file = False
+generate_empty_line = False
+generate_sentence = False
+generate_window = False
+generate_capitalize = False
+generate_generic = True
+
ROMAN_NUMERAL_TABLE = [
("M", 1000), ("CM", 900), ("D", 500),
("CD", 400), ("C", 100), ("XC", 90),
@@ -27,9 +41,9 @@ def randomString(start: str = "", length: int = 10) -> str:
letters = string.ascii_lowercase
return start+''.join(random.choice(letters) for i in range(length-len(start)))
-# I am using different numbers of sequence size to be sure tests are not being right on another corpus
-if False: # Make false to remove running
+# Each corpus uses a different number of rows so that a test cannot pass by accidentally reading another corpus
+if generate_window: # Make false to remove running
with open("window.tsv", "w") as f:
f.write("lem\tpos\ttok\n")
for lines in range(200):
@@ -39,7 +53,7 @@ def randomString(start: str = "", length: int = 10) -> str:
rand3=randomString(start="tok_", length=10)
))
-if False:
+if generate_sentence:
with open("sentence.tsv", "w") as f:
f.write("lem\tpos\ttok\n")
for lines in range(190):
@@ -52,7 +66,8 @@ def randomString(start: str = "", length: int = 10) -> str:
rand3=randomString(start="tok_", length=10)
))
-if False:
+
+if generate_empty_line:
with open("empty_line.tsv", "w") as f:
f.write("lem\tpos\ttok\n")
for lines in range(180):
@@ -64,7 +79,7 @@ def randomString(start: str = "", length: int = 10) -> str:
if (lines + 1) % 18 == 0:
f.write("\n")
-if False:
+if generate_file:
with open("file.tsv", "w") as f:
f.write("lem\tpos\ttok\n")
for lines in range(170):
@@ -74,7 +89,7 @@ def randomString(start: str = "", length: int = 10) -> str:
rand3=randomString(start="tok_", length=10)
))
-if False:
+if generate_implicit:
with open("implicit.tsv", "w") as f:
for lines in range(160):
f.write("{rand1}\t{rand2}\t{rand3}\n".format(
@@ -83,7 +98,8 @@ def randomString(start: str = "", length: int = 10) -> str:
rand3=randomString(start="tok_", length=10)
))
-if False:
+
+if generate_disambiguation:
with open("disambiguation.tsv", "w") as f:
for lines in range(150):
f.write("{rand1}\t{rand2}\t{rand3}\n".format(
@@ -92,7 +108,7 @@ def randomString(start: str = "", length: int = 10) -> str:
rand3=randomString(start="tok_", length=10)+str(random.randint(0, 10))
))
-if False:
+if generate_replacement:
at_least_one_zero = False
at_least_one_one = False
with open("replacement.tsv", "w") as f:
@@ -118,7 +134,7 @@ def randomString(start: str = "", length: int = 10) -> str:
rand3=randomString(start="tok_", length=10)
))
-if False:
+if generate_skip:
at_least_one_zero = False
at_least_one_one = False
with open("skip.tsv", "w") as f:
@@ -146,7 +162,7 @@ def randomString(start: str = "", length: int = 10) -> str:
f.write("\n")
-if False:
+if generate_roman:
at_least_one_zero = False
at_least_one_one = False
with open("roman_numbers.tsv", "w") as f:
@@ -168,8 +184,7 @@ def randomString(start: str = "", length: int = 10) -> str:
if (lines + 1) % 10 == 0: # Window of 10...
f.write("\n")
-
-if True:
+if generate_clitics:
with open("clitics.tsv", "w") as f:
for lines in range(300):
if (lines + 1) % 5 == 0:
@@ -181,5 +196,29 @@ def randomString(start: str = "", length: int = 10) -> str:
rand3=randomString(start="tok_", length=10)
))
- if (lines + 1) % 10 == 0: # Window of 10...
+ if (lines + 1) % 10 == 0: # Window of 10...
+ f.write("\n")
+
+if generate_capitalize:
+ with open("capitalize.tsv", "w") as f:
+ for lines in range(500):
+ f.write("{rand1}\t{rand2}\t{rand3}\n".format(
+ rand1=randomString(start="lem_", length=10),
+ rand2=randomString(start="pos_", length=10),
+ rand3=randomString(start="tok_", length=10)
+ ))
+
+ if (lines + 1) % 10 == 0: # Window of 10...
+ f.write("\n")
+
+if generate_generic:
+ with open("generic.tsv", "w") as f:
+ for lines in range(500):
+ f.write("{rand1}\t{rand2}\t{rand3}\n".format(
+ rand1=randomString(start="lem_", length=10),
+ rand2=randomString(start="pos_", length=10),
+ rand3=randomString(start="tok_", length=10)
+ ))
+
+ if (lines + 1) % 10 == 0: # Window of 10...
f.write("\n")
diff --git a/tests/test_data/generic.tsv b/tests/test_data/generic.tsv
new file mode 100644
index 0000000..322562d
--- /dev/null
+++ b/tests/test_data/generic.tsv
@@ -0,0 +1,550 @@
+lem_pnglqu pos_asmebe tok_psaspz
+lem_mebgbd pos_zvlbfv tok_ezxvuu
+lem_xzqkny pos_mavawc tok_bxenxg
+lem_ghngfg pos_pxvlyl tok_lvodeu
+lem_ylhvck pos_iltuul tok_mmqtsd
+lem_wybloq pos_jcxlix tok_gdukqy
+lem_bghzga pos_jcgdsc tok_bsormp
+lem_fxthyj pos_nibtuj tok_lnvnfl
+lem_axspcv pos_btxixe tok_ppdtve
+lem_xewmht pos_kvybim tok_fuhesp
+
+lem_myzkgo pos_lxynbd tok_cpjqfb
+lem_ojtejj pos_guuaor tok_fvrhab
+lem_mmukhu pos_mqfvnc tok_fnwube
+lem_tnsgmx pos_anxfmz tok_ktfmas
+lem_tmluif pos_agcygo tok_gtksyi
+lem_iiudev pos_wsuupd tok_fnftzx
+lem_tsytjo pos_yecchj tok_gzaqyb
+lem_kbrglb pos_jrsyty tok_lhonek
+lem_dbvtjj pos_xuqywg tok_izhkrh
+lem_ehymrj pos_jconao tok_gimfgp
+
+lem_qoxvdx pos_xelnad tok_fbzqzj
+lem_yvdvaj pos_xfysrm tok_lfcgll
+lem_ygqkqo pos_pmbbbm tok_tnxqzf
+lem_lgcvjs pos_wkecbv tok_ecwrkb
+lem_wlwibu pos_mxabba tok_howeal
+lem_qezoyp pos_ehsvhq tok_aehlox
+lem_gmgvbl pos_xjywci tok_ndleah
+lem_dukmse pos_boqhzt tok_jyqpha
+lem_yncpeu pos_qmdcur tok_luvxba
+lem_wccpzf pos_dwuage tok_gsfpvu
+
+lem_zgaeuh pos_iebabk tok_vjpipo
+lem_egiwli pos_pneaav tok_lwncsp
+lem_etdddt pos_dfslho tok_tcvxup
+lem_piujkq pos_jbtkzg tok_ocawyd
+lem_nvowjp pos_zzflcu tok_mgwyig
+lem_dzbokl pos_bgtmyq tok_btcxaa
+lem_uvunqw pos_mjgnwg tok_rmjtta
+lem_iiggtz pos_mpxicz tok_bziitq
+lem_oqkxln pos_stzzex tok_epkfvy
+lem_vaglxh pos_jqljvu tok_jhcyqp
+
+lem_zrdaut pos_azcbgd tok_eqspfr
+lem_pahaia pos_islakf tok_vadzpo
+lem_nrgboa pos_ljxzgm tok_dxbdmx
+lem_rdipvv pos_mdnylf tok_cuubbc
+lem_oshqgz pos_xgwkmi tok_gksedp
+lem_aeylbl pos_vntgwy tok_ujfpcr
+lem_kvzcce pos_cliawz tok_lsynrz
+lem_jrwgfn pos_clhgqa tok_ilruyc
+lem_mvjszq pos_agbbcd tok_clcciy
+lem_aupvma pos_wwlmdh tok_xoalkn
+
+lem_xnlhfr pos_rhevie tok_zekaok
+lem_dhehjw pos_aurleh tok_nmakyf
+lem_ankaqj pos_esuowx tok_rjltim
+lem_vyugto pos_ztvzxd tok_nkzhsc
+lem_uetvbf pos_dixkyh tok_gpxwdy
+lem_wgjqvl pos_fbijwc tok_uhpgmb
+lem_lbmsla pos_qzipnp tok_qwqcyy
+lem_rdenyx pos_smgvkc tok_oojuer
+lem_hiwtug pos_osgiia tok_kgrbky
+lem_xvlgbu pos_gbrncm tok_rhcbnf
+
+lem_rrijyv pos_hhwgaj tok_hglsmo
+lem_mhfrtc pos_gseabj tok_gvbsrc
+lem_uiljox pos_grhpem tok_fkrfzf
+lem_jmrrud pos_naxznl tok_suyxme
+lem_whypap pos_nobacl tok_cpbqgd
+lem_msajlz pos_pyoxmm tok_wfjhwj
+lem_ettsyw pos_gbmuzc tok_lapskd
+lem_gujqny pos_hcrckt tok_gekwdi
+lem_mpagjz pos_mbpjcl tok_dfwbdx
+lem_ldreti pos_lkiypk tok_kvcurb
+
+lem_vnypun pos_gkjdef tok_zqopre
+lem_yhzsnm pos_lowabu tok_ifurtd
+lem_svgolo pos_yemjht tok_mozohp
+lem_hnncpk pos_jnpizv tok_uujfmo
+lem_pbxxgy pos_rwmksa tok_tbgelf
+lem_alxytf pos_nmrber tok_trwajw
+lem_rcmzct pos_dnbxuc tok_zjgbht
+lem_slzeww pos_ueibpj tok_spnicz
+lem_oatvra pos_jybjfq tok_anaayc
+lem_zdpeyz pos_cebrvz tok_jthzal
+
+lem_bntofm pos_jngleg tok_qdpsqo
+lem_fmrnyq pos_tveybk tok_qywgjg
+lem_jfzdao pos_efkukb tok_terush
+lem_msnztv pos_avckvf tok_rpdgdm
+lem_vlqlqf pos_vcmecr tok_psjooy
+lem_fjbpev pos_wtdmbr tok_duslbg
+lem_wieira pos_ydinmw tok_gxpvdk
+lem_cghfnf pos_nlgplz tok_yhqkbd
+lem_rludli pos_awqmwk tok_rvkvog
+lem_msfyup pos_niipwh tok_xysaww
+
+lem_xvlzpl pos_npdsdr tok_hozwpw
+lem_ezwduk pos_bopbho tok_ojphac
+lem_gyjwdv pos_bulihx tok_ybwnla
+lem_fxfuvg pos_gwrsht tok_iufari
+lem_hvhhvu pos_bfvbqu tok_wrugav
+lem_virrik pos_bfusid tok_osubgh
+lem_igtllm pos_zsmlgd tok_vrguwy
+lem_yvxfyo pos_foicpg tok_omidwp
+lem_onshyh pos_huzxvu tok_xoqpol
+lem_twcwrd pos_xmmyii tok_apeqno
+
+lem_hwlcbf pos_wvxtpu tok_cwstqs
+lem_idwhly pos_uxqvdq tok_wadnqq
+lem_ghbdfe pos_tdbmiz tok_hcxois
+lem_bfsrey pos_yatuxi tok_uoyzjy
+lem_nexzlq pos_rnxyvz tok_xvruyn
+lem_hobbrr pos_piiaix tok_uniyqj
+lem_lpvloc pos_axnzeh tok_zatdex
+lem_wczgrh pos_upuwir tok_nqbrli
+lem_gosogu pos_kkbarj tok_xizwwg
+lem_oeprnt pos_olivpj tok_clepiu
+
+lem_omqxtp pos_kehvtp tok_hrerte
+lem_pnowgn pos_etrjvq tok_mlwjjy
+lem_zshfka pos_bjqaoy tok_kjwvbg
+lem_rjxsbn pos_pwonvp tok_ssslob
+lem_fwxdnl pos_kujnyj tok_gcqhia
+lem_rdaszr pos_psouih tok_gjshkw
+lem_qczbdv pos_lyfkes tok_judfss
+lem_usczau pos_buqpls tok_pkjdmx
+lem_uytagi pos_uqntms tok_lrjnln
+lem_rwgfjv pos_peezza tok_dukgtz
+
+lem_lyyhwq pos_iwhhxe tok_voswhx
+lem_sewohc pos_eszmdr tok_mzgiqc
+lem_fzvavh pos_xfnjdb tok_ytuboz
+lem_ujyrdr pos_hmmeaw tok_emmmzw
+lem_qfdkqh pos_oakuls tok_zjaurs
+lem_iglmrn pos_moqchd tok_uebens
+lem_grrils pos_xnvdit tok_kxkccr
+lem_rahztc pos_kvfbjb tok_klpafn
+lem_oujgyi pos_iskgln tok_vcxpps
+lem_mdrbdm pos_icrxxn tok_astqym
+
+lem_zvymaz pos_nqchpv tok_bdxugf
+lem_doafwx pos_eeensz tok_ijygwa
+lem_ecnrtu pos_kdivkj tok_ewgxfw
+lem_vlzbui pos_ykyihk tok_logkvy
+lem_wljjnn pos_ejrqux tok_qbswct
+lem_jqktac pos_mvpkvz tok_rfyqtq
+lem_miucap pos_emmvel tok_blubai
+lem_qkspdn pos_lagrww tok_mfacah
+lem_pyyoag pos_yixzbw tok_wsqjlc
+lem_fahftc pos_yebico tok_ohzfke
+
+lem_vbnhkb pos_mkdjjo tok_sifeia
+lem_tfythp pos_wxxaqg tok_bhvhgg
+lem_qhekoq pos_hnqqgd tok_lkfkvl
+lem_juynha pos_zwyxbj tok_vmewqc
+lem_xhvadu pos_tsubsg tok_ejaswn
+lem_nwacfk pos_avqxhm tok_dxzihi
+lem_mlkidd pos_xtnfei tok_tvurhr
+lem_mjlcuz pos_fsxxuf tok_tvxcyq
+lem_elurab pos_fkeuji tok_kodcqh
+lem_jhtvfa pos_riwebj tok_wyfllm
+
+lem_ehyuab pos_azurxz tok_lzhzlr
+lem_hesbez pos_cuhpuz tok_ovolhm
+lem_vpwjcg pos_ejgsmo tok_yabopn
+lem_bdcgse pos_bzuojo tok_qdxkdx
+lem_sajlgi pos_uqxyji tok_ffmpgu
+lem_atdhzb pos_renqzu tok_plhuik
+lem_wycrgt pos_ptrfbb tok_fcyifc
+lem_idvbho pos_ucnwzy tok_qpqnlw
+lem_ghsmbz pos_fatumo tok_kbffal
+lem_jcynpn pos_yltenm tok_kczrkz
+
+lem_dvweer pos_krdjep tok_yguwcv
+lem_mudjnq pos_uviakt tok_murppk
+lem_wjukub pos_yggtkn tok_zbdcjc
+lem_znvoew pos_zgfwer tok_vqpnit
+lem_gdaykn pos_abktqv tok_oytwun
+lem_jukrnn pos_pkuqyu tok_jsyslm
+lem_tzlegg pos_kmoxfo tok_jnqtpq
+lem_ffpebz pos_cxabab tok_olouxz
+lem_ljlbis pos_gvewxv tok_hvhgut
+lem_yvhmck pos_bmtppc tok_upbyeu
+
+lem_gtpqpp pos_vgukly tok_bkxnlu
+lem_wkpmqw pos_klxemb tok_cgkcqs
+lem_mprrnf pos_zwwksf tok_lldawn
+lem_gjstsp pos_lyktlo tok_eqdswb
+lem_iawrhv pos_jnxifc tok_jcoyys
+lem_gikbbb pos_hcfykc tok_jhbaaj
+lem_rworir pos_menbpu tok_lpthvx
+lem_idxscs pos_msnpth tok_hjydha
+lem_gxkksx pos_hgwwvg tok_ambhzx
+lem_wrurbc pos_deytsp tok_bsdcgw
+
+lem_ciifzj pos_rwklqg tok_annfjf
+lem_hxlapr pos_ytijwd tok_uwfoew
+lem_dakvmb pos_dueirt tok_lkiyjy
+lem_orwffp pos_xvpdsd tok_ymjtmn
+lem_jowzgo pos_fjxiec tok_mtndsk
+lem_ccsjxk pos_mnynch tok_icfdkp
+lem_bckheg pos_iuwwfb tok_gbsnmi
+lem_qnrtpk pos_nkaibd tok_zptrjo
+lem_ylyodz pos_qusyxo tok_rzoafu
+lem_huscow pos_mkjxjo tok_hvpcub
+
+lem_isigco pos_abwiiz tok_zcjhzt
+lem_gctpta pos_vlwfme tok_rciavv
+lem_rhutpw pos_dqbkuz tok_epwnzx
+lem_nokcra pos_bqhndh tok_gawuji
+lem_excjus pos_vjznov tok_wfysra
+lem_vztfoz pos_qnpmdl tok_darejq
+lem_nuxexn pos_kiieeg tok_gklnsq
+lem_tqzdhk pos_iynvos tok_tfvtlx
+lem_opwkfs pos_rrpisf tok_nfvfru
+lem_pyqpjx pos_txlaci tok_qbuzxu
+
+lem_jlhbyd pos_ngsqnp tok_zebvyu
+lem_fskqwz pos_ueoqsk tok_lincdm
+lem_degete pos_lpojkc tok_twqmod
+lem_agmtcx pos_twimub tok_nfiavc
+lem_cjwuzk pos_oxtsip tok_bbeimy
+lem_fyawqu pos_kyewnd tok_zolwky
+lem_bysedi pos_glratz tok_gwayzf
+lem_gnlioz pos_iwuvdv tok_onsdpz
+lem_cpydbn pos_mibnmt tok_ebxmia
+lem_bcfyxt pos_pxcgsc tok_sfjfzn
+
+lem_uytpgs pos_yoikuk tok_ctkvjv
+lem_iecopt pos_wjfxqb tok_egrvxp
+lem_uuxivj pos_bzxynj tok_vmgvrq
+lem_vwiknd pos_wcphdo tok_puzlfa
+lem_sikxax pos_jbuzzz tok_lsomuh
+lem_hpexss pos_noxjaa tok_ekffht
+lem_cuvsiw pos_hfknjd tok_yeppee
+lem_jzxaay pos_zhsbsm tok_ulcoos
+lem_ktchac pos_ljzhrp tok_xsfahv
+lem_nwbqio pos_kunfjr tok_twpklh
+
+lem_algfvf pos_ooxfml tok_ktfqvq
+lem_qmdtds pos_eaautl tok_dbsdhv
+lem_khowpj pos_jvoqzc tok_fzgxgz
+lem_wkpwee pos_ykrfap tok_plwcmc
+lem_vytmpx pos_jyysyv tok_thcmet
+lem_uotpzs pos_ywwhld tok_mcfcnl
+lem_eaofao pos_lakpdh tok_vorgrd
+lem_rgcrnu pos_ppzwsq tok_fxqopo
+lem_dipjtd pos_ncihjq tok_ltkzwj
+lem_vjjtnl pos_yggydp tok_nqxhll
+
+lem_ydzust pos_nljkuc tok_amqnhf
+lem_tyykgb pos_rqbuvy tok_zedoun
+lem_sbanpt pos_ugeqel tok_ggerie
+lem_fhrxzr pos_zvbfxl tok_nqhlhi
+lem_uzmbex pos_pvonjh tok_opyqza
+lem_brjisl pos_nzzgco tok_zntdob
+lem_aitmgy pos_twvcov tok_zzlpky
+lem_dftehu pos_maytuh tok_ewfbks
+lem_iyipca pos_yjoihd tok_cizbri
+lem_crwgij pos_kjcdrc tok_pcmcaf
+
+lem_zkgula pos_nqngve tok_qljpsw
+lem_xgvola pos_ggnrjm tok_txuwre
+lem_lmvodr pos_cateld tok_piyxeo
+lem_hkbfmb pos_hbladr tok_kevjhg
+lem_tpvral pos_apwgxt tok_tnpipv
+lem_dplpey pos_labudw tok_jlxdil
+lem_wsjnft pos_yrsptw tok_zmijcd
+lem_guvspa pos_tkpzap tok_xkzfet
+lem_udchdk pos_jxbyjy tok_jyubyl
+lem_woubce pos_ygdkii tok_naxuga
+
+lem_jxhmdx pos_pecvlq tok_lctaho
+lem_soswkd pos_xxzgwj tok_zxfrkj
+lem_lmbzbz pos_woeygv tok_egvriu
+lem_nnpdvu pos_tdzlzl tok_ihvkzw
+lem_lzjtpd pos_cbwxri tok_ktupgh
+lem_poxgkh pos_pgiuvq tok_jtdwrs
+lem_tdtcgu pos_xrcwcn tok_imzzpy
+lem_zsvhuq pos_tuxjki tok_onneiv
+lem_cgawuf pos_aswqfa tok_hpdanq
+lem_dkjtaw pos_bufaos tok_bmejgh
+
+lem_lndkha pos_bjxvvx tok_vjpywh
+lem_cgwcqb pos_pqzsni tok_arhatp
+lem_rxsvoc pos_eevidh tok_kasnfm
+lem_iuiyhm pos_dehgbs tok_zvrwvr
+lem_alnoro pos_xbiwuh tok_lmwuex
+lem_ndpkrs pos_evinyq tok_gozthy
+lem_cfloip pos_iykkfq tok_vfggug
+lem_puwsps pos_cavuxb tok_ojlpen
+lem_cvbazd pos_drtwjs tok_tdpfhx
+lem_xaadjv pos_xkmxfh tok_knvudq
+
+lem_pttysa pos_xfmfja tok_erdylq
+lem_tmhaob pos_eebvst tok_yocmuk
+lem_iqwvzf pos_tbnxrw tok_cwnfsq
+lem_ksgmdv pos_dphlbz tok_wgsina
+lem_luhtpp pos_obecbu tok_alswtw
+lem_avbqso pos_muzxbj tok_narjmi
+lem_vtripz pos_rmwpdt tok_ccncei
+lem_qiisvu pos_zobxzd tok_jvitkg
+lem_thuxyf pos_ziagxk tok_kiwher
+lem_yunaef pos_bkgvxi tok_uwdpvh
+
+lem_hkvyaf pos_ilupbo tok_ffepqv
+lem_dtxwau pos_qmieqe tok_rpwihf
+lem_zfqxvf pos_pfckzl tok_byiame
+lem_nqhasq pos_wmncaz tok_eiyncu
+lem_osdxsa pos_ivzmal tok_xakyhq
+lem_nbhrce pos_annonw tok_lwivqn
+lem_dtjuky pos_xwdjjx tok_ududim
+lem_tglzqo pos_pidjzx tok_lvymnw
+lem_yhcpqk pos_bbhgdh tok_blyybp
+lem_gdkhkf pos_arsxhu tok_sprifv
+
+lem_udognq pos_ssvewy tok_pgnozm
+lem_mkiuvw pos_mgpdhc tok_mmshwd
+lem_fyubbp pos_johwvl tok_ndfvgs
+lem_srkpmu pos_kciexw tok_ybiiof
+lem_thuvck pos_disjgx tok_glmmva
+lem_arpojd pos_lqhzix tok_mpwayz
+lem_pajszf pos_plwzrp tok_mfvehu
+lem_fiobxw pos_bmjtzg tok_dstmev
+lem_hwrxka pos_pmfuin tok_ucysgx
+lem_bvyhch pos_ueiehv tok_ubagoz
+
+lem_udgjbg pos_bppiow tok_sxvxcq
+lem_umkvig pos_wqgkvj tok_zgudxp
+lem_ahqwdc pos_jtxlag tok_pbahrd
+lem_qphmfc pos_bugjkt tok_gssjyy
+lem_svklvx pos_roqynz tok_tdwhnk
+lem_qmzrjk pos_wsztno tok_apttqt
+lem_deiapn pos_cjzjph tok_zvejtl
+lem_fhwkce pos_suwufb tok_hxinbh
+lem_imtorp pos_kuleta tok_icovji
+lem_weovxt pos_vdxwzf tok_gtcoay
+
+lem_qyglsd pos_mdmrnr tok_hjhtdd
+lem_jyygdn pos_ryvyfq tok_fupiof
+lem_okdbuv pos_pqmugs tok_xrnwfs
+lem_hwgsim pos_edglig tok_thojbh
+lem_ctnxbc pos_ilplpu tok_okedji
+lem_ldhlea pos_syjfsv tok_cssiqu
+lem_czqjry pos_qoxlhi tok_glfpbn
+lem_ibblvg pos_mlapuz tok_icmypi
+lem_whofjg pos_qzxezo tok_bdtqyr
+lem_ksvdnm pos_ianhmx tok_byqpnq
+
+lem_himgmt pos_vsfytf tok_gtoyzp
+lem_yzbvgk pos_hactyl tok_mihsrx
+lem_dxyxpo pos_vqrmcs tok_afdnmp
+lem_zygnah pos_hguzgi tok_gybiwc
+lem_orlilu pos_voyadg tok_bbahtj
+lem_jihriy pos_zzjdfd tok_uhbjjl
+lem_jxxbqe pos_xmwjpi tok_exipxk
+lem_cwclce pos_ttlmmc tok_qahigp
+lem_yqingd pos_khxetb tok_yseugj
+lem_fdqudy pos_vtqqka tok_bltzio
+
+lem_ufvglp pos_usbhln tok_invhmt
+lem_ucbcws pos_juxwyh tok_xvidto
+lem_jalnrv pos_kehdve tok_xlrwvm
+lem_wupgkx pos_ytjocr tok_mhcpcu
+lem_ysaiwb pos_zrbkmf tok_dhbygu
+lem_gnheia pos_sctxya tok_zvbkea
+lem_llgqzk pos_xpprym tok_enloag
+lem_rhztyg pos_rdgqdo tok_azjqdx
+lem_ibddtk pos_nvaxik tok_lclhya
+lem_tpvovd pos_yhzvwj tok_icyzhn
+
+lem_nswoot pos_bwvdze tok_jtzcuc
+lem_tlvsms pos_anwnhu tok_gsfzrw
+lem_knrvmq pos_nldxjb tok_tziphw
+lem_khgyeh pos_oikzss tok_ayhvin
+lem_mkauym pos_enawim tok_tsmsus
+lem_oxtqxo pos_wlosqk tok_blffyh
+lem_udbsej pos_alekho tok_axfaat
+lem_kgjfxi pos_cbsgst tok_zflmoz
+lem_ohwipy pos_navjcp tok_rodrfj
+lem_miketf pos_tpkwtu tok_lwgjsx
+
+lem_jabgep pos_rdpekf tok_plrhxh
+lem_epimnv pos_wjilvk tok_hpuobc
+lem_anvpfz pos_srwwtk tok_otyntq
+lem_gnoqea pos_msvort tok_ctwezz
+lem_erqzii pos_enojdo tok_opfqdy
+lem_tpwncs pos_paskoj tok_cesuln
+lem_efzrwp pos_tosvwq tok_sqzwyc
+lem_uqecmi pos_psgrbf tok_uytozr
+lem_oecpmd pos_lopuif tok_grtwkh
+lem_gpbnfh pos_rmsdkd tok_imwchk
+
+lem_hdhfho pos_qpkakd tok_qiijyh
+lem_rgtuow pos_dmyxdb tok_mbpktj
+lem_xwuxrs pos_qbdcig tok_lhzyau
+lem_rdhbzp pos_upvmfn tok_bfzjnn
+lem_thycqp pos_sxvtex tok_jgfpgb
+lem_nwuipp pos_gojwjp tok_ojmdev
+lem_pclorh pos_wycfox tok_tdgvwl
+lem_hhrpjx pos_vpofjw tok_svboji
+lem_zhiugr pos_ssbdgn tok_jzfhje
+lem_dgxepp pos_aqvvkq tok_aophlm
+
+lem_djcgnc pos_umfujg tok_jvctze
+lem_ivtuib pos_gccaun tok_fyvvsa
+lem_xrldqk pos_tifgzp tok_qemkvl
+lem_fgcwen pos_updhex tok_gqwrty
+lem_jfjktp pos_smlgbr tok_supebj
+lem_qkrnfh pos_klflpm tok_cgqbzr
+lem_hqjwgr pos_cscpfc tok_tmcjwt
+lem_zekjfs pos_xioike tok_jofgaj
+lem_lbdjep pos_jkawyi tok_xbkbuv
+lem_qccvpl pos_mqjgkg tok_prxlbr
+
+lem_lqbcxe pos_marmcp tok_mvmxoj
+lem_hhfrfe pos_upowxv tok_btqcjb
+lem_kxnbox pos_vlfamx tok_gezizk
+lem_yjteio pos_hgdikt tok_cczjyb
+lem_pyrppz pos_ziuyfg tok_kpyiia
+lem_xxatad pos_abteyo tok_ahluuw
+lem_rnqkiw pos_ilnrbf tok_mmghro
+lem_btfstz pos_bisgck tok_oqlppr
+lem_idxuwc pos_xdmidv tok_axlwpx
+lem_bytmdu pos_nltcuk tok_scnzqo
+
+lem_epyhkc pos_iixjck tok_eqxaco
+lem_nozaxl pos_uwxufj tok_nffdpa
+lem_ajycum pos_cariev tok_ptdrop
+lem_huejvt pos_ywpqdr tok_zsqdxa
+lem_kdpogh pos_xiwzrj tok_cqvpdj
+lem_qnmbsb pos_xykqzu tok_jxriow
+lem_oendok pos_emkvie tok_spxunf
+lem_xatswy pos_gvdzfv tok_paisus
+lem_yhjped pos_dveoqj tok_lmogyx
+lem_rjcuzx pos_cgjnsm tok_kkmfce
+
+lem_dpkafn pos_iisyjw tok_dbdgpw
+lem_oeigbc pos_mhfybc tok_edswto
+lem_cewjzn pos_cdympu tok_dydedt
+lem_clclzd pos_ywtnbn tok_lwhddh
+lem_zuxviz pos_xaexkk tok_oiekam
+lem_yadubx pos_afjngs tok_keyojz
+lem_qwrqfx pos_xfonvq tok_ajzkut
+lem_rpdtbc pos_agjxmq tok_bbrfdk
+lem_vyvsec pos_mygmil tok_hdllgn
+lem_otqfxh pos_dqnkst tok_jkayri
+
+lem_tmfzdg pos_awwegn tok_hqgbnu
+lem_ogmywj pos_tdzfkg tok_kabnzk
+lem_sdehke pos_xlysst tok_zmzjes
+lem_yggffr pos_aihjwe tok_bkyzyd
+lem_lnpsgh pos_pukddo tok_taigpe
+lem_tqpkkq pos_qobykg tok_ifpuye
+lem_frawzk pos_qgztxb tok_wbbxgh
+lem_qgmyqi pos_uaegvo tok_svjheg
+lem_yljbao pos_cpnego tok_zjurjx
+lem_nondda pos_ogapcg tok_uwnumi
+
+lem_izzxzv pos_ecnnyc tok_xoryrx
+lem_xspbqn pos_duhumn tok_ilopbl
+lem_eosiwr pos_udotyh tok_yxhhhj
+lem_thzxnm pos_gkbxwa tok_tpfjrw
+lem_tywvba pos_tekyht tok_jokgkd
+lem_gpnqba pos_ndzpcl tok_bmxyks
+lem_eyclfq pos_qstzwe tok_kwdged
+lem_ryxiar pos_pfurva tok_vfwhix
+lem_zbluyg pos_hqolyd tok_jdmtnh
+lem_oimpxn pos_dououy tok_mqijzw
+
+lem_qfrejv pos_diuani tok_lifrov
+lem_bwhido pos_wqnomk tok_aeewci
+lem_ofwfmp pos_kagjkt tok_lkqklj
+lem_tqyuzs pos_salzpj tok_xasuvv
+lem_imqbao pos_hbjkmj tok_ncbrvm
+lem_zyrhtv pos_hqonsn tok_tgaidu
+lem_xfsksc pos_zaoeyb tok_jvemap
+lem_cocsqq pos_iwxbbk tok_jowofp
+lem_dcnzdm pos_spcvwy tok_uhjbnq
+lem_zudmze pos_fypyww tok_umyuzz
+
+lem_ucfaba pos_fdjzot tok_zbhmdd
+lem_kbmwda pos_ophkmd tok_tuiafv
+lem_fyjlbt pos_vxjxbm tok_bedifk
+lem_wubgiz pos_snuzfx tok_guwjbi
+lem_fpduuj pos_zzgfkk tok_tqskbu
+lem_xmxwno pos_svzizv tok_khdyci
+lem_nfdqbr pos_ptzmhs tok_xphjhk
+lem_lewrig pos_dfgtjx tok_vbtvjf
+lem_ttyfwj pos_iwcucd tok_uybvvd
+lem_dthqvz pos_mwlofb tok_fmwfis
+
+lem_pqbthq pos_rvtmlk tok_jvnsnk
+lem_lzvnww pos_vpxejg tok_vukmwi
+lem_dnllbe pos_kvgegz tok_qbsvin
+lem_wnzluu pos_zqpjes tok_fzbrae
+lem_mgggxv pos_jztwzk tok_pugngw
+lem_kgkmwh pos_avytiv tok_vxxpqq
+lem_ztqbjg pos_dfntss tok_vjnmmg
+lem_vzrzhp pos_lssare tok_lfbuwl
+lem_mjsuko pos_uyvybl tok_wdwwvp
+lem_nawtkw pos_igxvzt tok_otwnbp
+
+lem_nlanfn pos_hlrigm tok_plhxfs
+lem_gqwksn pos_zcnxuy tok_fuxjiy
+lem_qzihsn pos_ejoxpr tok_whpuxl
+lem_kdosgv pos_fvzwxx tok_rkgdfb
+lem_yuiyxw pos_dmbjzv tok_vzrqqy
+lem_okiere pos_aqjfpz tok_ssvyzp
+lem_hjwthb pos_kpkaen tok_jpjemk
+lem_zhfyyx pos_vgqepe tok_chiprc
+lem_dnxibl pos_scliog tok_maaums
+lem_slgmzy pos_kpfqri tok_vaxmde
+
+lem_saxbeh pos_lzupsf tok_enlbne
+lem_ytebof pos_nzyxsk tok_seizwi
+lem_axienz pos_ejwaro tok_yoaijm
+lem_fyxwoy pos_jpovml tok_kqbuva
+lem_cqoppk pos_frghuo tok_xoyduc
+lem_dshhqe pos_zzolcy tok_bpbszd
+lem_gyirrf pos_wfztqk tok_htkzmr
+lem_kbiqrd pos_kssngk tok_ufdahd
+lem_zqdegm pos_qvhoqu tok_avllty
+lem_qlvfdy pos_ydszro tok_wsotit
+
+lem_xgtftk pos_yqdcsh tok_eytugv
+lem_zejrgy pos_ojcikn tok_utwapj
+lem_ykyiif pos_fhptrb tok_dkjucx
+lem_quepzd pos_kqszqm tok_aqumhv
+lem_zjgvew pos_xygxzf tok_nyzgtf
+lem_crtmnw pos_ppctbd tok_ybroqp
+lem_vdevvf pos_oufyms tok_ljnoye
+lem_xbprju pos_pswwsd tok_tmvzzc
+lem_fooalk pos_xdgvxe tok_taodtk
+lem_fksxtf pos_mstpaq tok_xiudaf
+
+lem_zjoakx pos_dvmacz tok_nhkgol
+lem_kgylhm pos_xuhevj tok_qwtmly
+lem_lrvbkp pos_xaxdvl tok_kfacse
+lem_sdzqrk pos_ftylmt tok_ikkucj
+lem_xtocvu pos_jgzxuw tok_zigxwf
+lem_ntpfcq pos_gagsxh tok_zlwyba
+lem_jdvitr pos_uxnniv tok_hnjwqs
+lem_vnnrfi pos_hnytho tok_afwlas
+lem_syohyi pos_hbrlqj tok_gehdre
+lem_axzlfm pos_qmvhli tok_llrjke
+
diff --git a/tests/test_postprocessing.py b/tests/test_postprocessing.py
index 47e335c..3f6e38d 100644
--- a/tests/test_postprocessing.py
+++ b/tests/test_postprocessing.py
@@ -2,6 +2,37 @@
class TestPostProcessing(_TestHelper):
+ def _general_config_write(self, postprocessing: str) -> str:
+ """Write a test configuration embedding ``postprocessing`` and return its path.
+
+ The given text is inserted verbatim at the ``{postprocessing}`` placeholder
+ of the template below. The file is always written to the same location, so
+ each call overwrites the configuration produced by the previous one.
+
+ :param postprocessing: Text substituted into the template
+ :return: Path of the generated configuration file
+ """
+ with open("./tests/test_config/generated.xml", "w") as f:
+ f.write("""
+
+
+
+
+ {postprocessing}
+
+
+
+
+
+
+
+
+
+
+""".format(postprocessing=postprocessing))
+ return "./tests/test_config/generated.xml"
+
def test_disambiguation(self):
self._dispatch(
output_dir="./tests/tests_output/",
@@ -142,10 +173,162 @@ def test_clitics(self):
self.assertTrue(line["lemma"].endswith("ηne"), "Clitic has been passed to lemma with glue")
self.assertFalse(line["token"].endswith("ηne"), "Clitic has been passed to token without glue")
tokens += 1
- print(line)
- print(tokens)
self.assertEqual(tokens, 300*0.8 * 0.8,
"There should be 80% of total tokens, and 20% of that should have been removed (2 clitics"
"every 10 words)")
self.assertEqual(clitics, 300*0.8*0.2, "There should be 2 clitics for 8 words")
+
+
+class TestCapitalize(TestPostProcessing):
+ """ Check that capitalization is dealt with correctly"""
+
+    def test_capitalize_base(self):
+        """Every chunk of the train split must start with a capitalized token."""
+        _out, _config = self._dispatch(
+            output_dir="./tests/tests_output/", train=0.8, dev=0.1, test=0.1,
+            config="./tests/test_config/capitalize.xml"
+        )
+
+        # Rebuild the chunks of the train output: a falsy row closes the current chunk
+        chunks, current, seen = [], [], 0
+        for row in self.read_file("train", "capitalize.tsv"):
+            if row:
+                current.append(row)
+                seen += 1
+                continue
+            if current:
+                chunks.append(current)
+            current = []
+        if current:
+            chunks.append(current)
+
+        starts_upper = [chunk[0]["token"][0].isupper() for chunk in chunks]
+        self.assertEqual(
+            [True] * len(chunks), starts_upper,
+            "The first word of every sentence should be capitalized"
+        )
+        self.assertEqual(seen, 500 * 0.8, "There should be 80% of total tokens")
+
+ # Test half
+
+    def test_capitalize_random(self):
+        """Half of the chunks of the train split must start with a capitalized token.
+
+        The post-processing snippet is written to a generated configuration file,
+        the corpus is dispatched, and the train output of ``generic.tsv`` is read
+        back chunk by chunk.
+        """
+        conf = self._general_config_write("""
+
+
+
+
+
+        """)
+        self._dispatch(
+            output_dir="./tests/tests_output/",
+            train=0.8,
+            dev=0.1,
+            test=0.1,
+            config=conf
+        )
+        tokens = 0
+        sentences = [[]]
+        for line in self.read_file("train", "generic.tsv"):
+            if not line:
+                # A falsy row marks the end of a chunk
+                sentences.append([])
+            else:
+                sentences[-1].append(line)
+                tokens += 1
+
+        sentences = [s for s in sentences if s]
+
+        # The assertions expect 500 * 0.8 = 400 train tokens in chunks of 10,
+        # half of which (20 chunks) start with an upper-case letter
+        half_toks = round(500 * 0.8 * 0.5)
+        half_chunks = round(half_toks / 10)
+        # Only the distribution matters, not which chunks were picked: compare sorted
+        self.assertEqual(
+            sorted([True] * half_chunks + [False] * half_chunks),
+            sorted([s[0]["token"][0].isupper() for s in sentences]),
+            "Half of the sentences should start with a capitalized word"
+        )
+
+        self.assertEqual(tokens, 500 * 0.8, "There should be 80% of total tokens")
+
+    def test_random_caps(self):
+        """30% of the train tokens must start with an upper-case letter.
+
+        Also checks that not every chunk starts with a capitalized token, which
+        is what would happen if only first words were capitalized.
+        """
+        conf = self._general_config_write("""
+
+
+
+
+
+        """)
+
+        self._dispatch(
+            output_dir="./tests/tests_output/",
+            train=0.8,
+            dev=0.1,
+            test=0.1,
+            config=conf
+        )
+
+        tokens = 0
+        sentences = [[]]
+        for line in self.read_file("train", "generic.tsv"):
+            if not line:
+                # A falsy row marks the end of a chunk
+                sentences.append([])
+            else:
+                sentences[-1].append(line)
+                tokens += 1
+
+        sentences = [s for s in sentences if s]
+
+        # The assertions expect 500 * 0.8 = 400 train tokens in chunks of 10
+        nb_tokens = round(500 * 0.8)
+        nb_chunks = round(nb_tokens / 10)
+
+        self.assertNotEqual(
+            [True] * nb_chunks,
+            [s[0]["token"][0].isupper() for s in sentences],
+            "Not every sentence should start with a capitalized word. There is a very small chance that "
+            "this distribution happened. The test would fail in this case..."
+        )
+        self.assertEqual(
+            round(0.3 * nb_tokens),
+            [t["token"][0].isupper() for s in sentences for t in s].count(True),
+            "30% of tokens should be Capitalized"
+        )
+
+        self.assertEqual(tokens, 500 * 0.8, "There should be 80% of total tokens")
+
+    def test_capitalized_and_indicator(self):
+        """Ensure capitals are replaced by a lowercase letter followed by the marker.
+
+        Half of the train tokens are expected to start with a lowercase letter
+        immediately followed by the uppercase marker character.
+        """
+        conf = self._general_config_write("""
+
+
+
+
+
+        """)
+
+        self._dispatch(
+            output_dir="./tests/tests_output/",
+            train=0.8,
+            dev=0.1,
+            test=0.1,
+            config=conf
+        )
+
+        tokens = 0
+        sentences = [[]]
+        for line in self.read_file("train", "generic.tsv"):
+            if not line:
+                # A falsy row marks the end of a chunk
+                sentences.append([])
+            else:
+                sentences[-1].append(line)
+                tokens += 1
+
+        sentences = [s for s in sentences if s]
+
+        # NEUTRAL CHESS QUEEN (U+1FA01), written as an escape so that the literal
+        # survives tools that mishandle non-BMP characters
+        marker = "\U0001FA01"
+        nb_tokens = round(500 * 0.8 * 0.5)
+
+        self.assertEqual(
+            nb_tokens,
+            [t["token"][0].islower() and t["token"][1] == marker for s in sentences for t in s].count(True),
+            "50% of tokens should start with a lowercased letter followed by the uppercase marker"
+        )
+
+        self.assertEqual(tokens, 500 * 0.8, "There should be 80% of total tokens")