Merge 33e920d into c347c0d

hipster-philology · Sep 17, 2020 · f5f1636 · f5f1636
2 parents c347c0d + 33e920d
commit f5f1636
Show file tree

Hide file tree

Showing 15 changed files with 1,815 additions and 69 deletions.
diff --git a/.gitignore b/.gitignore
@@ -6,6 +6,7 @@ memory.csv
 memory*.csv
 new.yaml
 tests/tests_output
+tests/test_config/generated.xml
 
 # Byte-compiled / optimized / DLL files
 __pycache__/

diff --git a/DOCUMENTATION.md b/DOCUMENTATION.md
@@ -361,3 +361,40 @@ It will produces the following output
 
 The glue token is not applied on token, the lemma value is transfered to the previous row and the POS is lost.
 `@glue_char` is used to concatenate columns such as `lemma` here,
+
+### Capitalization
+
+This post-processing function capitalizes (*i.e.* makes the first letter upper-case) the first word of chunks
+(*i.e.* sentences), either always or randomly, as well as random words inside chunks. It can also create an
+uppercase mask, where uppercased letters are replaced with their lowercase form together with the [Neutral Chess Queen UTF-8 character](https://www.compart.com/fr/unicode/U+1FA01).
+
+The model is the following:
+
+```xml
+<config>
+    <!--...-->
+    <postprocessing>
+        <capitalize column-token="token" caps-to-utf8-marker="true">
+            <first-word when="never">
+                <sentence-marker name="empty_line"/>
+            </first-word>
+            <first-letters when="ratio" ratio="0.5"/>
+        </capitalize>
+    </postprocessing>
+    <!--...-->
+</config>
+```
+
+1. <kbd>column-token</kbd> specifies the name of the column containing the raw form of the tokens
+2. (Optional) <kbd>column-lemma</kbd> does the same thing for lemma
+3. <kbd>caps-to-utf8-marker</kbd> activates the masking of uppercased letters.
+4. <kbd>first-word</kbd> is activated when <kbd>when</kbd> is set to one of `always`, `random` or `ratio`.
+    1. <kbd>when="ratio"</kbd> requires a second <kbd>ratio</kbd> attribute, which needs to be a float between 0.0 and 1.0 (basically a proportion)
+    2. <kbd>when="random"</kbd> is a shortcut for <kbd>when="ratio"</kbd> with <kbd>ratio="0.5"</kbd>
+    3. To identify sentences, you need to set up <kbd>sentence-marker</kbd>
+        1. It can be <kbd>name="empty_line"</kbd>, in which case chunks are separated by an empty line (default output)
+        2. It can be <kbd>name="regexp"</kbd>, in which case it takes a `@matchPattern` attribute (the regular expression)
+        and the column that needs to be matched in `@source`, *i.e.* `<sentence-marker name="regexp" matchPattern="[\.!?]" source="lemma" />`
+5. <kbd>first-letters</kbd> takes the same `when`/`ratio` attributes as <kbd>first-word</kbd>. It applies said capitalization
+    to random words inside chunks.
+
diff --git a/exemple.xml b/exemple.xml
@@ -1,5 +1,5 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<?xml-model href="protogeneia/schema.rng" schematypens="http://relaxng.org/ns/structure/1.0"?>
+<?xml-model href="protogenie/schema.rng" schematypens="http://relaxng.org/ns/structure/1.0"?>
 <config>
     <output>
         <header>

diff --git a/protogenie/cli.py b/protogenie/cli.py
@@ -32,12 +32,13 @@ def cli_scheme(dest):
 @main.command("build")
 @click.argument("file", type=click.Path(exists=True, file_okay=True, dir_okay=False))
 @click.option("--output", default="./output", type=str, help="Directory where the output should be built")
+@click.option("-n", "--no-split", default=False, is_flag=True, help="Does not apply splitting to files")
 @click.option("-c", "--clear", default=False, is_flag=True, help="Clear the output directory")
 @click.option("-t", "--train", "train", default=0.8, type=float, help="Percentage of data to use for training")
 @click.option("-d", "--dev", "dev", default=0., type=float, help="Percentage of data to use for dev set")
 @click.option("-e", "--test", "test", default=0.2, type=float, help="Percentage of data to use for test set")
 @click.option("-v", "--verbose", default=False, is_flag=True, help="Print text level stats")
-def cli_build(file, output, clear=False, train=0.8, dev=.0, test=0.2, verbose=False):
+def cli_build(file, output="./output", no_split=False, clear=False, train=0.8, dev=.0, test=0.2, verbose=False):
     """ Uses [FILE] to split and pre-process a training corpus for NLP Tasks. File should follow the schema, see
     protogeneia get-scheme"""
 
@@ -49,6 +50,8 @@ def cli_build(file, output, clear=False, train=0.8, dev=.0, test=0.2, verbose=Fa
             shutil.rmtree(output, ignore_errors=True)
         else:
             print("\tData were not removed")
+    if no_split:
+        train, test, dev = 1, 0, 0
     dispatch(
         config=file,
         train=train,
@@ -122,18 +125,19 @@ def dispatch(
     """
 
     train, test, dev = check_ratio(train, test, dev)
-    print(train, test, dev)
     config = ProtogenieConfiguration.from_xml(config)
+    no_split = sorted([train, test, dev]) == [0, 0, 1]
 
     os.makedirs(output_dir, exist_ok=True)
-    for subset in ["dev", "test", "train"]:
-        os.makedirs(os.path.join(output_dir, subset), exist_ok=True)
+    if not no_split:  # No split
+        for subset in ["dev", "test", "train"]:
+            os.makedirs(os.path.join(output_dir, subset), exist_ok=True)
 
     print("=============")
     print("Processing...")
     # I run over each files
     for file, ratios in split_files(output_folder=output_dir, verbose=verbose, dev_ratio=dev, test_ratio=test,
-                                    config=config):
+                                    config=config, no_split=no_split):
 
         print("{} has been transformed".format(file))
         for key, value in ratios.items():

diff --git a/protogenie/configs.py b/protogenie/configs.py
@@ -5,13 +5,13 @@
 
 from .splitters import RegExpSplitter, LineSplitter, TokenWindowSplitter, FileSplitter, _SplitterPrototype
 from .reader import Reader
-from .postprocessing import Disambiguation, ReplacementSet, Skip, PostProcessing, Clitic
+from .postprocessing import Disambiguation, ReplacementSet, Skip, PostProcessing, Clitic, Capitalize
 from .toolbox import RomanNumeral
 import datetime
 from dataclasses import dataclass
 Splitter = Type[_SplitterPrototype]
 
-PostProcessingClasses = [Disambiguation, ReplacementSet, Skip, RomanNumeral, Clitic]
+PostProcessingClasses = [Disambiguation, ReplacementSet, Skip, RomanNumeral, Clitic, Capitalize]
 
 
 @dataclass

diff --git a/protogenie/dispatch.py b/protogenie/dispatch.py
@@ -39,14 +39,15 @@ class _CorpusDispatched:
 
 def split_files(
         config: ProtogenieConfiguration, output_folder: str, dev_ratio: float, test_ratio: float,
-        verbose: bool = True):
+        verbose: bool = True, no_split: bool = False):
     """ Dispatch sentence for each file in files
 
     :param config: Configuration for PPA Splitter
     :param output_folder: Folder where the data should be saved
     :param dev_ratio: Ratio of data to put in dev
     :param test_ratio: Ratio of data to put in test
     :param verbose: Verbosity (Adds some print during process)
+    :param no_split: Do not apply splitting
 
     :yield: File, Dispatch stats about file
     """
@@ -63,7 +64,7 @@ def split_files(
            yield from _single_file_dispatch(
                file, current_config=current_config, memory=memory,
                dev_ratio=dev_ratio, test_ratio=test_ratio, output_folder=output_folder,
-               config=config, verbose=verbose
+               config=config, verbose=verbose, no_split=no_split
            )
 
     if memory:
@@ -298,7 +299,7 @@ def _preview(file: str, current_config: CorpusConfiguration) -> Tuple[List[str],
 def _single_file_dispatch(
         file: str, current_config: CorpusConfiguration,
         config: ProtogenieConfiguration, output_folder: str, dev_ratio: float, test_ratio: float,
-        verbose: bool = True, memory=None
+        verbose: bool = True, memory=None, no_split: bool = False
 ):
     # We do two passes here
     #  1. The first one is used to collect informations about the file. In order to not keep data in memory,
@@ -313,27 +314,34 @@ def _single_file_dispatch(
     header_line, unit_counts, empty_lines, lines = _preview(file, current_config)
 
     if verbose:
-        print("{unit_count} {unit_name} to dispatch in {filename} ({lines} full, {lines_empty} empty)".format(
-            filename=file, unit_name=current_config.unit_name, unit_count=unit_counts,
-            lines_empty=empty_lines, lines=lines
-        ))
+        if no_split is True:
+            print("Not splitting file {filename}".format(filename=file))
+        else:
+            print("{unit_count} {unit_name} to dispatch in {filename} ({lines} full, {lines_empty} empty)".format(
+                filename=file, unit_name=current_config.unit_name, unit_count=unit_counts,
+                lines_empty=empty_lines, lines=lines
+            ))
 
     # We set up numbers based on the ratio
     # In order to do that, we get to use
-    target_dataset = current_config.build_dataset_dispatch_list(
-        units_count=unit_counts,
-        test_ratio=test_ratio,
-        dev_ratio=dev_ratio
-    )
+    if no_split is False:
+        target_dataset = current_config.build_dataset_dispatch_list(
+            units_count=unit_counts,
+            test_ratio=test_ratio,
+            dev_ratio=dev_ratio
+        )
 
-    # We set up a dictionary of token count to print nice
-    #  information later
-    training_tokens = {"test": 0, "dev": 0, "train": 0}
+        # We set up a dictionary of token count to print nice
+        #  information later
+        training_tokens = {"test": 0, "dev": 0, "train": 0}
+    else:
+        training_tokens = {"output": 0}
 
     # ToDo: When file splitter, the number of lines should be passed here probably ? Or is reset the issue ? ...
 
     current_config.splitter.reset()
-    current_config.splitter.set_targets(target_dataset)
+    if not no_split:
+        current_config.splitter.set_targets(target_dataset)
 
     created_files = set()
 
@@ -351,19 +359,24 @@ def _single_file_dispatch(
 
             sentence.append(line)
             if current_config.splitter(line, reader=current_config.reader):
-                dataset = target_dataset.pop(0)
+                if no_split:
+                    dataset = "output"
+                else:
+                    dataset = target_dataset.pop(0)
 
                 if memory:
                     memory.writerow([file, "{}-{}".format(line_no - len(sentence) + 1 - blanks, line_no), dataset])
                     blanks = 0
+
                 sentence = [x for x in sentence if x.strip()]
                 add_sentence(
                     output_folder=output_folder,
                     dataset=dataset,
                     filename=file,
                     sentence=sentence,
                     source_marker=current_config.column_marker,
-                    output_marker=config.output.column_marker
+                    output_marker=config.output.column_marker,
+                    subfolder=not no_split
                 )
                 training_tokens[dataset] += len(sentence)
                 sentence = []
@@ -385,14 +398,16 @@ def _single_file_dispatch(
                 filename=file,
                 sentence=sentence,
                 source_marker=current_config.column_marker,
-                output_marker=config.output.column_marker
+                output_marker=config.output.column_marker,
+                subfolder=not no_split
             )
             training_tokens[dataset] += len(sentence)
 
     created_files.update(
         _add_header(
             output_folder=output_folder, training_tokens=training_tokens, header_line=header_line,
-            current_config=current_config, file=file
+            current_config=current_config, file=file,
+            subfolder=not no_split
         )
     )
 
@@ -406,11 +421,16 @@ def _single_file_dispatch(
 
 def _add_header(output_folder: str, file: str,
                 training_tokens: Dict[str, int], current_config: CorpusConfiguration,
-                header_line: List[str]) -> Set[str]:
+                header_line: List[str],
+                subfolder: bool = True
+                ) -> Set[str]:
     files = set()
     for dataset, tokens in training_tokens.items():
         if tokens:
-            trg = get_name(output_folder, dataset, file)
+            if subfolder:
+                trg = get_name(output_folder, dataset, file)
+            else:
+                trg = get_name(output_folder, "", file)
             files.add(trg)  # We add the file to the one we created
             with open(trg) as f:
                 content = f.read()

diff --git a/protogenie/io_utils.py b/protogenie/io_utils.py
@@ -8,7 +8,8 @@ def get_name(output_folder, dataset, filename):
 
 def add_sentence(
         output_folder: str, dataset: str, filename: str,
-        sentence: List[str], source_marker: str, output_marker: str):
+        sentence: List[str], source_marker: str, output_marker: str,
+        subfolder: bool = True):
     """ Write a sentence in the given dataset
 
     :param output_folder:
@@ -17,7 +18,10 @@ def add_sentence(
     :param sentence:
     :return:
     """
-    filename = get_name(output_folder, dataset, filename)
+    if subfolder:
+        filename = get_name(output_folder, dataset, filename)
+    else:
+        filename = get_name(output_folder, "", filename)
     if not os.path.isfile(filename):
         mode = "w"
     else: