Skip to content

Commit

Permalink
Merge 33e920d into c347c0d
Browse files Browse the repository at this point in the history
  • Loading branch information
PonteIneptique committed Sep 17, 2020
2 parents c347c0d + 33e920d commit f5f1636
Show file tree
Hide file tree
Showing 15 changed files with 1,815 additions and 69 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ memory.csv
memory*.csv
new.yaml
tests/tests_output
tests/test_config/generated.xml

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
37 changes: 37 additions & 0 deletions DOCUMENTATION.md
Original file line number Diff line number Diff line change
Expand Up @@ -361,3 +361,40 @@ It will produces the following output

The glue token is not applied to the token: the lemma value is transferred to the previous row and the POS is lost.
`@glue_char` is used to concatenate columns such as `lemma` here.

### Capitalization

This post-processing function capitalizes (*i.e.* makes the first letter of a word upper-case) the first word of
chunks (*i.e.* sentences), either always or randomly, as well as random words inside chunks. It can also create an
uppercase mask, where it replaces uppercased letters with their lowercase form plus the [Neutral Chess Queen UTF-8 character](https://www.compart.com/fr/unicode/U+1FA01).

The model is the following:

```xml
<config>
<!--...-->
<postprocessing>
<capitalize column-token="token" caps-to-utf8-marker="true">
<first-word when="never">
<sentence-marker name="empty_line"/>
</first-word>
<first-letters when="ratio" ratio="0.5"/>
</capitalize>
</postprocessing>
<!--...-->
</config>
```

1. <kbd>column-token</kbd> specifies the name of the column containing the raw form of the tokens
2. (Optional) <kbd>column-lemma</kbd> does the same thing for lemma
3. <kbd>caps-to-utf8-marker</kbd> activates masking uppercased letters.
4. <kbd>first-word</kbd> is activated when <kbd>when</kbd> is set to one of `always`, `random` or `ratio`.
    1. <kbd>when="ratio"</kbd> requires a second <kbd>ratio</kbd> attribute, which must be a float between 0.0 and 1.0 (basically a proportion)
    2. <kbd>when="random"</kbd> is a shortcut for <kbd>when="ratio"</kbd> with <kbd>ratio="0.5"</kbd>
    3. To identify sentences, you need to set up <kbd>sentence-marker</kbd>
        1. It can be <kbd>name="empty_line"</kbd>, in which case chunks are separated by an empty line (default output)
        2. It can be <kbd>name="regexp"</kbd>, in which case it takes a `@matchPattern` attribute (a regular expression)
           and the column to match against in `@source`, *i.e.* `<sentence-marker name="regexp" matchPattern="[\.!?]" source="lemma" />`
5. <kbd>first-letters</kbd> uses the same <kbd>when</kbd>/<kbd>ratio</kbd> attributes as <kbd>first-word</kbd>. It applies said capitalization
   to random words inside chunks.

2 changes: 1 addition & 1 deletion exemple.xml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-model href="protogeneia/schema.rng" schematypens="http://relaxng.org/ns/structure/1.0"?>
<?xml-model href="protogenie/schema.rng" schematypens="http://relaxng.org/ns/structure/1.0"?>
<config>
<output>
<header>
Expand Down
14 changes: 9 additions & 5 deletions protogenie/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,13 @@ def cli_scheme(dest):
@main.command("build")
@click.argument("file", type=click.Path(exists=True, file_okay=True, dir_okay=False))
@click.option("--output", default="./output", type=str, help="Directory where the output should be built")
@click.option("-n", "--no-split", default=False, is_flag=True, help="Does not apply splitting to files")
@click.option("-c", "--clear", default=False, is_flag=True, help="Clear the output directory")
@click.option("-t", "--train", "train", default=0.8, type=float, help="Percentage of data to use for training")
@click.option("-d", "--dev", "dev", default=0., type=float, help="Percentage of data to use for dev set")
@click.option("-e", "--test", "test", default=0.2, type=float, help="Percentage of data to use for test set")
@click.option("-v", "--verbose", default=False, is_flag=True, help="Print text level stats")
def cli_build(file, output, clear=False, train=0.8, dev=.0, test=0.2, verbose=False):
def cli_build(file, output="./output", no_split=False, clear=False, train=0.8, dev=.0, test=0.2, verbose=False):
""" Uses [FILE] to split and pre-process a training corpus for NLP Tasks. File should follow the schema, see
protogeneia get-scheme"""

Expand All @@ -49,6 +50,8 @@ def cli_build(file, output, clear=False, train=0.8, dev=.0, test=0.2, verbose=Fa
shutil.rmtree(output, ignore_errors=True)
else:
print("\tData were not removed")
if no_split:
train, test, dev = 1, 0, 0
dispatch(
config=file,
train=train,
Expand Down Expand Up @@ -122,18 +125,19 @@ def dispatch(
"""

train, test, dev = check_ratio(train, test, dev)
print(train, test, dev)
config = ProtogenieConfiguration.from_xml(config)
no_split = sorted([train, test, dev]) == [0, 0, 1]

os.makedirs(output_dir, exist_ok=True)
for subset in ["dev", "test", "train"]:
os.makedirs(os.path.join(output_dir, subset), exist_ok=True)
if not no_split: # No split
for subset in ["dev", "test", "train"]:
os.makedirs(os.path.join(output_dir, subset), exist_ok=True)

print("=============")
print("Processing...")
# I run over each files
for file, ratios in split_files(output_folder=output_dir, verbose=verbose, dev_ratio=dev, test_ratio=test,
config=config):
config=config, no_split=no_split):

print("{} has been transformed".format(file))
for key, value in ratios.items():
Expand Down
4 changes: 2 additions & 2 deletions protogenie/configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@

from .splitters import RegExpSplitter, LineSplitter, TokenWindowSplitter, FileSplitter, _SplitterPrototype
from .reader import Reader
from .postprocessing import Disambiguation, ReplacementSet, Skip, PostProcessing, Clitic
from .postprocessing import Disambiguation, ReplacementSet, Skip, PostProcessing, Clitic, Capitalize
from .toolbox import RomanNumeral
import datetime
from dataclasses import dataclass
Splitter = Type[_SplitterPrototype]

PostProcessingClasses = [Disambiguation, ReplacementSet, Skip, RomanNumeral, Clitic]
PostProcessingClasses = [Disambiguation, ReplacementSet, Skip, RomanNumeral, Clitic, Capitalize]


@dataclass
Expand Down
64 changes: 42 additions & 22 deletions protogenie/dispatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,15 @@ class _CorpusDispatched:

def split_files(
config: ProtogenieConfiguration, output_folder: str, dev_ratio: float, test_ratio: float,
verbose: bool = True):
verbose: bool = True, no_split: bool = False):
""" Dispatch sentence for each file in files
:param config: Configuration for PPA Splitter
:param output_folder: Folder where the data should be saved
:param dev_ratio: Ratio of data to put in dev
:param test_ratio: Ratio of data to put in test
:param verbose: Verbosity (Adds some print during process)
:param no_split: Do not apply splitting
:yield: File, Dispatch stats about file
"""
Expand All @@ -63,7 +64,7 @@ def split_files(
yield from _single_file_dispatch(
file, current_config=current_config, memory=memory,
dev_ratio=dev_ratio, test_ratio=test_ratio, output_folder=output_folder,
config=config, verbose=verbose
config=config, verbose=verbose, no_split=no_split
)

if memory:
Expand Down Expand Up @@ -298,7 +299,7 @@ def _preview(file: str, current_config: CorpusConfiguration) -> Tuple[List[str],
def _single_file_dispatch(
file: str, current_config: CorpusConfiguration,
config: ProtogenieConfiguration, output_folder: str, dev_ratio: float, test_ratio: float,
verbose: bool = True, memory=None
verbose: bool = True, memory=None, no_split: bool = False
):
# We do two passes here
# 1. The first one is used to collect informations about the file. In order to not keep data in memory,
Expand All @@ -313,27 +314,34 @@ def _single_file_dispatch(
header_line, unit_counts, empty_lines, lines = _preview(file, current_config)

if verbose:
print("{unit_count} {unit_name} to dispatch in {filename} ({lines} full, {lines_empty} empty)".format(
filename=file, unit_name=current_config.unit_name, unit_count=unit_counts,
lines_empty=empty_lines, lines=lines
))
if no_split is True:
print("Not splitting file {filename}")
else:
print("{unit_count} {unit_name} to dispatch in {filename} ({lines} full, {lines_empty} empty)".format(
filename=file, unit_name=current_config.unit_name, unit_count=unit_counts,
lines_empty=empty_lines, lines=lines
))

# We set up numbers based on the ratio
# In order to do that, we get to use
target_dataset = current_config.build_dataset_dispatch_list(
units_count=unit_counts,
test_ratio=test_ratio,
dev_ratio=dev_ratio
)
if no_split is False:
target_dataset = current_config.build_dataset_dispatch_list(
units_count=unit_counts,
test_ratio=test_ratio,
dev_ratio=dev_ratio
)

# We set up a dictionary of token count to print nice
# information later
training_tokens = {"test": 0, "dev": 0, "train": 0}
# We set up a dictionary of token count to print nice
# information later
training_tokens = {"test": 0, "dev": 0, "train": 0}
else:
training_tokens = {"output": 0}

# ToDo: When file splitter, the number of lines should be passed here probably ? Or is reset the issue ? ...

current_config.splitter.reset()
current_config.splitter.set_targets(target_dataset)
if not no_split:
current_config.splitter.set_targets(target_dataset)

created_files = set()

Expand All @@ -351,19 +359,24 @@ def _single_file_dispatch(

sentence.append(line)
if current_config.splitter(line, reader=current_config.reader):
dataset = target_dataset.pop(0)
if no_split:
dataset = "output"
else:
dataset = target_dataset.pop(0)

if memory:
memory.writerow([file, "{}-{}".format(line_no - len(sentence) + 1 - blanks, line_no), dataset])
blanks = 0

sentence = [x for x in sentence if x.strip()]
add_sentence(
output_folder=output_folder,
dataset=dataset,
filename=file,
sentence=sentence,
source_marker=current_config.column_marker,
output_marker=config.output.column_marker
output_marker=config.output.column_marker,
subfolder=not no_split
)
training_tokens[dataset] += len(sentence)
sentence = []
Expand All @@ -385,14 +398,16 @@ def _single_file_dispatch(
filename=file,
sentence=sentence,
source_marker=current_config.column_marker,
output_marker=config.output.column_marker
output_marker=config.output.column_marker,
subfolder=not no_split
)
training_tokens[dataset] += len(sentence)

created_files.update(
_add_header(
output_folder=output_folder, training_tokens=training_tokens, header_line=header_line,
current_config=current_config, file=file
current_config=current_config, file=file,
subfolder=not no_split
)
)

Expand All @@ -406,11 +421,16 @@ def _single_file_dispatch(

def _add_header(output_folder: str, file: str,
training_tokens: Dict[str, int], current_config: CorpusConfiguration,
header_line: List[str]) -> Set[str]:
header_line: List[str],
subfolder: bool = True
) -> Set[str]:
files = set()
for dataset, tokens in training_tokens.items():
if tokens:
trg = get_name(output_folder, dataset, file)
if subfolder:
trg = get_name(output_folder, dataset, file)
else:
trg = get_name(output_folder, "", file)
files.add(trg) # We add the file to the one we created
with open(trg) as f:
content = f.read()
Expand Down
8 changes: 6 additions & 2 deletions protogenie/io_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ def get_name(output_folder, dataset, filename):

def add_sentence(
output_folder: str, dataset: str, filename: str,
sentence: List[str], source_marker: str, output_marker: str):
sentence: List[str], source_marker: str, output_marker: str,
subfolder: bool = True):
""" Write a sentence in the given dataset
:param output_folder:
Expand All @@ -17,7 +18,10 @@ def add_sentence(
:param sentence:
:return:
"""
filename = get_name(output_folder, dataset, filename)
if subfolder:
filename = get_name(output_folder, dataset, filename)
else:
filename = get_name(output_folder, "", filename)
if not os.path.isfile(filename):
mode = "w"
else:
Expand Down

0 comments on commit f5f1636

Please sign in to comment.