In [2]:
import os

import dataprep.api.corpus as api
from vocabstudy.common import PrepFunction, calc_and_display_stats, VocabStatsCsvWriter, HEADER, HOME
# Java reserved words plus the literals true/false/null; run() forwards this
# set to calc_and_display_stats.
JAVA_KEYWORDS = {
    "abstract", "assert",
    "boolean", "break", "byte",
    "case", "catch", "char", "class", "const", "continue",
    "default", "do", "double",
    "else", "enum", "extends",
    "final", "finally", "float", "for",
    "goto",
    "if", "implements", "import", "instanceof", "int", "interface",
    "long",
    "native", "new",
    "package", "private", "protected", "public",
    "return",
    "short", "static", "strictfp", "super", "switch", "synchronized",
    "this", "throw", "throws", "transient", "try",
    "void", "volatile",
    "while",
    # literal values
    "true", "false", "null",
}

# Dataset identifiers that run() forwards to calc_and_display_stats.
JAVA_DATASETS = ('allamanis/java-minus-small-test', 'allamanis/java-small-test')

# Shared writer targeting 'java-stats.csv' under HOME, created with HEADER;
# run() passes every computed stats row to writer.write_line.
writer = VocabStatsCsvWriter(os.path.join(HOME, 'java-stats.csv'), HEADER)

def run(prep_function: PrepFunction, description: str) -> None:
    """Compute the stats for one preprocessing setup and record them.

    Calls calc_and_display_stats with the given prep function and description,
    together with JAVA_DATASETS, JAVA_KEYWORDS and the language tag 'java',
    then hands the returned row to the module-level ``writer``.
    """
    writer.write_line(
        calc_and_display_stats(prep_function, description, JAVA_DATASETS, JAVA_KEYWORDS, 'java')
    )

In [11]:
import dataprep

# Bare expression: shows the installed dataprep version as the cell output.
dataprep.__version__

'1.0.0-alpha.8'

## Unsplit corpus

In [10]:
# api.nosplit with no further arguments.
description = "unsplit, with comments and strings. No filtering."
prep_function = PrepFunction(api.nosplit)

run(prep_function, description)

unsplit, with comments and vocabulary. No filtering.

2019-08-18 23:27:54,149 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-small-test_19-02-09T13-18-23_java_-_uc10su/vocab
2019-08-18 23:27:54,150 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-18 23:27:54,151 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /home/lv71161/hlibbabii/prep-datasets/java-small-test_19-02-09T13-18-23_java_-_uc10su_-_prep
2019-08-18 23:27:55,307 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-small-test_19-02-09T13-18-23_java_-_uc10su/vocab
2019-08-18 23:27:55,308 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-18 23:27:55,309 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /home/lv71161/hlibbabii/p

## Filtering

In [5]:
# api.nosplit with only the "no_unicode" option switched on.
description = "Filtering non-ASCII tokens"
prep_function = PrepFunction(
    api.nosplit,
    [],
    {"no_unicode": True},
)

run(prep_function, description)

Filtering non-ASCII tokens

2019-08-14 11:04:30,149 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-minus-small-test_19-08-08T23-17-22_java_-_Uc10su/vocab
2019-08-14 11:04:30,177 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-14 11:04:30,179 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /home/lv71161/hlibbabii/prep-datasets/java-minus-small-test_19-08-08T23-17-22_java_-_Uc10su_-_prep
2019-08-14 11:05:03,034 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-small-test_19-02-09T13-18-23_java_-_Uc10su/vocab
2019-08-14 11:05:03,037 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-14 11:05:03,037 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /home/lv71161/hlibbabii/prep-datasets/j

In [6]:
# api.nosplit with "no_unicode" and "no_spaces" switched on.
description = "Filtering whitespace (+ non-ascii)"
prep_function = PrepFunction(
    api.nosplit,
    [],
    {"no_unicode": True, "no_spaces": True},
)

run(prep_function, description)

Filtering whitespace (+ non-ascii)

2019-08-14 11:07:51,950 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-minus-small-test_19-08-08T23-17-22_java_-_Uc100u/vocab
2019-08-14 11:07:51,972 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-14 11:07:51,972 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /home/lv71161/hlibbabii/prep-datasets/java-minus-small-test_19-08-08T23-17-22_java_-_Uc100u_-_prep
2019-08-14 11:08:24,462 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-small-test_19-02-09T13-18-23_java_-_Uc100u/vocab
2019-08-14 11:08:24,494 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-14 11:08:24,495 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /home/lv71161/hlibbabii/prep-da

In [7]:
# api.nosplit with "no_unicode", "no_spaces" and "no_com" switched on.
description = "Filtering comments (+ whitespace, + non-ascii)"
prep_function = PrepFunction(
    api.nosplit,
    [],
    {"no_unicode": True, "no_spaces": True, "no_com": True},
)

run(prep_function, description)

Filtering comments (+ whitespace, + non-ascii)

2019-08-14 11:09:04,071 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-minus-small-test_19-08-08T23-17-22_java_-_U0100u/vocab
2019-08-14 11:09:04,159 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-14 11:09:04,160 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /home/lv71161/hlibbabii/prep-datasets/java-minus-small-test_19-08-08T23-17-22_java_-_U0100u_-_prep
2019-08-14 11:09:38,090 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-small-test_19-02-09T13-18-23_java_-_U0100u/vocab
2019-08-14 11:09:38,245 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-14 11:09:38,246 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /home/lv71161/hlibb

In [4]:
# api.nosplit with "no_unicode", "no_spaces", "no_com" and "no_str" switched on.
description = "Filtering  strings (+ comments, + whitespace, + non-ascii)"
prep_function = PrepFunction(
    api.nosplit,
    [],
    {"no_unicode": True, "no_spaces": True, "no_com": True, "no_str": True},
)

run(prep_function, description)

Filtering  strings (+ comments, + whitespace, + non-ascii)

2019-08-16 12:01:58,564 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-minus-small-test_19-08-08T23-17-22_java_-_U0000u/vocab
2019-08-16 12:01:58,621 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-16 12:01:58,622 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /home/lv71161/hlibbabii/prep-datasets/java-minus-small-test_19-08-08T23-17-22_java_-_U0000u_-_prep
2019-08-16 12:02:23,453 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-small-test_19-02-09T13-18-23_java_-_U0000u/vocab
2019-08-16 12:02:23,523 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-16 12:02:23,524 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /home/l

In [None]:
# api.nosplit with "full_strings" enabled and "max_str_length" set to 14, on top
# of the no_unicode / no_spaces / no_com options.
description = "An additional choice, the model from Hellendoorn and Devanbu: keep strings shorter than 15 char (check SLP-core code), remove others, remove comments  (+ no spaces, + no unicode)  --> Baseline for word splitting"
prep_function = PrepFunction(
    api.nosplit,
    [],
    {"no_unicode": True, "no_spaces": True, "no_com": True, "full_strings": True, "max_str_length": 14},
)

run(prep_function, description)

## Word splitting

In [10]:
# api.basic with no_unicode / no_spaces / no_com and "max_str_length" 14.
description = "[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars] + Word splitting via conventions --> Baseline for subword splitting"
prep_function = PrepFunction(
    api.basic,
    [],
    {"no_unicode": True, "no_spaces": True, "no_com": True, "max_str_length": 14},
)

run(prep_function, description)

[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars] + Word splitting via conventions

2019-08-14 11:12:19,725 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-minus-small-test_19-08-08T23-17-22_java_-_U0E10u/vocab
2019-08-14 11:12:19,727 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-14 11:12:19,729 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /home/lv71161/hlibbabii/prep-datasets/java-minus-small-test_19-08-08T23-17-22_java_-_U0E10u_-_prep
2019-08-14 11:12:25,575 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-small-test_19-02-09T13-18-23_java_-_U0E10u/vocab
2019-08-14 11:12:25,592 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-14 11:12:

In [3]:
# Same api.basic options as the word-splitting run, plus "no_case".
description = "[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars] + Word splitting via conventions, and remove case."
prep_function = PrepFunction(
    api.basic,
    [],
    {"no_unicode": True, "no_spaces": True, "no_com": True, "max_str_length": 14, "no_case": True},
)

run(prep_function, description)

[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars] + Word splitting via conventions, and remove case.

2019-08-16 14:54:02,956 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-minus-small-test_19-08-08T23-17-22_java_-_U0E10l/vocab
2019-08-16 14:54:02,967 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-16 14:54:02,967 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /home/lv71161/hlibbabii/prep-datasets/java-minus-small-test_19-08-08T23-17-22_java_-_U0E10l_-_prep
2019-08-16 14:54:08,592 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-small-test_19-02-09T13-18-23_java_-_U0E10l/vocab
2019-08-16 14:54:08,602 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date

## Subword splitting

In [4]:
# api.basic with no_unicode / no_spaces / no_com, "max_str_length" 14 and "split_numbers".
description = "[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars + split via conventions, but keep case] + Split numbers"
prep_function = PrepFunction(
    api.basic,
    [],
    {"no_unicode": True, "no_spaces": True, "no_com": True, "max_str_length": 14, "split_numbers": True},
)
run(prep_function, description)

[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars] + split via conventions, but keep case.+ Split numbers

2019-08-16 14:54:12,001 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-minus-small-test_19-08-08T23-17-22_java_-_U0E20l/vocab
2019-08-16 14:54:12,025 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-16 14:54:12,026 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /home/lv71161/hlibbabii/prep-datasets/java-minus-small-test_19-08-08T23-17-22_java_-_U0E20l_-_prep
2019-08-16 14:54:14,775 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-small-test_19-02-09T13-18-23_java_-_U0E20l/vocab
2019-08-16 14:54:14,798 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-

In [6]:
# api.basic as in the split-numbers run, with "ronin" additionally enabled.
description = "[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars + split via conventions, but keep case] + Split numbers + Ronin"
prep_function = PrepFunction(
    api.basic,
    [],
    {"no_unicode": True, "no_spaces": True, "no_com": True, "max_str_length": 14, "split_numbers": True, "ronin": True},
)
run(prep_function, description)

[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars] + split via conventions, but keep case. + Split numbers + Ronin

2019-08-16 14:56:11,400 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-minus-small-test_19-08-08T23-17-22_java_-_U0E30l/vocab
2019-08-16 14:56:11,401 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-16 14:56:11,401 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /home/lv71161/hlibbabii/prep-datasets/java-minus-small-test_19-08-08T23-17-22_java_-_U0E30l_-_prep
2019-08-16 14:56:12,655 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-small-test_19-02-09T13-18-23_java_-_U0E30l/vocab
2019-08-16 14:56:12,657 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed a

In [None]:
# api.basic as in the Ronin run, with "stemming" additionally enabled.
description = "[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars + split via conventions, but keep case] + Split numbers + Ronin + Stemming"
prep_function = PrepFunction(
    api.basic,
    [],
    {"no_unicode": True, "no_spaces": True, "no_com": True, "max_str_length": 14, "split_numbers": True, "ronin": True, "stemming": True},
)
run(prep_function, description)

In [None]:
# api.char with no_unicode / no_spaces / no_com and "max_str_length" 14.
description = "Char model (remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars, keep case)"
prep_function = PrepFunction(
    api.char,
    [],
    {"no_unicode": True, "no_spaces": True, "no_com": True, "max_str_length": 14},
)

run(prep_function, description)

## BPE

In [None]:
# api.bpe with 'java-bpe-training_nounicode-1000' (presumably the BPE codes id — confirm);
# options: no_com, max_str_length 14, no_unicode.
description = "bpe 1k Hellendoorn and Devanbu strings no comments no unicode"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-1000'],
    {"no_com": True, "max_str_length": 14, "no_unicode": True},
)
run(prep_function, description)

In [3]:
# api.bpe with 'java-bpe-training_nounicode-2000' (presumably the BPE codes id — confirm);
# options: no_com, max_str_length 14, no_unicode.
description = "bpe 2k Hellendoorn and Devanbu strings no comments, no unicode"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-2000'],
    {"no_com": True, "max_str_length": 14, "no_unicode": True},
)
run(prep_function, description)

bpe 2k Hellendoorn and Devanbu strings no comments, no unicode

2019-08-18 23:32:58,547 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-small-test_19-02-09T13-18-23_java_-_U0E9su_java-bpe-training_nounicode-2000/vocab
2019-08-18 23:32:58,547 [dataprep.infrastructure.stages] INFO: Parsing...
2019-08-18 23:32:58,664 [dataprep.infrastructure.stages] INFO: Parsed dataset is up-to-date.
2019-08-18 23:32:58,664 [dataprep.infrastructure.stages] INFO: Preprocessing...
2019-08-18 23:32:58,665 [dataprep.to_repr] INFO: Reading parsed files from: /home/lv71161/hlibbabii/.cache/dataprep/1.0.0-alpha.8/parsed_datasets/java-small-test_19-02-09T13-18-23_java
2019-08-18 23:32:58,666 [dataprep.to_repr] INFO: Using bpe merges file: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/bpe/java-bpe-training_19-08-01T10-29-09_java_-_nounicode/2000/merges.txt
2019-08-18 23:32:58,693 [dataprep.to_repr] INFO: Us

100%|██████████| 8268/8268 [01:15<00:00, 109.63it/s]


2019-08-18 23:34:14,856 [dataprep.infrastructure.stages] INFO: Computing vocab...
2019-08-18 23:34:14,858 [dataprep.vocab] DEBUG: Reading files from: /tmp/scratch/prep-datasets/java-small-test_19-02-09T13-18-23_java_-_U0E9su_java-bpe-training_nounicode-2000_-_prep
2019-08-18 23:34:14,859 [dataprep.vocab] INFO: Calculating vocabulary from scratch


100%|██████████| 5120/5120 [00:36<00:00,  3.08it/s] 

2019-08-18 23:34:51,941 [dataprep.vocab] DEBUG: Using 32 mergers, number of partial vocabs: 5120
2019-08-18 23:34:52,131 [dataprep.vocab] DEBUG: Merges need to be done: 5100





2019-08-18 23:34:52,267 [dataprep.vocab] INFO: [1] Merging vocabs (0 out of 5100)
2019-08-18 23:34:52,275 [dataprep.vocab] INFO: [2] Merging vocabs (1 out of 5100)
2019-08-18 23:34:52,279 [dataprep.vocab] DEBUG: [1] New words: ['com</t>', 'te', 'am', '11', '6', '0</t>', 'sc', 'out', 'ing</t>', 'f'] ..., total: 173
2019-08-18 23:34:52,280 [dataprep.vocab] DEBUG: [1] Merging took 0.012 s, current vocab size: 374
2019-08-18 23:34:52,282 [dataprep.vocab] INFO: [3] Merging vocabs (2 out of 5100)
2019-08-18 23:34:52,287 [dataprep.vocab] DEBUG: [2] New words: ['net</t>', 'g', 'tools</t>', 'gr', 'and</t>', 'ui</t>', 'raph</t>', 'draw', '2', 'd</t>'] ..., total: 410
2019-08-18 23:34:52,288 [dataprep.vocab] INFO: [1] Merging vocabs (4 out of 5100)
2019-08-18 23:34:52,289 [dataprep.vocab] DEBUG: [2] Merging took 0.013 s, current vocab size: 578
2019-08-18 23:34:52,309 [dataprep.vocab] INFO: [6] Merging vocabs (8 out of 5100)
2019-08-18 23:34:52,363 [dataprep.vocab] INFO: [14] Merging vocabs (16 o

2019-08-18 23:35:07,593 [dataprep.vocab] DEBUG: [28] Merging took 0.026 s, current vocab size: 1733
2019-08-18 23:35:07,592 [dataprep.vocab] DEBUG: [29] No vocabs available for merge. Terminating process..., mergers left: 27
2019-08-18 23:35:07,595 [dataprep.vocab] DEBUG: [28] No vocabs available for merge. Terminating process..., mergers left: 24
2019-08-18 23:35:07,595 [dataprep.vocab] DEBUG: [3] No vocabs available for merge. Terminating process..., mergers left: 23
2019-08-18 23:35:07,594 [dataprep.vocab] DEBUG: [19] No vocabs available for merge. Terminating process..., mergers left: 25
2019-08-18 23:35:07,595 [dataprep.vocab] DEBUG: [15] No vocabs available for merge. Terminating process..., mergers left: 26
2019-08-18 23:35:07,603 [dataprep.vocab] INFO: [25] Merging vocabs (5084 out of 5100)
2019-08-18 23:35:07,602 [dataprep.vocab] DEBUG: [18] No vocabs available for merge. Terminating process..., mergers left: 22
2019-08-18 23:35:07,603 [dataprep.vocab] DEBUG: [4] No vocabs ava

2019-08-18 23:35:08,612 [dataprep.vocab] DEBUG: [13] New words: ['wn', "'\\\\u"] ..., total: 2
2019-08-18 23:35:08,613 [dataprep.vocab] DEBUG: [13] Merging took 0.090 s, current vocab size: 2101
2019-08-18 23:35:08,615 [dataprep.vocab] INFO: 55% + 5%  ---> 60%
2019-08-18 23:35:08,715 [dataprep.vocab] DEBUG: [13] New words: ['alue</t>', 'Meta', 'oi'] ..., total: 3
2019-08-18 23:35:08,717 [dataprep.vocab] DEBUG: [13] Merging took 0.099 s, current vocab size: 2104
2019-08-18 23:35:08,719 [dataprep.vocab] INFO: 60% + 5%  ---> 65%
2019-08-18 23:35:08,839 [dataprep.vocab] DEBUG: [13] New words: [] ..., total: 0
2019-08-18 23:35:08,840 [dataprep.vocab] DEBUG: [13] Merging took 0.117 s, current vocab size: 2104
2019-08-18 23:35:08,842 [dataprep.vocab] INFO: 65% + 5%  ---> 70%
2019-08-18 23:35:08,964 [dataprep.vocab] DEBUG: [13] New words: ['rst', 'Enti'] ..., total: 2
2019-08-18 23:35:08,965 [dataprep.vocab] DEBUG: [13] Merging took 0.120 s, current vocab size: 2106
2019-08-18 23:35:08,966 [da

In [None]:
# api.bpe with 'java-bpe-training_nounicode-5000' (presumably the BPE codes id — confirm);
# options: no_com, max_str_length 14, no_unicode.
description = "bpe 5k Hellendoorn and Devanbu strings no comments, no unicode"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-5000'],
    {"no_com": True, "max_str_length": 14, "no_unicode": True},
)
run(prep_function, description)

In [None]:
# api.bpe with 'java-bpe-training_nounicode-10000' (presumably the BPE codes id — confirm);
# options: no_com, max_str_length 14, no_unicode.
description = "bpe 10k Hellendoorn and Devanbu strings no comments, no unicode"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-10000'],
    {"no_com": True, "max_str_length": 14, "no_unicode": True},
)
run(prep_function, description)

In [None]:
# api.bpe with 'java-bpe-training_nounicode-20000' (presumably the BPE codes id — confirm);
# options: no_com, max_str_length 14, no_unicode.
description = "bpe 20k Hellendoorn and Devanbu strings no comments no unicode"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-20000'],
    {"no_com": True, "max_str_length": 14, "no_unicode": True},
)
run(prep_function, description)

In [None]:
# api.bpe with 'java-bpe-training_nounicode-1000' (presumably the BPE codes id — confirm);
# only "no_unicode" is set.
description = "bpe 1k no unicode"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-1000'],
    {"no_unicode": True},
)
run(prep_function, description)

In [None]:
# api.bpe with 'java-bpe-training_nounicode-2000' (presumably the BPE codes id — confirm);
# only "no_unicode" is set.
description = "bpe 2k, no unicode "
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-2000'],
    {"no_unicode": True},
)
run(prep_function, description)

In [None]:
# api.bpe with 'java-bpe-training_nounicode-5000' (presumably the BPE codes id — confirm);
# only "no_unicode" is set.
description = "bpe 5k, no unicode"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-5000'],
    {"no_unicode": True},
)
run(prep_function, description)

In [None]:
# api.bpe with 'java-bpe-training_nounicode-10000' (presumably the BPE codes id — confirm);
# only "no_unicode" is set.
description = "bpe 10k, no unicode"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-10000'],
    {"no_unicode": True},
)
run(prep_function, description)

In [None]:
# api.bpe with 'java-bpe-training_nounicode-20000' (presumably the BPE codes id — confirm);
# only "no_unicode" is set.
description = "bpe 20k nounicode"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-20000'],
    {"no_unicode": True},
)
run(prep_function, description)

In [7]:
# api.bpe with 'java-bpe-training_nounicode-1000' (presumably the BPE codes id — confirm);
# options: no_com, no_str, no_unicode.
description = "bpe 1k no strings no comments no unicode"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-1000'],
    {"no_com": True, "no_str": True, "no_unicode": True},
)
run(prep_function, description)

bpe 1k no strings no comments

2019-08-16 14:56:36,553 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-minus-small-test_19-08-08T23-17-22_java_-_u009su_java-bpe-training_nounicode-1000/vocab
2019-08-16 14:56:36,555 [dataprep.infrastructure.stages] INFO: Parsing...


KeyboardInterrupt: 

In [None]:
# api.bpe with 'java-bpe-training_nounicode-2000' (presumably the BPE codes id — confirm);
# options: no_com, no_str, no_unicode.
prep_function = PrepFunction(api.bpe, ['java-bpe-training_nounicode-2000'], {"no_com": True, "no_str": True, "no_unicode": True})
description = "bpe 2k no strings no comments no unicode"
run(prep_function, description)

In [None]:
# api.bpe with 'java-bpe-training_nounicode-5000' (presumably the BPE codes id — confirm);
# options: no_com, no_str, no_unicode.
prep_function = PrepFunction(api.bpe, ['java-bpe-training_nounicode-5000'], {"no_com": True, "no_str": True, "no_unicode": True})
description = "bpe 5k no strings no comments no unicode"
run(prep_function, description)

In [None]:
# api.bpe with 'java-bpe-training_nounicode-10000' (presumably the BPE codes id — confirm);
# options: no_com, no_str, no_unicode.
prep_function = PrepFunction(api.bpe, ['java-bpe-training_nounicode-10000'], {"no_com": True, "no_str": True, "no_unicode": True})
description = "bpe 10k no strings no comments no unicode"
run(prep_function, description)

In [None]:
# api.bpe with 'java-bpe-training_nounicode-20000' (presumably the BPE codes id — confirm);
# options: no_com, no_str, no_unicode.
description = "bpe 20k no strings no comments no unicode"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-20000'],
    {"no_com": True, "no_str": True, "no_unicode": True},
)
run(prep_function, description)

In [None]:
# api.bpe with 'java-bpe-training_nounicode-1000' (presumably the BPE codes id — confirm);
# options: no_com, no_unicode.
prep_function = PrepFunction(api.bpe, ['java-bpe-training_nounicode-1000'], {"no_com": True, "no_unicode": True})
description = "bpe 1k no comments no unicode"
run(prep_function, description)

In [None]:
# api.bpe with 'java-bpe-training_nounicode-2000' (presumably the BPE codes id — confirm);
# options: no_com, no_unicode.
prep_function = PrepFunction(api.bpe, ['java-bpe-training_nounicode-2000'], {"no_com": True, "no_unicode": True})
description = "bpe 2k no comments no unicode"
run(prep_function, description)

In [None]:
# api.bpe with 'java-bpe-training_nounicode-5000' (presumably the BPE codes id — confirm);
# options: no_com, no_unicode.
prep_function = PrepFunction(api.bpe, ['java-bpe-training_nounicode-5000'], {"no_com": True, "no_unicode": True})
description = "bpe 5k no comments no unicode"
run(prep_function, description)

In [None]:
# api.bpe with 'java-bpe-training_nounicode-10000' (presumably the BPE codes id — confirm);
# options: no_com, no_unicode.
prep_function = PrepFunction(api.bpe, ['java-bpe-training_nounicode-10000'], {"no_com": True, "no_unicode": True})
description = "bpe 10k no comments no unicode"
run(prep_function, description)

In [None]:
# api.bpe with 'java-bpe-training_nounicode-20000' (presumably the BPE codes id — confirm);
# options: no_com, no_unicode.
prep_function = PrepFunction(api.bpe, ['java-bpe-training_nounicode-20000'], {"no_com": True, "no_unicode": True})
description = "bpe 20k no comments no unicode"
run(prep_function, description)