In [1]:
import dataprep.api.corpus as api
import os

from vocabstudy.common import PrepFunction, calc_and_display_stats, VocabStatsCsvWriter, HEADER, HOME
# Java reserved words plus the literals true/false/null, grouped by role.
# The set is handed to calc_and_display_stats by run().
JAVA_KEYWORDS = {
    # primitive types and void
    "boolean", "byte", "char", "double", "float", "int", "long", "short", "void",
    # literals
    "true", "false", "null",
    # control flow
    "break", "case", "continue", "default", "do", "else", "for", "goto", "if",
    "return", "switch", "while",
    # assertions and exception handling
    "assert", "catch", "finally", "throw", "throws", "try",
    # declarations and type structure
    "class", "enum", "extends", "implements", "import", "interface", "package",
    # modifiers
    "abstract", "const", "final", "native", "private", "protected", "public",
    "static", "strictfp", "synchronized", "transient", "volatile",
    # object-related keywords
    "instanceof", "new", "super", "this",
}

# Dataset identifiers that run() passes to calc_and_display_stats, in this order.
JAVA_DATASETS = ('allamanis/java-minus-small-test', 'allamanis/java-small-test')

# Shared CSV writer used by run(); the output file lives under HOME.
writer = VocabStatsCsvWriter(
    os.path.join(HOME, 'java-stats.csv'),
    HEADER,
)

def run(prep_function: PrepFunction, description: str) -> None:
    """Compute the stats for one preprocessing setup and write them out.

    The stats row is obtained from ``calc_and_display_stats`` (called with
    JAVA_DATASETS, JAVA_KEYWORDS and the ``'java'`` tag) and passed to the
    module-level ``writer``.

    :param prep_function: preprocessing configuration to evaluate.
    :param description: label forwarded to ``calc_and_display_stats``.
    """
    writer.write_line(
        calc_and_display_stats(prep_function, description, JAVA_DATASETS, JAVA_KEYWORDS, 'java')
    )

In [2]:
import dataprep

# Display the installed dataprep version so the results can be tied to a specific release.
dataprep.__version__

'1.0.0-alpha.8'

## Unsplit corpus

In [3]:
# Baseline: api.nosplit with no extra options.
description = "unsplit, with comments and strings. No filtering."
prep_function = PrepFunction(api.nosplit)
run(prep_function, description)


unsplit, with comments and strings. No filtering.

2019-08-26 12:22:55,429 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-minus-small-test_19-08-08T23-17-22_java_-_uc10su/vocab
2019-08-26 12:22:55,431 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 12:22:55,432 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/java-minus-small-test_19-08-08T23-17-22_java_-_uc10su_-_prep
2019-08-26 12:23:28,538 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-small-test_19-02-09T13-18-23_java_-_uc10su/vocab
2019-08-26 12:23:28,541 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 12:23:28,542 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/

## Filtering

In [4]:
# Unsplit corpus with the no_unicode option switched on.
description = "Filtering non-ASCII tokens"
prep_function = PrepFunction(
    api.nosplit,
    [],
    {"no_unicode": True},
)

run(prep_function, description)

Filtering non-ASCII tokens

2019-08-26 12:23:57,326 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-minus-small-test_19-08-08T23-17-22_java_-_Uc10su/vocab
2019-08-26 12:23:57,327 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 12:23:57,328 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/java-minus-small-test_19-08-08T23-17-22_java_-_Uc10su_-_prep
2019-08-26 12:24:31,352 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-small-test_19-02-09T13-18-23_java_-_Uc10su/vocab
2019-08-26 12:24:31,356 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 12:24:31,357 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/java-small-test_19-02-0

In [5]:
# Adds no_spaces on top of no_unicode.
description = "Filtering whitespace (+ non-ascii)"
prep_function = PrepFunction(
    api.nosplit,
    [],
    {
        "no_unicode": True,
        "no_spaces": True,
    },
)

run(prep_function, description)

Filtering whitespace (+ non-ascii)

2019-08-26 12:24:59,856 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-minus-small-test_19-08-08T23-17-22_java_-_Uc100u/vocab
2019-08-26 12:24:59,857 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 12:24:59,857 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/java-minus-small-test_19-08-08T23-17-22_java_-_Uc100u_-_prep
2019-08-26 12:25:32,371 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-small-test_19-02-09T13-18-23_java_-_Uc100u/vocab
2019-08-26 12:25:32,380 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 12:25:32,381 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/java-small-test

In [6]:
# Adds no_com on top of no_unicode and no_spaces.
description = "Filtering comments (+ whitespace, + non-ascii)"
prep_function = PrepFunction(
    api.nosplit,
    [],
    {
        "no_unicode": True,
        "no_spaces": True,
        "no_com": True,
    },
)

run(prep_function, description)

Filtering comments (+ whitespace, + non-ascii)

2019-08-26 12:25:59,219 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-minus-small-test_19-08-08T23-17-22_java_-_U0100u/vocab
2019-08-26 12:25:59,231 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 12:25:59,232 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/java-minus-small-test_19-08-08T23-17-22_java_-_U0100u_-_prep
2019-08-26 12:26:29,468 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-small-test_19-02-09T13-18-23_java_-_U0100u/vocab
2019-08-26 12:26:29,495 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 12:26:29,496 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/jav

In [7]:
# Adds no_str on top of no_unicode, no_spaces and no_com.
description = "Filtering  strings (+ comments, + whitespace, + non-ascii)"
prep_function = PrepFunction(
    api.nosplit,
    [],
    {
        "no_unicode": True,
        "no_spaces": True,
        "no_com": True,
        "no_str": True,
    },
)

run(prep_function, description)

Filtering  strings (+ comments, + whitespace, + non-ascii)

2019-08-26 12:26:55,624 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-minus-small-test_19-08-08T23-17-22_java_-_U0000u/vocab
2019-08-26 12:26:55,636 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 12:26:55,637 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/java-minus-small-test_19-08-08T23-17-22_java_-_U0000u_-_prep
2019-08-26 12:27:24,894 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-small-test_19-02-09T13-18-23_java_-_U0000u/vocab
2019-08-26 12:27:24,896 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 12:27:24,898 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-

In [8]:
# Unsplit variant that keeps strings (full_strings) but caps them with max_str_length=14.
description = "An additional choice, the model from Hellendoorn and Devanbu: keep strings shorter than 15 char (check SLP-core code), remove others, remove comments  (+ no spaces, + no unicode)  --> Baseline for word splitting"
prep_function = PrepFunction(
    api.nosplit,
    [],
    {
        "no_unicode": True,
        "no_spaces": True,
        "no_com": True,
        "full_strings": True,
        "max_str_length": 14,
    },
)

run(prep_function, description)

An additional choice, the model from Hellendoorn and Devanbu: keep strings shorter than 15 char (check SLP-core code), remove others, remove comments  (+ no spaces, + no unicode)  --> Baseline for word splitting

2019-08-26 12:27:49,308 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-minus-small-test_19-08-08T23-17-22_java_-_U0EF0u/vocab
2019-08-26 12:27:49,309 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 12:27:49,310 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/java-minus-small-test_19-08-08T23-17-22_java_-_U0EF0u_-_prep
2019-08-26 12:28:19,824 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-small-test_19-02-09T13-18-23_java_-_U0EF0u/vocab
2019-08-26 12:28:19,859 [dataprep.infrastructure.stages] IN

## Word splitting

In [9]:
# Switches from api.nosplit to api.basic, keeping the same filtering options.
description = "[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars] + Word splitting via conventions --> Baseline for subword splitting"
prep_function = PrepFunction(
    api.basic,
    [],
    {
        "no_unicode": True,
        "no_spaces": True,
        "no_com": True,
        "max_str_length": 14,
    },
)

run(prep_function, description)

[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars] + Word splitting via conventions --> Baseline for subword splitting

2019-08-26 12:28:47,508 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-minus-small-test_19-08-08T23-17-22_java_-_U0E10u/vocab
2019-08-26 12:28:47,509 [dataprep.infrastructure.stages] INFO: Parsing...


KeyboardInterrupt: 

In [10]:
# api.basic with the no_case option added.
description = "[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars] + Word splitting via conventions, and remove case."
prep_function = PrepFunction(
    api.basic,
    [],
    {
        "no_unicode": True,
        "no_spaces": True,
        "no_com": True,
        "max_str_length": 14,
        "no_case": True,
    },
)

run(prep_function, description)

[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars] + Word splitting via conventions, and remove case.

2019-08-26 12:29:58,400 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-minus-small-test_19-08-08T23-17-22_java_-_U0E10l/vocab
2019-08-26 12:29:58,401 [dataprep.infrastructure.stages] INFO: Parsing...


KeyboardInterrupt: 

## Subword splitting

In [None]:
# api.basic with split_numbers enabled (case is kept: no_case is not set).
description = "[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars + split via conventions, but keep case] + Split numbers"
prep_function = PrepFunction(
    api.basic,
    [],
    {
        "no_unicode": True,
        "no_spaces": True,
        "no_com": True,
        "max_str_length": 14,
        "split_numbers": True,
    },
)
run(prep_function, description)

In [None]:
# api.basic with split_numbers and ronin enabled.
description = "[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars + split via conventions, but keep case] + Split numbers + Ronin"
prep_function = PrepFunction(
    api.basic,
    [],
    {
        "no_unicode": True,
        "no_spaces": True,
        "no_com": True,
        "max_str_length": 14,
        "split_numbers": True,
        "ronin": True,
    },
)
run(prep_function, description)

In [12]:
# api.basic with split_numbers, ronin and stem enabled.
description = "[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars + split via conventions, but keep case] + Split numbers + Ronin + Stemming"
prep_function = PrepFunction(
    api.basic,
    [],
    {
        "no_unicode": True,
        "no_spaces": True,
        "no_com": True,
        "max_str_length": 14,
        "split_numbers": True,
        "ronin": True,
        "stem": True,
    },
)
run(prep_function, description)

[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars + split via conventions, but keep case] + Split numbers + Ronin + Stemming

2019-08-26 12:30:20,963 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-minus-small-test_19-08-08T23-17-22_java_-_U0Es0u/vocab
2019-08-26 12:30:20,972 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 12:30:20,972 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/java-minus-small-test_19-08-08T23-17-22_java_-_U0Es0u_-_prep
2019-08-26 12:30:22,305 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-small-test_19-02-09T13-18-23_java_-_U0Es0u/vocab
2019-08-26 12:30:22,306 [dataprep.infrastructure.stages] INFO: Parsing...
2019-08-26 12:30:24,22

 78%|███████▊  | 6409/8268 [00:33<00:19, 96.39it/s] Process ForkPoolWorker-13:
Process ForkPoolWorker-20:
Process ForkPoolWorker-31:
Process ForkPoolWorker-32:
Process ForkPoolWorker-4:
Process ForkPoolWorker-6:
Process ForkPoolWorker-24:
Process ForkPoolWorker-29:
Process ForkPoolWorker-8:
Process ForkPoolWorker-3:
Process ForkPoolWorker-22:
Process ForkPoolWorker-21:
Process ForkPoolWorker-19:
Process ForkPoolWorker-16:
Process ForkPoolWorker-18:
Process ForkPoolWorker-17:
Process ForkPoolWorker-11:
Process ForkPoolWorker-23:
Process ForkPoolWorker-14:
Process ForkPoolWorker-25:
Process ForkPoolWorker-28:
Process ForkPoolWorker-27:
Process ForkPoolWorker-15:
Process ForkPoolWorker-12:
Process ForkPoolWorker-26:
Process ForkPoolWorker-7:
Process ForkPoolWorker-10:
Process ForkPoolWorker-2:
Process ForkPoolWorker-1:
Process ForkPoolWorker-5:
Process ForkPoolWorker-30:
Process ForkPoolWorker-9:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/lv71161/h

KeyboardInterrupt: 

In [11]:
# Character-level variant: api.chars with the same filtering options as the word-splitting baseline.
description = "Char model (remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars, keep case)"
prep_function = PrepFunction(
    api.chars,
    [],
    {
        "no_unicode": True,
        "no_spaces": True,
        "no_com": True,
        "max_str_length": 14,
    },
)

run(prep_function, description)

Char model (remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars, keep case)

2019-08-26 12:30:18,356 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-minus-small-test_19-08-08T23-17-22_java_-_U0E80u/vocab
2019-08-26 12:30:18,365 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 12:30:18,366 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/java-minus-small-test_19-08-08T23-17-22_java_-_U0E80u_-_prep
2019-08-26 12:30:18,568 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-small-test_19-02-09T13-18-23_java_-_U0E80u/vocab
2019-08-26 12:30:18,575 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 12:30:18,575 [dataprep.api.corpus] IN

## BPE (no unicode, no whitespace)

In [13]:
# BPE run labelled "1k": comments dropped, strings capped with max_str_length=14.
description = "bpe 1k Hellendoorn and Devanbu strings no comments"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-1000'],
    {
        "no_com": True,
        "max_str_length": 14,
        "no_unicode": True,
        "no_spaces": True,
    },
)
run(prep_function, description)

bpe 1k Hellendoorn and Devanbu strings no comments

2019-08-26 12:31:03,182 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-minus-small-test_19-08-08T23-17-22_java_-_U0E90u_java-bpe-training_nounicode-1000/vocab
2019-08-26 12:31:03,183 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 12:31:03,184 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/java-minus-small-test_19-08-08T23-17-22_java_-_U0E90u_java-bpe-training_nounicode-1000_-_prep
2019-08-26 12:31:03,267 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-small-test_19-02-09T13-18-23_java_-_U0E90u_java-bpe-training_nounicode-1000/vocab
2019-08-26 12:31:03,274 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 1

In [14]:
# BPE run labelled "2k": comments dropped, strings capped with max_str_length=14.
description = "bpe 2k Hellendoorn and Devanbu strings no comments"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-2000'],
    {
        "no_com": True,
        "max_str_length": 14,
        "no_unicode": True,
        "no_spaces": True,
    },
)
run(prep_function, description)

bpe 2k Hellendoorn and Devanbu strings no comments

2019-08-26 12:31:10,276 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-minus-small-test_19-08-08T23-17-22_java_-_U0E90u_java-bpe-training_nounicode-2000/vocab
2019-08-26 12:31:10,284 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 12:31:10,285 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/java-minus-small-test_19-08-08T23-17-22_java_-_U0E90u_java-bpe-training_nounicode-2000_-_prep
2019-08-26 12:31:10,421 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-small-test_19-02-09T13-18-23_java_-_U0E90u_java-bpe-training_nounicode-2000/vocab
2019-08-26 12:31:10,440 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 1

 78%|███████▊  | 6409/8268 [00:49<00:19, 96.39it/s]

In [15]:
# BPE run labelled "5k": comments dropped, strings capped with max_str_length=14.
description = "bpe 5k Hellendoorn and Devanbu strings no comments"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-5000'],
    {
        "no_com": True,
        "max_str_length": 14,
        "no_unicode": True,
        "no_spaces": True,
    },
)
run(prep_function, description)

bpe 5k Hellendoorn and Devanbu strings no comments

2019-08-26 12:32:38,301 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-minus-small-test_19-08-08T23-17-22_java_-_U0E90u_java-bpe-training_nounicode-5000/vocab
2019-08-26 12:32:38,316 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 12:32:38,317 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/java-minus-small-test_19-08-08T23-17-22_java_-_U0E90u_java-bpe-training_nounicode-5000_-_prep
2019-08-26 12:32:38,526 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-small-test_19-02-09T13-18-23_java_-_U0E90u_java-bpe-training_nounicode-5000/vocab
2019-08-26 12:32:38,528 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 1

In [16]:
# BPE run labelled "10k": comments dropped, strings capped with max_str_length=14.
description = "bpe 10k Hellendoorn and Devanbu strings no comments"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-10000'],
    {
        "no_com": True,
        "max_str_length": 14,
        "no_unicode": True,
        "no_spaces": True,
    },
)
run(prep_function, description)

bpe 10k Hellendoorn and Devanbu strings no comments

2019-08-26 12:32:45,196 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-minus-small-test_19-08-08T23-17-22_java_-_U0E90u_java-bpe-training_nounicode-10000/vocab
2019-08-26 12:32:45,198 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 12:32:45,199 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/java-minus-small-test_19-08-08T23-17-22_java_-_U0E90u_java-bpe-training_nounicode-10000_-_prep
2019-08-26 12:32:45,360 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-small-test_19-02-09T13-18-23_java_-_U0E90u_java-bpe-training_nounicode-10000/vocab
2019-08-26 12:32:45,377 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-

In [17]:
# BPE run labelled "20k": comments dropped, strings capped with max_str_length=14.
description = "bpe 20k Hellendoorn and Devanbu strings no comments"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-20000'],
    {
        "no_com": True,
        "max_str_length": 14,
        "no_unicode": True,
        "no_spaces": True,
    },
)
run(prep_function, description)

bpe 20k Hellendoorn and Devanbu strings no comments

2019-08-26 12:32:54,178 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-minus-small-test_19-08-08T23-17-22_java_-_U0E90u_java-bpe-training_nounicode-20000/vocab
2019-08-26 12:32:54,204 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 12:32:54,205 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/java-minus-small-test_19-08-08T23-17-22_java_-_U0E90u_java-bpe-training_nounicode-20000_-_prep
2019-08-26 12:32:54,378 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-small-test_19-02-09T13-18-23_java_-_U0E90u_java-bpe-training_nounicode-20000/vocab
2019-08-26 12:32:54,413 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-

In [None]:
# BPE run labelled "1k" with only no_unicode and no_spaces set.
description = "bpe 1k"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-1000'],
    {
        "no_unicode": True,
        'no_spaces': True,
    },
)
run(prep_function, description)

In [None]:
# BPE run labelled "2k" with only no_unicode and no_spaces set.
description = "bpe 2k"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-2000'],
    {
        "no_unicode": True,
        'no_spaces': True,
    },
)
run(prep_function, description)

In [None]:
# BPE run labelled "5k" with only no_unicode and no_spaces set.
description = "bpe 5k"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-5000'],
    {
        "no_unicode": True,
        'no_spaces': True,
    },
)
run(prep_function, description)

In [None]:
# BPE run labelled "10k" with only no_unicode and no_spaces set.
description = "bpe 10k"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-10000'],
    {
        "no_unicode": True,
        'no_spaces': True,
    },
)
run(prep_function, description)

In [None]:
# BPE run labelled "20k" with only no_unicode and no_spaces set.
description = "bpe 20k"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-20000'],
    {
        "no_unicode": True,
        'no_spaces': True,
    },
)
run(prep_function, description)

In [None]:
# BPE run labelled "1k" with both no_com and no_str set.
description = "bpe 1k no strings no comments"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-1000'],
    {
        "no_com": True,
        "no_str": True,
        "no_unicode": True,
        "no_spaces": True,
    },
)
run(prep_function, description)

In [None]:
# BPE run labelled "2k" with both no_com and no_str set.
description = "bpe 2k no strings no comments"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-2000'],
    {
        "no_com": True,
        "no_str": True,
        "no_unicode": True,
        "no_spaces": True,
    },
)
run(prep_function, description)

In [None]:
# BPE run labelled "5k" with both no_com and no_str set.
description = "bpe 5k no strings no comments"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-5000'],
    {
        "no_com": True,
        "no_str": True,
        "no_unicode": True,
        "no_spaces": True,
    },
)
run(prep_function, description)

In [None]:
# BPE run labelled "10k" with both no_com and no_str set.
description = "bpe 10k no strings no comments"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-10000'],
    {
        "no_com": True,
        "no_str": True,
        "no_unicode": True,
        "no_spaces": True,
    },
)
run(prep_function, description)

In [None]:
# BPE run labelled "20k" with both no_com and no_str set.
description = "bpe 20k no strings no comments"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-20000'],
    {
        "no_com": True,
        "no_str": True,
        "no_unicode": True,
        "no_spaces": True,
    },
)
run(prep_function, description)