In [2]:
import dataprep.api.corpus as api
from vocabstudy.common import PrepFunction, calc_and_display_stats
# Java reserved words plus the three literals, grouped by role for readability.
JAVA_KEYWORDS = {
    # primitive types and void
    "boolean", "byte", "char", "double", "float", "int", "long", "short", "void",
    # type / compilation-unit declarations
    "class", "enum", "interface", "extends", "implements", "import", "package",
    # modifiers
    "abstract", "final", "native", "private", "protected", "public", "static",
    "strictfp", "synchronized", "transient", "volatile",
    # control flow
    "break", "case", "continue", "default", "do", "else", "for", "if", "return",
    "switch", "while",
    # assertions and exception handling
    "assert", "catch", "finally", "throw", "throws", "try",
    # object-related operators and references
    "instanceof", "new", "super", "this",
    # reserved but unused
    "const", "goto",
    # literals
    "true", "false", "null",
}

# Dataset identifiers that run() hands to calc_and_display_stats, in this order.
JAVA_DATASETS = (
    "allamanis/java-minus-small-test",
    "allamanis/java-small-test",
)

def run(prep_function: PrepFunction, description: str) -> None:
    """Forward a preprocessing configuration to ``calc_and_display_stats``.

    The call is pinned to this notebook's Java setup: ``JAVA_DATASETS``,
    ``JAVA_KEYWORDS`` and the language tag ``'java'``.

    :param prep_function: wrapped preprocessing function, passed through unchanged.
    :param description: text passed through unchanged (presumably the label
        shown with the stats -- confirm in ``vocabstudy.common``).
    """
    calc_and_display_stats(
        prep_function,
        description,
        JAVA_DATASETS,
        JAVA_KEYWORDS,
        'java',
    )

In [2]:
import dataprep

# Show the installed dataprep version as the cell output, so the recorded
# results can be tied to a specific library version.
# NOTE(review): this import lives outside the first (imports) cell -- consider moving it there.
dataprep.__version__

'1.0.0-alpha.8'

In [3]:
# api.nosplit with no extra arguments or options.
description = "unsplit, with comments and vocabulary. No filtering."
prep_function = PrepFunction(api.nosplit)

run(prep_function, description)

unsplit, with comments and vocabulary. No filtering.

2019-08-16 12:00:41,096 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-minus-small-test_19-08-08T23-17-22_java_-_uc10su/vocab
2019-08-16 12:00:41,117 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-16 12:00:41,118 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /home/lv71161/hlibbabii/prep-datasets/java-minus-small-test_19-08-08T23-17-22_java_-_uc10su_-_prep
2019-08-16 12:01:18,063 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-small-test_19-02-09T13-18-23_java_-_uc10su/vocab
2019-08-16 12:01:18,089 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-16 12:01:18,090 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /home/lv71161

In [5]:
# api.nosplit with the "no_unicode" option switched on.
description = "Filtering non-ASCII tokens"
prep_function = PrepFunction(
    api.nosplit,
    [],
    {"no_unicode": True},
)

run(prep_function, description)

Filtering non-ASCII tokens

2019-08-14 11:04:30,149 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-minus-small-test_19-08-08T23-17-22_java_-_Uc10su/vocab
2019-08-14 11:04:30,177 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-14 11:04:30,179 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /home/lv71161/hlibbabii/prep-datasets/java-minus-small-test_19-08-08T23-17-22_java_-_Uc10su_-_prep
2019-08-14 11:05:03,034 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-small-test_19-02-09T13-18-23_java_-_Uc10su/vocab
2019-08-14 11:05:03,037 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-14 11:05:03,037 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /home/lv71161/hlibbabii/prep-datasets/j

In [6]:
# api.nosplit with "no_unicode" and "no_spaces" switched on.
description = "Filtering whitespace (+ non-ascii)"
prep_function = PrepFunction(
    api.nosplit,
    [],
    {
        "no_unicode": True,
        "no_spaces": True,
    },
)

run(prep_function, description)

Filtering whitespace (+ non-ascii)

2019-08-14 11:07:51,950 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-minus-small-test_19-08-08T23-17-22_java_-_Uc100u/vocab
2019-08-14 11:07:51,972 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-14 11:07:51,972 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /home/lv71161/hlibbabii/prep-datasets/java-minus-small-test_19-08-08T23-17-22_java_-_Uc100u_-_prep
2019-08-14 11:08:24,462 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-small-test_19-02-09T13-18-23_java_-_Uc100u/vocab
2019-08-14 11:08:24,494 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-14 11:08:24,495 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /home/lv71161/hlibbabii/prep-da

In [7]:
# api.nosplit with "no_unicode", "no_spaces" and "no_com" switched on.
description = "Filtering comments (+ whitespace, + non-ascii)"
prep_function = PrepFunction(
    api.nosplit,
    [],
    {
        "no_unicode": True,
        "no_spaces": True,
        "no_com": True,
    },
)

run(prep_function, description)

Filtering comments (+ whitespace, + non-ascii)

2019-08-14 11:09:04,071 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-minus-small-test_19-08-08T23-17-22_java_-_U0100u/vocab
2019-08-14 11:09:04,159 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-14 11:09:04,160 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /home/lv71161/hlibbabii/prep-datasets/java-minus-small-test_19-08-08T23-17-22_java_-_U0100u_-_prep
2019-08-14 11:09:38,090 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-small-test_19-02-09T13-18-23_java_-_U0100u/vocab
2019-08-14 11:09:38,245 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-14 11:09:38,246 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /home/lv71161/hlibb

In [4]:
# api.nosplit with "no_unicode", "no_spaces", "no_com" and "no_str" switched on.
description = "Filtering  strings (+ comments, + whitespace, + non-ascii)"
prep_function = PrepFunction(
    api.nosplit,
    [],
    {
        "no_unicode": True,
        "no_spaces": True,
        "no_com": True,
        "no_str": True,
    },
)

run(prep_function, description)

Filtering  strings (+ comments, + whitespace, + non-ascii)

2019-08-16 12:01:58,564 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-minus-small-test_19-08-08T23-17-22_java_-_U0000u/vocab
2019-08-16 12:01:58,621 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-16 12:01:58,622 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /home/lv71161/hlibbabii/prep-datasets/java-minus-small-test_19-08-08T23-17-22_java_-_U0000u_-_prep
2019-08-16 12:02:23,453 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-small-test_19-02-09T13-18-23_java_-_U0000u/vocab
2019-08-16 12:02:23,523 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-16 12:02:23,524 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /home/l

In [9]:
# api.nosplit with "no_com", "full_strings" and a "max_str_length" of 14.
description = "An additional choice, the model from Hellendoorn and Devanbu: keep strings shorter than 15 char (check SLP-core code), remove others, remove comments"
prep_function = PrepFunction(
    api.nosplit,
    [],
    {
        "no_com": True,
        "full_strings": True,
        "max_str_length": 14,
    },
)

run(prep_function, description)

An additional choice, the model from Hellendoorn and Devanbu: keep strings shorter than 15 char (check SLP-core code), remove others, remove comments

2019-08-14 11:11:13,725 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-minus-small-test_19-08-08T23-17-22_java_-_u0EFsu/vocab
2019-08-14 11:11:13,738 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-14 11:11:13,738 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /home/lv71161/hlibbabii/prep-datasets/java-minus-small-test_19-08-08T23-17-22_java_-_u0EFsu_-_prep
2019-08-14 11:11:46,076 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-small-test_19-02-09T13-18-23_java_-_u0EFsu/vocab
2019-08-14 11:11:46,093 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2

In [10]:
# api.basic with "no_unicode", "no_spaces", "no_com" and a "max_str_length" of 14.
description = "[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars] + Word splitting via conventions"
prep_function = PrepFunction(
    api.basic,
    [],
    {
        "no_unicode": True,
        "no_spaces": True,
        "no_com": True,
        "max_str_length": 14,
    },
)

run(prep_function, description)

[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars] + Word splitting via conventions

2019-08-14 11:12:19,725 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-minus-small-test_19-08-08T23-17-22_java_-_U0E10u/vocab
2019-08-14 11:12:19,727 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-14 11:12:19,729 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /home/lv71161/hlibbabii/prep-datasets/java-minus-small-test_19-08-08T23-17-22_java_-_U0E10u_-_prep
2019-08-14 11:12:25,575 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-small-test_19-02-09T13-18-23_java_-_U0E10u/vocab
2019-08-14 11:12:25,592 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-14 11:12:

In [3]:
# api.basic with the filtering options plus "no_case".
description = "[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars] + Word splitting via conventions, and remove case."
prep_function = PrepFunction(
    api.basic,
    [],
    {
        "no_unicode": True,
        "no_spaces": True,
        "no_com": True,
        "max_str_length": 14,
        "no_case": True,
    },
)

run(prep_function, description)

[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars] + Word splitting via conventions, and remove case.

2019-08-16 14:54:02,956 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-minus-small-test_19-08-08T23-17-22_java_-_U0E10l/vocab
2019-08-16 14:54:02,967 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-16 14:54:02,967 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /home/lv71161/hlibbabii/prep-datasets/java-minus-small-test_19-08-08T23-17-22_java_-_U0E10l_-_prep
2019-08-16 14:54:08,592 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-small-test_19-02-09T13-18-23_java_-_U0E10l/vocab
2019-08-16 14:54:08,602 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date

In [4]:
# api.basic with the filtering options, "no_case" and "split_numbers".
# "no_case" is enabled here, so the label says "remove case", in line with the
# cell that first introduces "no_case".
# NOTE(review): if keeping case was the real intent, drop "no_case" from the
# options instead of relabelling.
prep_function = PrepFunction(api.basic, [],
                             {"no_unicode": True, "no_spaces": True, "no_com": True, "max_str_length": 14,
                              "no_case": True, "split_numbers": True})
description = "[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars] + split via conventions, and remove case. + Split numbers"
run(prep_function, description)

[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars] + split via conventions, but keep case.+ Split numbers

2019-08-16 14:54:12,001 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-minus-small-test_19-08-08T23-17-22_java_-_U0E20l/vocab
2019-08-16 14:54:12,025 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-16 14:54:12,026 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /home/lv71161/hlibbabii/prep-datasets/java-minus-small-test_19-08-08T23-17-22_java_-_U0E20l_-_prep
2019-08-16 14:54:14,775 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-small-test_19-02-09T13-18-23_java_-_U0E20l/vocab
2019-08-16 14:54:14,798 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-

In [6]:
# api.basic with the filtering options, "no_case", "split_numbers" and "ronin".
# "no_case" is enabled here, so the label says "remove case", in line with the
# cell that first introduces "no_case".
# NOTE(review): if keeping case was the real intent, drop "no_case" from the
# options instead of relabelling.
prep_function = PrepFunction(api.basic, [],
                             {"no_unicode": True, "no_spaces": True, "no_com": True, "max_str_length": 14,
                              "no_case": True, "split_numbers": True, "ronin": True})
description = "[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars] + split via conventions, and remove case. + Split numbers + Ronin"
run(prep_function, description)

[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars] + split via conventions, but keep case. + Split numbers + Ronin

2019-08-16 14:56:11,400 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-minus-small-test_19-08-08T23-17-22_java_-_U0E30l/vocab
2019-08-16 14:56:11,401 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-16 14:56:11,401 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /home/lv71161/hlibbabii/prep-datasets/java-minus-small-test_19-08-08T23-17-22_java_-_U0E30l_-_prep
2019-08-16 14:56:12,655 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-small-test_19-02-09T13-18-23_java_-_U0E30l/vocab
2019-08-16 14:56:12,657 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed a

## BPE

In [None]:
# api.bpe with the '...nounicode-2000' argument; options: "no_com", "max_str_length" = 14.
description = "bpe 2k Hellendoorn and Devanbu strings no comments"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-2000'],
    {"no_com": True, "max_str_length": 14},
)
run(prep_function, description)

In [None]:
# api.bpe with the '...nounicode-5000' argument; options: "no_com", "max_str_length" = 14.
description = "bpe 5k Hellendoorn and Devanbu strings no comments"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-5000'],
    {"no_com": True, "max_str_length": 14},
)
run(prep_function, description)

In [None]:
# api.bpe with the '...nounicode-10000' argument; options: "no_com", "max_str_length" = 14.
description = "bpe 10k Hellendoorn and Devanbu strings no comments"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-10000'],
    {"no_com": True, "max_str_length": 14},
)
run(prep_function, description)

In [None]:
# api.bpe with the '...nounicode-1000' argument; options: "no_com", "max_str_length" = 14.
description = "bpe 1k Hellendoorn and Devanbu strings no comments"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-1000'],
    {"no_com": True, "max_str_length": 14},
)
run(prep_function, description)

In [None]:
# api.bpe with the '...nounicode-20000' argument; options: "no_com", "max_str_length" = 14.
description = "bpe 20k Hellendoorn and Devanbu strings no comments"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-20000'],
    {"no_com": True, "max_str_length": 14},
)
run(prep_function, description)

In [None]:
# api.bpe with the '...nounicode-2000' argument and an empty options dict.
description = "bpe 2k"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-2000'],
    {},
)
run(prep_function, description)

In [None]:
# api.bpe with the '...nounicode-5000' argument and an empty options dict.
description = "bpe 5k"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-5000'],
    {},
)
run(prep_function, description)

In [None]:
# api.bpe with the '...nounicode-10000' argument and an empty options dict.
description = "bpe 10k"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-10000'],
    {},
)
run(prep_function, description)

In [None]:
# api.bpe with the '...nounicode-1000' argument and an empty options dict.
description = "bpe 1k"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-1000'],
    {},
)
run(prep_function, description)

In [None]:
# api.bpe with the '...nounicode-20000' argument and an empty options dict.
description = "bpe 20k"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-20000'],
    {},
)
run(prep_function, description)

In [7]:
# api.bpe with the '...nounicode-1000' argument; options: "no_com", "no_str".
description = "bpe 1k no strings no comments"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-1000'],
    {"no_com": True, "no_str": True},
)
run(prep_function, description)

bpe 1k no strings no comments

2019-08-16 14:56:36,553 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/java-minus-small-test_19-08-08T23-17-22_java_-_u009su_java-bpe-training_nounicode-1000/vocab
2019-08-16 14:56:36,555 [dataprep.infrastructure.stages] INFO: Parsing...


KeyboardInterrupt: 

In [None]:
# api.bpe with the '...nounicode-2000' argument; options: "no_com", "no_str".
description = "bpe 2k no strings no comments"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-2000'],
    {"no_com": True, "no_str": True},
)
run(prep_function, description)

In [None]:
# api.bpe with the '...nounicode-5000' argument; options: "no_com", "no_str".
description = "bpe 5k no strings no comments"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-5000'],
    {"no_com": True, "no_str": True},
)
run(prep_function, description)

In [None]:
# api.bpe with the '...nounicode-10000' argument; options: "no_com", "no_str".
description = "bpe 10k no strings no comments"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-10000'],
    {"no_com": True, "no_str": True},
)
run(prep_function, description)

In [None]:
# api.bpe with the '...nounicode-20000' argument; options: "no_com", "no_str".
description = "bpe 20k no strings no comments"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-20000'],
    {"no_com": True, "no_str": True},
)
run(prep_function, description)

In [None]:
# api.bpe with the '...nounicode-1000' argument; only "no_com" is set.
description = "bpe 1k no comments"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-1000'],
    {"no_com": True},
)
run(prep_function, description)

In [None]:
# api.bpe with the '...nounicode-2000' argument; only "no_com" is set.
description = "bpe 2k no comments"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-2000'],
    {"no_com": True},
)
run(prep_function, description)

In [None]:
# api.bpe with the '...nounicode-5000' argument; only "no_com" is set.
description = "bpe 5k no comments"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-5000'],
    {"no_com": True},
)
run(prep_function, description)

In [None]:
# api.bpe with the '...nounicode-10000' argument; only "no_com" is set.
description = "bpe 10k no comments"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-10000'],
    {"no_com": True},
)
run(prep_function, description)

In [None]:
# api.bpe with the '...nounicode-20000' argument; only "no_com" is set.
description = "bpe 20k no comments"
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-20000'],
    {"no_com": True},
)
run(prep_function, description)