# %%
import dataprep.api.corpus as api
from vocabstudy.common import PrepFunction, calc_and_display_stats
# Java reserved words, listed alphabetically, followed by the three reserved
# literals. run() hands this set to calc_and_display_stats.
JAVA_KEYWORDS = {
    "abstract", "assert",
    "boolean", "break", "byte",
    "case", "catch", "char", "class", "const", "continue",
    "default", "do", "double",
    "else", "enum", "extends",
    "final", "finally", "float", "for",
    "goto",
    "if", "implements", "import", "instanceof", "int", "interface",
    "long",
    "native", "new",
    "package", "private", "protected", "public",
    "return",
    "short", "static", "strictfp", "super", "switch", "synchronized",
    "this", "throw", "throws", "transient", "try",
    "void", "volatile",
    "while",
    # Literals rather than keywords in the strict sense, but reserved as well.
    "true", "false", "null",
}

# Dataset identifiers that run() passes to calc_and_display_stats.
JAVA_DATASETS = (
    "allamanis/java-minus-small-test",
    "allamanis/java-small-test",
)

def run(prep_function: PrepFunction, description: str) -> None:
    """Run ``calc_and_display_stats`` for one preprocessing setup.

    The two arguments are forwarded unchanged; the remaining arguments are
    always the module-level ``JAVA_DATASETS`` and ``JAVA_KEYWORDS`` and the
    literal ``'java'``.

    Args:
        prep_function: Preprocessing configuration to forward.
        description: Text label for the configuration, forwarded as-is.
    """
    calc_and_display_stats(
        prep_function,
        description,
        JAVA_DATASETS,
        JAVA_KEYWORDS,
        'java',
    )
# %%

# Import the top-level package so its version attribute can be inspected.
import dataprep

# Bare expression: has no visible effect when run as a plain script;
# presumably meant to be echoed as this cell's output in a notebook session.
dataprep.__version__

# %%

# Baseline: api.nosplit with no extra arguments or options.
prep_function = PrepFunction(api.nosplit)
description = "unsplit, with comments and vocabulary. No filtering."

run(prep_function, description)

# %%

# Each of the next four cells keeps the previous options and switches on one
# more flag: no_unicode, then no_spaces, then no_com, then no_str.
prep_function = PrepFunction(
    api.nosplit,
    [],
    {"no_unicode": True},
)
description = "Filtering non-ASCII tokens"

run(prep_function, description)

# %%

prep_function = PrepFunction(
    api.nosplit,
    [],
    {
        "no_unicode": True,
        "no_spaces": True,
    },
)
description = "Filtering whitespace (+ non-ascii)"

run(prep_function, description)

# %%

prep_function = PrepFunction(
    api.nosplit,
    [],
    {
        "no_unicode": True,
        "no_spaces": True,
        "no_com": True,
    },
)
description = "Filtering comments (+ whitespace, + non-ascii)"

run(prep_function, description)

# %%

prep_function = PrepFunction(
    api.nosplit,
    [],
    {
        "no_unicode": True,
        "no_spaces": True,
        "no_com": True,
        "no_str": True,
    },
)
description = "Filtering  strings (+ comments, + whitespace, + non-ascii)"

run(prep_function, description)

# %%

# Separate variant: unlike the cells above, this one does not pass
# no_unicode or no_spaces; it uses no_com, full_strings and max_str_length.
prep_function = PrepFunction(
    api.nosplit,
    [],
    {
        "no_com": True,
        "full_strings": True,
        "max_str_length": 14,
    },
)
description = (
    "An additional choice, the model from Hellendoorn and Devanbu: "
    "keep strings shorter than 15 char (check SLP-core code), "
    "remove others, remove comments"
)

run(prep_function, description)

# %%

# api.basic runs: each cell keeps the previous options and adds one more
# (no_case, then split_numbers, then ronin).
prep_function = PrepFunction(
    api.basic,
    [],
    {
        "no_unicode": True,
        "no_spaces": True,
        "no_com": True,
        "max_str_length": 14,
    },
)
description = (
    "[Unsplit, remove non-ascii, remove whitespace, remove comments, "
    "remove strings longer than 15 chars] + Word splitting via conventions"
)

run(prep_function, description)

# %%

prep_function = PrepFunction(
    api.basic,
    [],
    {
        "no_unicode": True,
        "no_spaces": True,
        "no_com": True,
        "max_str_length": 14,
        "no_case": True,
    },
)
description = (
    "[Unsplit, remove non-ascii, remove whitespace, remove comments, "
    "remove strings longer than 15 chars] + Word splitting via conventions, "
    "and remove case."
)

run(prep_function, description)

# %%

# NOTE(review): the description below says "keep case", yet "no_case": True is
# still passed, exactly as in the previous cell -- confirm which is intended.
prep_function = PrepFunction(
    api.basic,
    [],
    {
        "no_unicode": True,
        "no_spaces": True,
        "no_com": True,
        "max_str_length": 14,
        "no_case": True,
        "split_numbers": True,
    },
)
description = (
    "[Unsplit, remove non-ascii, remove whitespace, remove comments, "
    "remove strings longer than 15 chars] + split via conventions, "
    "but keep case.+ Split numbers"
)
run(prep_function, description)

# %%

# NOTE(review): same "keep case" vs. "no_case": True mismatch as the cell above.
prep_function = PrepFunction(
    api.basic,
    [],
    {
        "no_unicode": True,
        "no_spaces": True,
        "no_com": True,
        "max_str_length": 14,
        "no_case": True,
        "split_numbers": True,
        "ronin": True,
    },
)
description = (
    "[Unsplit, remove non-ascii, remove whitespace, remove comments, "
    "remove strings longer than 15 chars] + split via conventions, "
    "but keep case. + Split numbers + Ronin"
)
run(prep_function, description)

# %% md

## BPE

# %%

# api.bpe with no_com and no_str enabled, once per BPE id
# ('java-bpe-training_nounicode-1000' through '-20000').
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-1000'],
    {"no_com": True, "no_str": True},
)
description = "bpe 1k no strings no comments"
run(prep_function, description)

# %%

prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-2000'],
    {"no_com": True, "no_str": True},
)
description = "bpe 2k no strings no comments"
run(prep_function, description)

# %%

prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-5000'],
    {"no_com": True, "no_str": True},
)
description = "bpe 5k no strings no comments"
run(prep_function, description)

# %%

prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-10000'],
    {"no_com": True, "no_str": True},
)
description = "bpe 10k no strings no comments"
run(prep_function, description)

# %%

prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-20000'],
    {"no_com": True, "no_str": True},
)
description = "bpe 20k no strings no comments"
run(prep_function, description)

# %%

# api.bpe with no_com enabled and max_str_length set to 14, once per BPE id
# ('java-bpe-training_nounicode-1000' through '-20000').
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-1000'],
    {"no_com": True, "max_str_length": 14},
)
description = "bpe 1k Hellendoorn and Devanbu strings no comments"
run(prep_function, description)

# %%

prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-2000'],
    {"no_com": True, "max_str_length": 14},
)
description = "bpe 2k Hellendoorn and Devanbu strings no comments"
run(prep_function, description)

# %%

prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-5000'],
    {"no_com": True, "max_str_length": 14},
)
description = "bpe 5k Hellendoorn and Devanbu strings no comments"
run(prep_function, description)

# %%

prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-10000'],
    {"no_com": True, "max_str_length": 14},
)
description = "bpe 10k Hellendoorn and Devanbu strings no comments"
run(prep_function, description)

# %%

prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-20000'],
    {"no_com": True, "max_str_length": 14},
)
description = "bpe 20k Hellendoorn and Devanbu strings no comments"
run(prep_function, description)

# %%

# api.bpe with only no_com enabled, once per BPE id
# ('java-bpe-training_nounicode-1000' through '-20000').
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-1000'],
    {"no_com": True},
)
description = "bpe 1k no comments"
run(prep_function, description)

# %%

prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-2000'],
    {"no_com": True},
)
description = "bpe 2k no comments"
run(prep_function, description)

# %%

prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-5000'],
    {"no_com": True},
)
description = "bpe 5k no comments"
run(prep_function, description)

# %%

prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-10000'],
    {"no_com": True},
)
description = "bpe 10k no comments"
run(prep_function, description)

# %%

prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-20000'],
    {"no_com": True},
)
description = "bpe 20k no comments"
run(prep_function, description)

# %%

# api.bpe with an empty options dict, once per BPE id
# ('java-bpe-training_nounicode-1000' through '-20000').
prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-1000'],
    {},
)
description = "bpe 1k"
run(prep_function, description)

# %%

prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-2000'],
    {},
)
description = "bpe 2k"
run(prep_function, description)

# %%

prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-5000'],
    {},
)
description = "bpe 5k"
run(prep_function, description)

# %%

prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-10000'],
    {},
)
description = "bpe 10k"
run(prep_function, description)

# %%

prep_function = PrepFunction(
    api.bpe,
    ['java-bpe-training_nounicode-20000'],
    {},
)
description = "bpe 20k"
run(prep_function, description)