In [1]:
import dataprep.api.corpus as api
import os
from vocabstudy.common import PrepFunction, calc_and_display_stats, VocabStatsCsvWriter, HEADER, HOME
import keyword
# Python keyword list from the standard-library `keyword` module; it is passed
# to calc_and_display_stats on every run.
PYTHON_KEYWORDS = keyword.kwlist

# Dataset identifiers that every preprocessing configuration is evaluated on.
PYTHON_DATASETS = (
    'rafael/python-minus-test',
    'rafael/python-test',
)

# Single CSV writer shared by all cells: each run() call appends one row to it.
writer = VocabStatsCsvWriter(
    os.path.join(HOME, 'python-stats.csv'),
    HEADER,
)

def run(prep_function: PrepFunction, description: str) -> None:
    """Compute statistics for one preprocessing configuration and record them.

    The configuration is evaluated via ``calc_and_display_stats`` against
    ``PYTHON_DATASETS`` with ``PYTHON_KEYWORDS`` and the ``'py'`` extension,
    and whatever that call returns is written as one line through the
    module-level ``writer``.

    Args:
        prep_function: preprocessing configuration, forwarded unchanged.
        description: human-readable description of the configuration,
            forwarded unchanged.
    """
    stats_row = calc_and_display_stats(
        prep_function,
        description,
        PYTHON_DATASETS,
        PYTHON_KEYWORDS,
        'py',
    )
    writer.write_line(stats_row)

In [2]:
import dataprep

# Bare expression so the notebook displays the installed dataprep version;
# the cache and vocabulary paths in the log output below embed this version.
dataprep.__version__

'1.0.0-alpha.8'

In [3]:
# Configuration 1: no splitting, with only the "no_unicode" option enabled.
prep_function = PrepFunction(
    api.nosplit,
    [],
    {"no_unicode": True},
)
description = "unsplit, with comments and vocabulary. Filtering non-ascii."

run(prep_function, description)

unsplit, with comments and vocabulary. Filtering non-ascii.

2019-08-19 00:26:42,684 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/python-minus-test_19-08-16T13-44-40_py_-_Uc10su/vocab
2019-08-19 00:26:42,686 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-19 00:26:42,688 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/python-minus-test_19-08-16T13-44-40_py_-_Uc10su_-_prep
2019-08-19 00:27:22,697 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/python-test_15-07-11T19-08-23_py_-_Uc10su/vocab
2019-08-19 00:27:22,700 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-19 00:27:22,701 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/python-t

In [4]:
# Configuration 2: no splitting; adds "no_spaces" and "no_com" to "no_unicode".
prep_function = PrepFunction(
    api.nosplit,
    [],
    {
        "no_unicode": True,
        "no_spaces": True,
        "no_com": True,
    },
)
description = "Filtering  comments ( + whitespace, + non-ascii)"

run(prep_function, description)

Filtering  comments ( + whitespace, + non-ascii)

2019-08-19 00:28:09,552 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/python-minus-test_19-08-16T13-44-40_py_-_U0100u/vocab
2019-08-19 00:28:09,553 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-19 00:28:09,554 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/python-minus-test_19-08-16T13-44-40_py_-_U0100u_-_prep
2019-08-19 00:28:48,016 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/python-test_15-07-11T19-08-23_py_-_U0100u/vocab
2019-08-19 00:28:48,018 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-19 00:28:48,018 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/python-test_15-07-1

In [None]:
# Configuration 3: no splitting, comments removed, strings capped in length.
# max_str_length=14 corresponds to the "shorter than 15 char" wording in the
# description below.
prep_function = PrepFunction(
    api.nosplit,
    [],
    {
        "no_com": True,
        "full_strings": True,
        "max_str_length": 14,
    },
)
description = "An additional choice, the model from Hellendoorn and Devanbu: keep strings shorter than 15 char (check SLP-core code), remove others, remove comments"

run(prep_function, description)

An additional choice, the model from Hellendoorn and Devanbu: keep strings shorter than 15 char (check SLP-core code), remove others, remove comments

2019-08-19 00:29:40,522 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/python-minus-test_19-08-16T13-44-40_py_-_u0EFsu/vocab
2019-08-19 00:29:40,523 [dataprep.infrastructure.stages] INFO: Parsing...
2019-08-19 00:29:40,567 [dataprep.infrastructure.stages] INFO: Parsed dataset is up-to-date.
2019-08-19 00:29:40,568 [dataprep.infrastructure.stages] INFO: Preprocessing...
2019-08-19 00:29:40,569 [dataprep.to_repr] INFO: Reading parsed files from: /home/lv71161/hlibbabii/.cache/dataprep/1.0.0-alpha.8/parsed_datasets/python-minus-test_19-08-16T13-44-40_py
2019-08-19 00:29:40,570 [dataprep.to_repr] INFO: Writing preprocessed files to /tmp/scratch/prep-datasets/python-minus-test_19-08-16T13-44-40_py_-_u0EFsu_-_prep


 11%|█         | 143569/1327894 [28:38<5:26:46, 60.40it/s] 

In [None]:
# Configuration 4: api.basic with the same filtering options as the unsplit
# runs (non-ascii, whitespace, comments, long strings).
prep_function = PrepFunction(
    api.basic,
    [],
    {
        "no_unicode": True,
        "no_spaces": True,
        "no_com": True,
        "max_str_length": 14,
    },
)
description = "[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars] + Word splitting via conventions"

run(prep_function, description)

In [None]:
# Configuration 5: api.basic with "no_case", "split_numbers" and "ronin" added
# on top of the filtering options used in the previous configuration.
# NOTE(review): the description says "but keep case" while "no_case": True is
# passed -- confirm which of the two reflects the intended experiment.
prep_function = PrepFunction(
    api.basic,
    [],
    {
        "no_unicode": True,
        "no_spaces": True,
        "no_com": True,
        "max_str_length": 14,
        "no_case": True,
        "split_numbers": True,
        "ronin": True,
    },
)
description = "[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars] + split via conventions, but keep case. + Split numbers + Ronin"

run(prep_function, description)

In [None]:
# BPE run using the 'python-bpe-training_nounicode-2000' argument ("2k" in the
# description); comments and non-ascii removed, strings capped at 14.
prep_function = PrepFunction(
    api.bpe,
    ['python-bpe-training_nounicode-2000'],
    {
        "no_com": True,
        "max_str_length": 14,
        "no_unicode": True,
    },
)
description = "bpe 2k Hellendoorn and Devanbu strings no comments no unicode "

run(prep_function, description)

In [None]:
# BPE run using the 'python-bpe-training_nounicode-5000' argument ("5k" in the
# description); comments and non-ascii removed, strings capped at 14.
prep_function = PrepFunction(
    api.bpe,
    ['python-bpe-training_nounicode-5000'],
    {
        "no_com": True,
        "max_str_length": 14,
        "no_unicode": True,
    },
)
description = "bpe 5k Hellendoorn and Devanbu strings no comments no unicode"

run(prep_function, description)

In [None]:
# BPE run using the 'python-bpe-training_nounicode-10000' argument ("10k" in
# the description); comments and non-ascii removed, strings capped at 14.
prep_function = PrepFunction(
    api.bpe,
    ['python-bpe-training_nounicode-10000'],
    {
        "no_com": True,
        "max_str_length": 14,
        "no_unicode": True,
    },
)
description = "bpe 10k Hellendoorn and Devanbu strings no comments no unicode"

run(prep_function, description)