In [1]:
import dataprep.api.corpus as api
import os
from vocabstudy.common import PrepFunction, calc_and_display_stats, VocabStatsCsvWriter, HEADER, HOME
import keyword
# Keyword list taken from the standard-library `keyword` module.
PYTHON_KEYWORDS = keyword.kwlist

# Dataset identifiers handed to every statistics run in this notebook.
PYTHON_DATASETS = (
    'rafael/python-minus-test',
    'rafael/python-test',
)

# Single shared writer targeting <HOME>/python-stats.csv; `run` sends each
# computed row through it.
writer = VocabStatsCsvWriter(
    os.path.join(HOME, 'python-stats.csv'),
    HEADER,
)

def run(prep_function: PrepFunction, description: str) -> None:
    """Compute statistics for one preprocessing setup and record the result.

    Args:
        prep_function: The preprocessing configuration to evaluate.
        description: Label for this configuration, forwarded unchanged to
            ``calc_and_display_stats``.

    ``calc_and_display_stats`` is invoked with ``PYTHON_DATASETS``,
    ``PYTHON_KEYWORDS`` and the ``'py'`` argument; whatever it returns is
    written as one line through the module-level ``writer``.
    """
    writer.write_line(
        calc_and_display_stats(
            prep_function,
            description,
            PYTHON_DATASETS,
            PYTHON_KEYWORDS,
            'py',
        )
    )

In [2]:
import dataprep

# Display the installed dataprep version so the results below can be tied to it.
dataprep.__version__

'1.0.0-alpha.8'

In [3]:
# Configuration: api.nosplit with only the `no_unicode` option enabled.
description = "unsplit, with comments and vocabulary. Filtering non-ascii."
prep_function = PrepFunction(
    api.nosplit,
    [],
    dict(no_unicode=True),
)

run(prep_function, description)

unsplit, with comments and vocabulary. Filtering non-ascii.

2019-08-21 18:02:19,120 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/python-minus-test_19-08-16T13-44-40_py_-_Uc10su/vocab
2019-08-21 18:02:19,122 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-21 18:02:19,123 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/python-minus-test_19-08-16T13-44-40_py_-_Uc10su_-_prep
2019-08-21 18:03:02,271 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/python-test_15-07-11T19-08-23_py_-_Uc10su/vocab
2019-08-21 18:03:02,275 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-21 18:03:02,276 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/python-t

In [4]:
# Configuration: api.nosplit with `no_unicode`, `no_spaces` and `no_com` enabled.
description = "Filtering  comments ( + whitespace, + non-ascii)"
prep_function = PrepFunction(
    api.nosplit,
    [],
    dict(no_unicode=True, no_spaces=True, no_com=True),
)

run(prep_function, description)

Filtering  comments ( + whitespace, + non-ascii)

2019-08-21 18:03:50,470 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/python-minus-test_19-08-16T13-44-40_py_-_U0100u/vocab
2019-08-21 18:03:50,472 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-21 18:03:50,473 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/python-minus-test_19-08-16T13-44-40_py_-_U0100u_-_prep
2019-08-21 18:04:31,181 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/python-test_15-07-11T19-08-23_py_-_U0100u/vocab
2019-08-21 18:04:31,183 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-21 18:04:31,184 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/python-test_15-07-1

In [5]:
# Configuration: api.nosplit with comments, whitespace and non-ascii filtered,
# plus `full_strings` and a `max_str_length` of 14.
description = "An additional choice, the model from Hellendoorn and Devanbu: keep strings shorter than 15 char (check SLP-core code), remove others, remove comments"
prep_function = PrepFunction(
    api.nosplit,
    [],
    dict(
        no_unicode=True,
        no_spaces=True,
        no_com=True,
        full_strings=True,
        max_str_length=14,
    ),
)

run(prep_function, description)

An additional choice, the model from Hellendoorn and Devanbu: keep strings shorter than 15 char (check SLP-core code), remove others, remove comments

2019-08-21 18:05:17,830 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/python-minus-test_19-08-16T13-44-40_py_-_U0EF0u/vocab
2019-08-21 18:05:17,831 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-21 18:05:17,831 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/python-minus-test_19-08-16T13-44-40_py_-_U0EF0u_-_prep
2019-08-21 18:05:55,694 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/python-test_15-07-11T19-08-23_py_-_U0EF0u/vocab
2019-08-21 18:05:55,696 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-21 18:05:55,697 [datap

In [6]:
# Configuration: api.basic with comments, whitespace and non-ascii filtered and
# a `max_str_length` of 14.
# NOTE(review): the label says "longer than 15 chars" while max_str_length is 14;
# presumably the wording is off by one -- confirm against the library semantics.
description = "[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars] + Word splitting via conventions"
prep_function = PrepFunction(
    api.basic,
    [],
    dict(no_unicode=True, no_spaces=True, no_com=True, max_str_length=14),
)

run(prep_function, description)

[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars] + Word splitting via conventions

2019-08-21 18:06:40,097 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/python-minus-test_19-08-16T13-44-40_py_-_U0E10u/vocab
2019-08-21 18:06:40,098 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-21 18:06:40,098 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/python-minus-test_19-08-16T13-44-40_py_-_U0E10u_-_prep
2019-08-21 18:06:53,110 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/python-test_15-07-11T19-08-23_py_-_U0E10u/vocab
2019-08-21 18:06:53,112 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-21 18:06:53,113 [dataprep.api.corpus] 

In [7]:
# Configuration: api.basic with the same filters as the previous cell, plus
# `no_case`, `split_numbers` and `ronin`.
# NOTE(review): the label below says "but keep case" while `no_case=True` is
# passed -- confirm that the label written to the CSV matches the options used.
description = "[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars] + split via conventions, but keep case. + Split numbers + Ronin"
prep_function = PrepFunction(
    api.basic,
    [],
    dict(
        no_unicode=True,
        no_spaces=True,
        no_com=True,
        max_str_length=14,
        no_case=True,
        split_numbers=True,
        ronin=True,
    ),
)
run(prep_function, description)

[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars] + split via conventions, but keep case. + Split numbers + Ronin

2019-08-21 18:07:07,509 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/python-minus-test_19-08-16T13-44-40_py_-_U0E30l/vocab
2019-08-21 18:07:07,520 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-21 18:07:07,521 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/python-minus-test_19-08-16T13-44-40_py_-_U0E30l_-_prep
2019-08-21 18:07:10,162 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/python-test_15-07-11T19-08-23_py_-_U0E30l/vocab
2019-08-21 18:07:10,164 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-21 18:0

In [8]:
# Configuration: api.bpe using the 'python-bpe-training_nounicode-2000' argument,
# with comments and non-ascii filtered and a `max_str_length` of 14.
description = "bpe 2k Hellendoorn and Devanbu strings no comments no unicode "
prep_function = PrepFunction(
    api.bpe,
    ['python-bpe-training_nounicode-2000'],
    dict(no_com=True, max_str_length=14, no_unicode=True),
)
run(prep_function, description)

bpe 2k Hellendoorn and Devanbu strings no comments no unicode 

2019-08-21 18:07:14,624 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/python-minus-test_19-08-16T13-44-40_py_-_U0E9su_python-bpe-training_nounicode-2000/vocab
2019-08-21 18:07:14,626 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-21 18:07:14,627 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/python-minus-test_19-08-16T13-44-40_py_-_U0E9su_python-bpe-training_nounicode-2000_-_prep
2019-08-21 18:07:14,704 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/python-test_15-07-11T19-08-23_py_-_U0E9su_python-bpe-training_nounicode-2000/vocab
2019-08-21 18:07:14,706 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-21 1

In [9]:
# Configuration: api.bpe using the 'python-bpe-training_nounicode-5000' argument,
# with comments and non-ascii filtered and a `max_str_length` of 14.
description = "bpe 5k Hellendoorn and Devanbu strings no comments no unicode"
prep_function = PrepFunction(
    api.bpe,
    ['python-bpe-training_nounicode-5000'],
    dict(no_com=True, max_str_length=14, no_unicode=True),
)
run(prep_function, description)

bpe 5k Hellendoorn and Devanbu strings no comments no unicode

2019-08-21 18:07:14,888 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/python-minus-test_19-08-16T13-44-40_py_-_U0E9su_python-bpe-training_nounicode-5000/vocab
2019-08-21 18:07:14,911 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-21 18:07:14,912 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/python-minus-test_19-08-16T13-44-40_py_-_U0E9su_python-bpe-training_nounicode-5000_-_prep
2019-08-21 18:07:14,998 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/python-test_15-07-11T19-08-23_py_-_U0E9su_python-bpe-training_nounicode-5000/vocab
2019-08-21 18:07:15,000 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-21 18

In [10]:
# Configuration: api.bpe using the 'python-bpe-training_nounicode-10000' argument,
# with comments and non-ascii filtered and a `max_str_length` of 14.
description = "bpe 10k Hellendoorn and Devanbu strings no comments no unicode"
prep_function = PrepFunction(
    api.bpe,
    ['python-bpe-training_nounicode-10000'],
    dict(no_com=True, max_str_length=14, no_unicode=True),
)
run(prep_function, description)

bpe 10k Hellendoorn and Devanbu strings no comments no unicode

2019-08-21 18:07:15,291 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/python-minus-test_19-08-16T13-44-40_py_-_U0E9su_python-bpe-training_nounicode-10000/vocab
2019-08-21 18:07:15,292 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-21 18:07:15,294 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/python-minus-test_19-08-16T13-44-40_py_-_U0E9su_python-bpe-training_nounicode-10000_-_prep
2019-08-21 18:07:15,458 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/python-test_15-07-11T19-08-23_py_-_U0E9su_python-bpe-training_nounicode-10000/vocab
2019-08-21 18:07:15,460 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-2