In [1]:
import dataprep.api.corpus as api
import os
from vocabstudy.common import PrepFunction, calc_and_display_stats, VocabStatsCsvWriter, HEADER, HOME
# Reserved words of C used as the keyword list for the stats below.
# The 32 entries match the ANSI C (C89) keyword set; later additions such as
# `inline` or `restrict` are not included.
C_KEYWORDS = (
    "auto double int struct break else long "
    "switch case enum register typedef char "
    "extern return union continue for signed "
    "void do if static while default goto "
    "sizeof volatile const float short unsigned"
).split()

# Dataset identifiers handed to calc_and_display_stats, in processing order.
C_DATASETS = ("rafael/c-minus-test", "rafael/c-test")

# Sink for the stats rows: constructed once here with the path HOME/c-stats.csv
# and HEADER, then used by every run() call in the cells below.
# NOTE(review): re-running this cell builds a new writer for the same path;
# whether that truncates or appends is not visible from here - confirm before
# re-executing on top of existing results.
writer = VocabStatsCsvWriter(os.path.join(HOME, 'c-stats.csv'), HEADER)

def run(prep_function: PrepFunction, description: str) -> None:
    """Compute the stats for one preprocessing configuration and record them.

    Delegates to ``calc_and_display_stats`` for the datasets in ``C_DATASETS``
    with ``C_KEYWORDS`` and the ``'c'`` extension, then hands the returned row
    to the module-level ``writer``.

    Args:
        prep_function: Preprocessing configuration, passed through unchanged.
        description: Label for this configuration, passed through unchanged.
    """
    writer.write_line(
        calc_and_display_stats(prep_function, description, C_DATASETS, C_KEYWORDS, 'c')
    )

In [2]:
# Provenance: show the dataprep version this notebook was executed with.
# The bare last expression is what the cell displays.
import dataprep

dataprep.__version__

'1.0.0-alpha.8'

In [3]:
# Provenance: print the name of the machine that produced the results below.
# NOTE(review): the captured output exposes an internal host name; consider
# clearing it before sharing the notebook.
!hostname

l31.vsc3plus.xcat


In [4]:
# Baseline: api.nosplit with no extra arguments or options.
description = "unsplit, with comments and vocabulary. No filtering."
prep_function = PrepFunction(api.nosplit)

run(prep_function, description)

unsplit, with comments and vocabulary. No filtering.

2019-08-26 12:07:19,812 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/c-minus-test_19-08-16T14-29-19_c_-_uc10su/vocab
2019-08-26 12:07:19,814 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 12:07:19,815 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/c-minus-test_19-08-16T14-29-19_c_-_uc10su_-_prep
2019-08-26 12:07:56,309 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/c-test_13-07-12T06-20-12_c_-_uc10su/vocab
2019-08-26 12:07:56,313 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 12:07:56,313 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/c-test_13-07-12T06-20-12_c_-_uc10

In [5]:
# api.nosplit with all four filtering flags enabled. Going by the description,
# these presumably drop non-ASCII tokens, whitespace, comments and strings.
description = "Filtering  strings (+ comments, + whitespace, + non-ascii)"
prep_function = PrepFunction(
    api.nosplit,
    [],
    dict(no_unicode=True, no_spaces=True, no_com=True, no_str=True),
)

run(prep_function, description)

Filtering  strings (+ comments, + whitespace, + non-ascii)

2019-08-26 12:08:31,510 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/c-minus-test_19-08-16T14-29-19_c_-_U0000u/vocab
2019-08-26 12:08:31,512 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 12:08:31,512 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/c-minus-test_19-08-16T14-29-19_c_-_U0000u_-_prep
2019-08-26 12:09:00,094 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/c-test_13-07-12T06-20-12_c_-_U0000u/vocab
2019-08-26 12:09:00,096 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 12:09:00,096 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/c-test_13-07-12T06-20-12_c_

In [6]:
# api.nosplit, comments removed, strings kept whole but capped at 14 chars
# (the description's "shorter than 15 char").
description = "An additional choice, the model from Hellendoorn and Devanbu: keep strings shorter than 15 char (check SLP-core code), remove others, remove comments"
prep_function = PrepFunction(
    api.nosplit,
    [],
    dict(no_com=True, full_strings=True, max_str_length=14),
)

run(prep_function, description)

An additional choice, the model from Hellendoorn and Devanbu: keep strings shorter than 15 char (check SLP-core code), remove others, remove comments

2019-08-26 12:09:28,248 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/c-minus-test_19-08-16T14-29-19_c_-_u0EFsu/vocab
2019-08-26 12:09:28,249 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 12:09:28,250 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/c-minus-test_19-08-16T14-29-19_c_-_u0EFsu_-_prep
2019-08-26 12:10:00,609 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/c-test_13-07-12T06-20-12_c_-_u0EFsu/vocab
2019-08-26 12:10:00,611 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 12:10:00,611 [dataprep.api.corpus] IN

In [7]:
# Switches from api.nosplit to api.basic (the description's "word splitting via
# conventions") on top of the filtering options.
# NOTE(review): the description says "longer than 15 chars" while the limit
# passed is max_str_length=14 - confirm the wording.
description = "[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars] + Word splitting via conventions"
prep_function = PrepFunction(
    api.basic,
    [],
    dict(no_unicode=True, no_spaces=True, no_com=True, max_str_length=14),
)

run(prep_function, description)

[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars] + Word splitting via conventions

2019-08-26 12:10:32,381 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/c-minus-test_19-08-16T14-29-19_c_-_U0E10u/vocab
2019-08-26 12:10:32,382 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 12:10:32,383 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/c-minus-test_19-08-16T14-29-19_c_-_U0E10u_-_prep
2019-08-26 12:10:40,309 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/c-test_13-07-12T06-20-12_c_-_U0E10u/vocab
2019-08-26 12:10:40,311 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 12:10:40,312 [dataprep.api.corpus] INFO: Preprocessed

In [8]:
# Same api.basic configuration as the previous cell plus three more flags:
# no_case, split_numbers and ronin.
# NOTE(review): the description says "but keep case" while no_case=True is
# passed (and the config id in this cell's log ends in "l" where the other
# cells end in "u"). The description is written to the CSV as-is - confirm
# whether the label or the flag is the intended one.
description = "[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars] + split via conventions, but keep case. + Split numbers + Ronin"
prep_function = PrepFunction(
    api.basic,
    [],
    dict(
        no_unicode=True,
        no_spaces=True,
        no_com=True,
        max_str_length=14,
        no_case=True,
        split_numbers=True,
        ronin=True,
    ),
)
run(prep_function, description)

[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars] + split via conventions, but keep case. + Split numbers + Ronin

2019-08-26 12:10:49,763 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/c-minus-test_19-08-16T14-29-19_c_-_U0E30l/vocab
2019-08-26 12:10:49,765 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 12:10:49,766 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/c-minus-test_19-08-16T14-29-19_c_-_U0E30l_-_prep
2019-08-26 12:10:51,302 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/c-test_13-07-12T06-20-12_c_-_U0E30l/vocab
2019-08-26 12:10:51,303 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 12:10:51,303 [dataprep

In [9]:
# api.bpe with the 'c-bpe-training_nounicode-2000' id as its positional
# argument (presumably the 2k-merge BPE codes - TODO confirm); comments and
# non-ASCII removed, strings capped at 14 chars.
description = "bpe 2k Hellendoorn and Devanbu strings no comments no unicode"
prep_function = PrepFunction(
    api.bpe,
    ['c-bpe-training_nounicode-2000'],
    dict(no_com=True, max_str_length=14, no_unicode=True),
)
run(prep_function, description)

bpe 2k Hellendoorn and Devanbu strings no comments no unicode

2019-08-26 12:10:53,350 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/c-minus-test_19-08-16T14-29-19_c_-_U0E9su_c-bpe-training_nounicode-2000/vocab
2019-08-26 12:10:53,351 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 12:10:53,351 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/c-minus-test_19-08-16T14-29-19_c_-_U0E9su_c-bpe-training_nounicode-2000_-_prep
2019-08-26 12:10:53,397 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/c-test_13-07-12T06-20-12_c_-_U0E9su_c-bpe-training_nounicode-2000/vocab
2019-08-26 12:10:53,398 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 12:10:53,398 [dataprep.api.corpus] 

In [10]:
# api.bpe with the 'c-bpe-training_nounicode-5000' id as its positional
# argument (presumably the 5k-merge BPE codes - TODO confirm); options are
# otherwise the same as the 2k cell.
# NOTE(review): the description ends with a trailing space, unlike the 2k
# label; it is kept because the string is written to the CSV.
description = "bpe 5k Hellendoorn and Devanbu strings no comments no unicode "
prep_function = PrepFunction(
    api.bpe,
    ['c-bpe-training_nounicode-5000'],
    dict(no_com=True, max_str_length=14, no_unicode=True),
)
run(prep_function, description)

bpe 5k Hellendoorn and Devanbu strings no comments no unicode 

2019-08-26 12:10:53,528 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/c-minus-test_19-08-16T14-29-19_c_-_U0E9su_c-bpe-training_nounicode-5000/vocab
2019-08-26 12:10:53,528 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 12:10:53,529 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/c-minus-test_19-08-16T14-29-19_c_-_U0E9su_c-bpe-training_nounicode-5000_-_prep
2019-08-26 12:10:53,588 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/c-test_13-07-12T06-20-12_c_-_U0E9su_c-bpe-training_nounicode-5000/vocab
2019-08-26 12:10:53,589 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 12:10:53,589 [dataprep.api.corpus]

In [11]:
# api.bpe with the 'c-bpe-training_nounicode-10000' id as its positional
# argument (presumably the 10k-merge BPE codes - TODO confirm); options are
# otherwise the same as the 2k cell.
# NOTE(review): the description ends with a trailing space, unlike the 2k
# label; it is kept because the string is written to the CSV.
description = "bpe 10k Hellendoorn and Devanbu strings no comments no unicode "
prep_function = PrepFunction(
    api.bpe,
    ['c-bpe-training_nounicode-10000'],
    dict(no_com=True, max_str_length=14, no_unicode=True),
)
run(prep_function, description)

bpe 10k Hellendoorn and Devanbu strings no comments no unicode 

2019-08-26 12:10:53,778 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/c-minus-test_19-08-16T14-29-19_c_-_U0E9su_c-bpe-training_nounicode-10000/vocab
2019-08-26 12:10:53,779 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 12:10:53,780 [dataprep.api.corpus] INFO: Preprocessed dataset is ready at /tmp/scratch/prep-datasets/c-minus-test_19-08-16T14-29-19_c_-_U0E9su_c-bpe-training_nounicode-10000_-_prep
2019-08-26 12:10:53,849 [dataprep.infrastructure.stages] INFO: Checking first if vocabulary file exists: /home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha.8/vocab/c-test_13-07-12T06-20-12_c_-_U0E9su_c-bpe-training_nounicode-10000/vocab
2019-08-26 12:10:53,849 [dataprep.infrastructure.stages] INFO: Vocabulary is already computed and up-to-date
2019-08-26 12:10:53,850 [dataprep.api.cor