In [1]:
%load_ext autoreload
%autoreload 2

In [4]:
import sys
sys.path.append('/home/lv71161/hlibbabii/projects/log-recommender-dataprep/')

import os

import dataprep.api.corpus as api
import dataprep.api.text as text_api
from metrics.vector import get_new_vocab
from dataclasses import dataclass, field
from typing import Optional, Callable, Tuple, Union, Any, List, Dict

from dataprep.api.corpus import PreprocessedCorpus
import logging
import regex
import pandas as pd

# Pattern for a Java identifier; the POSIX classes need the third-party
# ``regex`` module.  The dollar sign is escaped on purpose: a bare ``$`` is the
# end-of-string anchor, which let the empty string match in full and rejected
# names such as ``a$b`` although ``$`` is a legal Java identifier character.
IDENTIFIER_REGEX = r"([[:lower:]]|[[:upper:]]|\$|_)([[:lower:]]|[[:upper:]]|[0-9]|_|\$)*"

# Java reserved words and literals; is_identifier() never reports these as
# identifiers even though they match IDENTIFIER_REGEX.
KEYWORDS = {
    "abstract", "assert", "boolean", "break", "byte", "case", "catch", "char",
    "class", "const", "continue", "default", "do", "double", "else", "enum", "extends", "final", "finally",
    "float", "for", "goto", "if", "implements", "import", "instanceof", "int", "interface", "long", "native",
    "new", "package", "private", "protected", "public", "return", "short", "static", "strictfp", "super",
    "switch", "synchronized", "this", "throw", "throws", "transient", "try", "void", "volatile", "while",
    "true", "false", "null",
}

def is_identifier(word: str) -> bool:
    """Tell whether ``word`` counts as an identifier.

    A word qualifies when it is not listed in ``KEYWORDS`` and
    ``IDENTIFIER_REGEX`` matches it in full.
    """
    if word in KEYWORDS:
        return False
    return regex.fullmatch(IDENTIFIER_REGEX, word) is not None

def without_non_identifiers(dct: Dict[str, int]) -> Dict[str, int]:
    """Return a new dict with only the entries of ``dct`` whose key is an identifier.

    Keys are tested with ``is_identifier``; values and entry order are carried
    over unchanged and ``dct`` itself is left untouched.
    """
    return {token: count for token, count in dct.items() if is_identifier(token)}

#logging.disable(logging.WARNING)

@dataclass(frozen=True)
class Corpus:
    """Immutable pair of a corpus path and the file extensions to consider."""

    path: str
    # Either a single extension ("py") or several joined by "|" ("java|c|py").
    extensions: str
        

# A preprocessing entry point: accepts arbitrary arguments and returns a
# PreprocessedCorpus.
PrepCallable = Callable[..., PreprocessedCorpus]
# A preprocessing call whose parameters are already fixed, so that only the
# Corpus remains to be supplied.
ParametrizedPrepCallable = Callable[[Corpus], PreprocessedCorpus]
        
@dataclass(frozen=True)
class PrepFunction:
    """A preprocessing callable bundled with the arguments it is invoked with."""

    callable: PrepCallable
    # Extra positional arguments, passed right after the corpus path.
    params: List = field(default_factory=list)
    # Keyword arguments forwarded on every invocation.
    options: Dict[str, Any] = field(default_factory=dict)

    @property
    def apply(self) -> ParametrizedPrepCallable:
        """Build a function that runs ``callable`` on a ``Corpus``.

        The returned function passes the corpus path first, then ``params``,
        then ``options`` and any keyword arguments given at call time, and
        always adds ``calc_vocab=True`` and ``extensions=corpus.extensions``.
        """
        def prep_corpus(corpus: Corpus, **kwargs):
            return self.callable(
                corpus.path,
                *self.params,
                **self.options,
                **kwargs,
                calc_vocab=True,
                extensions=corpus.extensions,
            )

        return prep_corpus

# Base directory; raw datasets are read from below it and prepped datasets
# are written to its 'prep-datasets' subdirectory (see calc_and_display_stats).
HOME='/home/lv71161/hlibbabii'
# Directory that the entries of `datasets` are resolved against.
PATH_TO_DATASETS=os.path.join(HOME, 'raw_datasets')
# Dataset paths relative to PATH_TO_DATASETS. Order matters:
# calc_and_display_stats unpacks the resulting vocabularies as (train, test).
datasets = [
    'allamanis/java-minus-small-test',
            'allamanis/java-small-test'
           ]


# Java snippet that calc_and_display_stats preprocesses and prints as the
# "Split example" for every configuration.
sample='''void test_WordUeberraschungPrinter() {
    if (eps >= 0.345e+4) { // FIXME 10L
        printWord("     ...     Überraschung 0x12");
    }
}'''


def print_oov_stats(train_vocab, test_vocab):
    """Print out-of-vocabulary statistics of ``test_vocab`` for several train-vocab cut-offs.

    For every cut-off ``top`` the first ``top`` entries of ``train_vocab`` are
    handed to ``get_new_vocab`` together with all entries of ``test_vocab``.
    The number of returned entries and the sum of their counts are printed,
    both also as a percentage of the test vocab size and the test token total.

    Args:
        train_vocab: mapping from token to count.
            NOTE(review): the cut-offs are only meaningful if the mapping is
            ordered from most to least frequent - confirm against load_vocab().
        test_vocab: mapping from token to count.
    """
    tops = [sys.maxsize, 200000, 100000, 75000, 50000, 25000]

    # These values do not depend on the cut-off, so compute them only once.
    train_items = list(train_vocab.items())
    test_vocab_size = len(test_vocab)
    test_tokens_number = sum(test_vocab.values())

    def percent(part, whole):
        # An empty test vocab would otherwise raise ZeroDivisionError.
        return 100.0 * part / whole if whole else 0.0

    for top in tops:
        # A fresh test list per call, in case get_new_vocab modifies its arguments.
        oov_in_test = get_new_vocab(train_items[:top], list(test_vocab.items()))
        oov_tokens_number = sum(entry[1] for entry in oov_in_test)
        oov_vocab_percent = percent(len(oov_in_test), test_vocab_size)
        oov_tokens_percent = percent(oov_tokens_number, test_tokens_number)
        print(f"OOV in test set (top {top}): {len(oov_in_test)} ({oov_vocab_percent:.2f}%), oov tokens number: {oov_tokens_number} ({oov_tokens_percent:.2f}%)")


def chunks(a, n):
    """Lazily split ``a`` into ``n`` consecutive slices of near-equal length.

    Slice lengths differ by at most one; the first ``len(a) % n`` slices are
    the longer ones. A generator over the slices is returned.
    """
    base, extra = divmod(len(a), n)

    def bounds(idx):
        # Every slice before ``idx`` contributed ``base`` items, plus one more
        # for each of the leading ``extra`` slices.
        start = idx * base + min(idx, extra)
        stop = start + base + (1 if idx < extra else 0)
        return start, stop

    return (a[start:stop] for start, stop in map(bounds, range(n)))
        

def percentiles(vocab: Dict[str, int]) -> Tuple[List[float], List[int]]:
    """Summarise the counts of ``vocab`` in 100 consecutive buckets.

    The values of ``vocab`` are split, in iteration order, into 100 buckets
    with ``chunks``. For each bucket the mean and the middle element (index
    ``len(bucket) // 2``) are reported.
    NOTE(review): the middle element is a median only if the values are
    already sorted - confirm that callers pass a frequency-ordered vocab.

    Args:
        vocab: mapping from token to count.

    Returns:
        ``(avg, med)``, two lists of length 100. Buckets that received no
        element (a vocab with fewer than 100 entries) yield ``nan`` in both
        lists instead of raising ``IndexError``.
    """
    N_BUCKETS = 100
    # ``chunks`` returns a one-shot generator, so materialise it once and
    # reuse the buckets for both statistics.
    buckets = list(chunks(list(vocab.values()), N_BUCKETS))
    nan = float("nan")
    avg = [pd.Series(bucket).mean() if bucket else nan for bucket in buckets]
    med = [bucket[len(bucket) // 2] if bucket else nan for bucket in buckets]
    return avg, med


def which_bucket(what: int, buckets: List[int]):
    """Return the index of the bucket that ``what`` falls into.

    ``buckets`` holds the lower bound of each bucket. Index ``i`` is returned
    when ``buckets[i] <= what`` and ``what`` is below the next bound; the last
    bucket has no upper limit.

    Raises:
        ValueError: if no bucket accepts ``what``.
    """
    last = len(buckets) - 1
    for idx, lower in enumerate(buckets):
        if not what >= lower:
            continue
        if idx == last or what < buckets[idx + 1]:
            return idx
    raise ValueError(f'Invalid value: {what}, buckets: {buckets}')

def frequencies(vocab: Dict[str, int], buckets: List[int]) -> List[int]:
    """Count how many entries of ``vocab`` fall into each bucket.

    Args:
        vocab: mapping from token to count.
        buckets: lower bounds of the buckets, in the form ``which_bucket``
            expects.

    Returns:
        A list as long as ``buckets``; position ``i`` holds the number of
        vocab entries whose count ``which_bucket`` assigns to bucket ``i``.

    Raises:
        ValueError: propagated from ``which_bucket`` when a count fits no bucket.
    """
    counts = [0] * len(buckets)
    # Only the counts matter here; the tokens themselves are not needed.
    for count in vocab.values():
        counts[which_bucket(count, buckets)] += 1
    return counts
    

def plot_percentiles(vocab: Dict[str, int]) -> None:
    """Print and plot the per-bucket statistics returned by ``percentiles``.

    The averages are handled first, then the medians. Each list is printed
    under its heading and shown as a plotly bar chart with a logarithmic
    y axis.
    """
    import plotly.graph_objects as go

    avg, med = percentiles(vocab)
    for heading, per_bucket in (("Average:", avg), ("Median:", med)):
        print(heading)
        print(per_bucket)
        frame = pd.DataFrame({'percentile': list(range(100)), 'n_tokens': per_bucket})

        figure = go.Figure(data=[go.Bar(x=frame.percentile, y=frame.n_tokens)])
        figure.update_layout(yaxis_type="log")
        figure.show()

    
def get_corpus_size(vocab: Dict[str, int]) -> int:
    """Return the total of all counts stored in ``vocab``."""
    total = 0
    for count in vocab.values():
        total += count
    return total

import shutil
        
def calc_and_display_stats(prep_function, description) -> Tuple[int, int]:
    """Preprocess every dataset with ``prep_function`` and print vocabulary statistics.

    Steps, in order:
      1. Each entry of ``datasets`` is preprocessed as a Java corpus into
         ``HOME/prep-datasets`` and its vocabulary is loaded.
      2. The prepped output of the first dataset is deleted from disk.
      3. The text-API function with the same name as ``prep_function.callable``
         is applied to ``sample`` and the result is printed.
      4. Vocabulary sizes, corpus sizes, OOV statistics (for all tokens and
         for identifiers only) and the bucketed frequency counts of the train
         vocab are printed.

    Args:
        prep_function: a ``PrepFunction`` describing the preprocessing to run.
        description: free-text heading printed before the statistics.

    Returns:
        The placeholder pair ``(0, 0)``; it is kept because an existing caller
        unpacks two values.
    """
    print(f"{description}\n")

    vocabs = []
    prep_corpora = []
    for dataset in datasets:
        dataset_path = os.path.join(PATH_TO_DATASETS, dataset)
        prep_corpus = prep_function.apply(Corpus(dataset_path, 'java'), output_path=os.path.join(HOME, 'prep-datasets'))
        prep_corpora.append(prep_corpus)
        vocabs.append(prep_corpus.load_vocab())
    # Only the first prepped dataset is removed; the vocabularies are already
    # loaded at this point.
    print(f'Removing prepped dataset at {prep_corpora[0].path_to_prep_dataset}')
    shutil.rmtree(prep_corpora[0].path_to_prep_dataset)

    # ``datasets`` is expected to hold exactly two entries: train, then test.
    train_vocab, test_vocab = vocabs
    print("\n========================   Split example   =========================\n")

    # The text API is looked up by the name of the corpus-API callable.
    text_callable = getattr(text_api, prep_function.callable.__name__)
    print(text_callable(sample, *prep_function.params, **prep_function.options, extension="java"))

    print("\n========================   Stats           =========================\n")

    print(f"Train vocab: {len(train_vocab)}")
    test_vocab_identifiers = without_non_identifiers(test_vocab)

    # The shares are scaled by 100 because they are printed with a '%' sign.
    print(f"Test vocab : {len(test_vocab)}, identifiers: {len(test_vocab_identifiers)} ({100.0 * len(test_vocab_identifiers) / len(test_vocab):.2f}%)\n")

    print(f"Train corpus size: {get_corpus_size(train_vocab)}")
    test_tokens_number_ids = get_corpus_size(test_vocab_identifiers)
    test_tokens_number = get_corpus_size(test_vocab)
    print(f"Test corpus size : {test_tokens_number}, identifiers: {test_tokens_number_ids} ({100.0 * test_tokens_number_ids / test_tokens_number:.2f}%)\n")

    print_oov_stats(train_vocab, test_vocab)
    print("\nFor identifiers: \n")
    print_oov_stats(train_vocab, test_vocab_identifiers)

    #plot_percentiles(train_vocab)
    buckets = [1, 2, 11, 101, 1001]
    freqs = frequencies(train_vocab, buckets)
    print(freqs)
    return 0, 0  # TODO remove this


In [3]:
import dataprep
dataprep.__version__

'1.0.0-alpha.8'

In [None]:
# Baseline: api.nosplit with no extra options.
# calc_and_display_stats currently returns the placeholder pair (0, 0), so both
# names assigned below end up bound to 0.
prep_function = PrepFunction(api.nosplit)
description = "unsplit, with comments and vocabulary. No filtering."

nosplit_train_prep, nosplit_test_prep = calc_and_display_stats(prep_function, description)

In [None]:
# The following cells enable one additional filtering option at a time.
prep_function = PrepFunction(api.nosplit, [], {"no_unicode": True})
description = "Filtering non-ASCII tokens"

calc_and_display_stats(prep_function, description)

In [None]:
prep_function = PrepFunction(api.nosplit, [], {"no_unicode": True, "no_spaces": True})
description = "Filtering whitespace (+ non-ascii)"

calc_and_display_stats(prep_function, description)

In [None]:
prep_function = PrepFunction(api.nosplit, [], {"no_unicode": True, "no_spaces": True, "no_com": True})
description = "Filtering comments (+ whitespace, + non-ascii)"

calc_and_display_stats(prep_function, description)

In [None]:
prep_function = PrepFunction(api.nosplit, [], {"no_unicode": True, "no_spaces": True, "no_com": True, "no_str": True})
description = "Filtering  strings (+ comments, + whitespace, + non-ascii)"

calc_and_display_stats(prep_function, description)

In [None]:
prep_function = PrepFunction(api.nosplit, [], {"no_com": True, "full_strings": True, "max_str_length": 14})
description = "An additional choice, the model from Hellendoorn and Devanbu: keep strings shorter than 15 char (check SLP-core code), remove others, remove comments"

calc_and_display_stats(prep_function, description)

In [None]:
# From here on api.basic is used instead of api.nosplit.
# NOTE(review): the descriptions below say "longer than 15 chars" while
# max_str_length is 14 - confirm which limit is intended.
prep_function = PrepFunction(api.basic, [], {"no_unicode": True, "no_spaces": True, "no_com": True, "max_str_length": 14})
description = "[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars] + Word splitting via conventions"

calc_and_display_stats(prep_function, description)

In [None]:
prep_function = PrepFunction(api.basic, [], {"no_unicode": True, "no_spaces": True, "no_com": True, "max_str_length": 14, "no_case": True})
description = "[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars] + Word splitting via conventions, and remove case."

calc_and_display_stats(prep_function, description)

In [None]:
# NOTE(review): the options set "no_case": True but the description says
# "keep case" - one of the two is presumably stale; confirm.
prep_function = PrepFunction(api.basic, [], {"no_unicode": True, "no_spaces": True, "no_com": True, "max_str_length": 14, "no_case": True, "split_numbers": True})
description = "[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars] + split via conventions, but keep case.+ Split numbers"
calc_and_display_stats(prep_function, description)

In [None]:
# NOTE(review): same mismatch as in the previous cell ("no_case": True versus
# "keep case" in the description).
prep_function = PrepFunction(api.basic, [], {"no_unicode": True, "no_spaces": True, "no_com": True, "max_str_length": 14, "no_case": True, "split_numbers": True, "ronin": True})
description = "[Unsplit, remove non-ascii, remove whitespace, remove comments, remove strings longer than 15 chars] + split via conventions, but keep case. + Split numbers + Ronin"
calc_and_display_stats(prep_function, description)

## BPE

In [None]:
%%bash -s "{datasets[0]}"

# $1 is the argument given to `-s` above, i.e. the first entry of `datasets`.
# NOTE(review): that entry is relative to PATH_TO_DATASETS and is passed here
# without being joined to it - confirm the working directory makes it resolve.
# NOTE(review): this run uses --id "main", while the cells below refer to ids
# of the form 'java-bpe-training_nounicode-<N>' - confirm where those come from.
dataprep learn-bpe 20000 -p "$1" --id "main" -e "java"

In [None]:
# BPE group 1: "no_com" and "no_str" enabled. One cell per BPE id suffix
# (1000, 2000, 5000, 10000, 20000); the id is passed to api.bpe as the
# positional argument right after the corpus path.
prep_function = PrepFunction(api.bpe, ['java-bpe-training_nounicode-1000'], {"no_com": True, "no_str": True})
description = "bpe 1k no strings no comments"
calc_and_display_stats(prep_function, description)

In [None]:
prep_function = PrepFunction(api.bpe, ['java-bpe-training_nounicode-2000'], {"no_com": True, "no_str": True})
description = "bpe 2k no strings no comments"
calc_and_display_stats(prep_function, description)

In [None]:
prep_function = PrepFunction(api.bpe, ['java-bpe-training_nounicode-5000'], {"no_com": True, "no_str": True})
description = "bpe 5k no strings no comments"
calc_and_display_stats(prep_function, description)

In [None]:
prep_function = PrepFunction(api.bpe, ['java-bpe-training_nounicode-10000'], {"no_com": True, "no_str": True})
description = "bpe 10k no strings no comments"
calc_and_display_stats(prep_function, description)

In [None]:
prep_function = PrepFunction(api.bpe, ['java-bpe-training_nounicode-20000'], {"no_com": True, "no_str": True})
description = "bpe 20k no strings no comments"
calc_and_display_stats(prep_function, description)

In [None]:
# BPE group 2: "no_com" enabled and "max_str_length" set to 14, for the same
# five BPE ids.
prep_function = PrepFunction(api.bpe, ['java-bpe-training_nounicode-1000'], {"no_com": True, "max_str_length": 14})
description = "bpe 1k Hellendoorn and Devanbu strings no comments"
calc_and_display_stats(prep_function, description)

In [None]:
prep_function = PrepFunction(api.bpe, ['java-bpe-training_nounicode-2000'], {"no_com": True, "max_str_length": 14})
description = "bpe 2k Hellendoorn and Devanbu strings no comments"
calc_and_display_stats(prep_function, description)

In [None]:
prep_function = PrepFunction(api.bpe, ['java-bpe-training_nounicode-5000'], {"no_com": True, "max_str_length": 14})
description = "bpe 5k Hellendoorn and Devanbu strings no comments"
calc_and_display_stats(prep_function, description)

In [None]:
prep_function = PrepFunction(api.bpe, ['java-bpe-training_nounicode-10000'], {"no_com": True, "max_str_length": 14})
description = "bpe 10k Hellendoorn and Devanbu strings no comments"
calc_and_display_stats(prep_function, description)

In [None]:
prep_function = PrepFunction(api.bpe, ['java-bpe-training_nounicode-20000'], {"no_com": True, "max_str_length": 14})
description = "bpe 20k Hellendoorn and Devanbu strings no comments"
calc_and_display_stats(prep_function, description)

In [None]:
# BPE group 3: only "no_com" enabled, for the same five BPE ids.
prep_function = PrepFunction(api.bpe, ['java-bpe-training_nounicode-1000'], {"no_com": True})
description = "bpe 1k no comments"
calc_and_display_stats(prep_function, description)

In [None]:
prep_function = PrepFunction(api.bpe, ['java-bpe-training_nounicode-2000'], {"no_com": True})
description = "bpe 2k no comments"
calc_and_display_stats(prep_function, description)

In [None]:
prep_function = PrepFunction(api.bpe, ['java-bpe-training_nounicode-5000'], {"no_com": True})
description = "bpe 5k no comments"
calc_and_display_stats(prep_function, description)

In [None]:
prep_function = PrepFunction(api.bpe, ['java-bpe-training_nounicode-10000'], {"no_com": True})
description = "bpe 10k no comments"
calc_and_display_stats(prep_function, description)

In [None]:
prep_function = PrepFunction(api.bpe, ['java-bpe-training_nounicode-20000'], {"no_com": True})
description = "bpe 20k no comments"
calc_and_display_stats(prep_function, description)

In [None]:
# BPE group 4: no options at all, for the same five BPE ids.
prep_function = PrepFunction(api.bpe, ['java-bpe-training_nounicode-1000'], {})
description = "bpe 1k"
calc_and_display_stats(prep_function, description)

In [None]:
prep_function = PrepFunction(api.bpe, ['java-bpe-training_nounicode-2000'], {})
description = "bpe 2k"
calc_and_display_stats(prep_function, description)

In [None]:
prep_function = PrepFunction(api.bpe, ['java-bpe-training_nounicode-5000'], {})
description = "bpe 5k"
calc_and_display_stats(prep_function, description)

In [None]:
prep_function = PrepFunction(api.bpe, ['java-bpe-training_nounicode-10000'], {})
description = "bpe 10k"
calc_and_display_stats(prep_function, description)

In [None]:
prep_function = PrepFunction(api.bpe, ['java-bpe-training_nounicode-20000'], {})
description = "bpe 20k"
calc_and_display_stats(prep_function, description)