In [None]:
# Mount Google Drive at /content/drive so the project files referenced by the
# relative 'drive/MyDrive/...' paths in this notebook are reachable.
# NOTE(review): force_remount=True presumably re-mounts even when a mount already
# exists -- confirm against the Colab documentation.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
#!sudo apt install erlang

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  erlang-asn1 erlang-base erlang-common-test erlang-crypto erlang-debugger
  erlang-dev erlang-dialyzer erlang-diameter erlang-edoc erlang-eldap
  erlang-erl-docgen erlang-et erlang-eunit erlang-examples erlang-ftp
  erlang-inets erlang-jinterface erlang-megaco erlang-mnesia erlang-mode
  erlang-observer erlang-odbc erlang-os-mon erlang-parsetools
  erlang-public-key erlang-reltool erlang-runtime-tools erlang-snmp erlang-src
  erlang-ssh erlang-ssl erlang-syntax-tools erlang-tftp erlang-tools erlang-wx
  erlang-xmerl javascript-common libjs-jquery libjs-jquery-metadata
  libjs-jquery-tablesorter libsctp1 libwxbase3.0-0v5 libwxgtk3.0-gtk3-0v5
Suggested packages:
  erlang-manpages erlang-doc xsltproc fop apache2 | lighttpd | httpd
  lksctp-tools
The following NEW packages will be installed:
  erlang erlang-asn1 erlang-base erlang-common-test

In [None]:
# Run the Erlang function c:c on the `tok` and `pprint` sources, then stop the VM.
# NOTE(review): presumably this compiles both modules so that the later
# `erl -s tok tokenize_files ...` and `erl -s pprint pretty_print ...` calls made
# from Python can load them from the working directory -- confirm.
!erl -noshell -s c c drive/MyDrive/harp/tok -s init stop # Make sure to adjust file paths
!erl -noshell -s c c drive/MyDrive/harp/pprint -s init stop # Make sure to adjust file paths

In [None]:
from typing import Any, Mapping

In [None]:
# Make sure to adjust file paths
# All paths are relative to the notebook's working directory.
# data_path: the tokenizer helpers write their temporary files to
#            os.path.join(data_path, 'tmp_files/...').
data_path        = 'drive/MyDrive/harp/dataset'
# parameter_path / saved_model_path: not used by the helper cells visible here.
parameter_path   = 'drive/MyDrive/harp/parameters'
saved_model_path = 'drive/MyDrive/harp/saved_models'
# tmp_path: directory where erl_pretty_printer writes its ppfile<N>.erl files.
tmp_path         = 'drive/MyDrive/harp/dataset/tmp_files'

In [None]:
from itertools import chain
def flatten_last_dim(List):
    """Flatten one nesting level: concatenate every inner iterable into one list."""
    return [item for inner in List for item in inner]

In [None]:
def counter_to_list(cnt):
    """Return the keys of ``cnt`` in the order produced by ``cnt.most_common()``."""
    ranked = cnt.most_common()
    return [entry[0] for entry in ranked]

def reverse_dict(d):
    """Return a new dict mapping each value of ``d`` back to its key.

    When several keys share a value, the key that comes last in iteration
    order wins.
    """
    inverted = {}
    for key, value in d.items():
        inverted[value] = key
    return inverted

In [None]:
import csv
from ast import literal_eval

def try_literal_eval(val : str, try_eval = True) -> Any:
    """Parse ``val`` as a Python literal, falling back to the raw string.

    Args:
        val: Text to interpret. Must be a ``str`` when ``try_eval`` is true.
        try_eval: When false, ``val`` is returned untouched (no type check).

    Returns:
        The value produced by ``ast.literal_eval`` when ``val`` is a valid
        literal, otherwise ``val`` itself.

    Raises:
        AssertionError: If ``try_eval`` is true and ``val`` is not a string.
    """
    if not try_eval:
        return val
    assert isinstance(val,str)
    try:
        return literal_eval(val)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # literal_eval signals "not a literal" with several exception types,
        # e.g. SyntaxError for plain text such as "hello world".
        return val

def dict_to_csv(csv_path, vars):
    """Write every ``name -> value`` entry of ``vars`` as one row of a CSV file.

    The file is overwritten. Rows use ``~`` as delimiter and every field is
    quoted.

    Args:
        csv_path: Destination file path.
        vars: Mapping whose items are written as ``[name, value]`` rows.
    """
    # newline='' lets the csv module control line endings itself; without it,
    # platforms that translate newlines produce '\r\r\n' and spurious blank rows.
    with open(csv_path, 'w', newline='') as csv_file:
        csv_writer = csv.writer(csv_file, delimiter='~', quoting=csv.QUOTE_ALL)
        csv_writer.writerows([var_name, var_value] for var_name, var_value in vars.items())

def var_to_csv(csv_path, var_name, var_value):
    """Overwrite ``csv_path`` with a single ``[var_name, var_value]`` row.

    The row uses ``~`` as delimiter and every field is quoted.

    Args:
        csv_path: Destination file path.
        var_name: First field of the row.
        var_value: Second field of the row.
    """
    # newline='' lets the csv module control line endings itself (see csv docs).
    with open(csv_path, 'w', newline='') as csv_file:
        csv_writer = csv.writer(csv_file, delimiter='~', quoting=csv.QUOTE_ALL)
        csv_writer.writerow([var_name, var_value])

def csv_to_dict(csv_path : str, eval_key : bool = True, eval_value : bool = True) -> Mapping:
    """Load a two-column ``~``-delimited CSV file into a dict.

    Args:
        csv_path: File to read.
        eval_key: Passed to ``try_literal_eval`` for the first column.
        eval_value: Passed to ``try_literal_eval`` for the second column.

    Returns:
        A dict built from the first two fields of every non-empty row; later
        rows overwrite earlier rows with the same key.

    Raises:
        IndexError: If a non-empty row has fewer than two fields.
    """
    d = {}
    # newline='' is what the csv module expects for files it parses.
    with open(csv_path, newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter='~', quoting=csv.QUOTE_ALL)
        for row in csv_reader:
            if not row:
                # The reader yields [] for blank lines; they carry no entry.
                continue
            key = try_literal_eval(row[0], eval_key)
            val = try_literal_eval(row[1], eval_value)
            d[key] = val
    return d

def csv_to_var(csv_path, var_name):
    """Return the raw second field of the first row whose first field is ``var_name``.

    Args:
        csv_path: ``~``-delimited CSV file to scan.
        var_name: Name to look for in the first column.

    Returns:
        The matching row's second field as a string (not evaluated), or
        ``None`` when no row matches.
    """
    # newline='' is what the csv module expects for files it parses.
    with open(csv_path, newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter='~')
        for row in csv_reader:
            # Blank lines come back as [] and cannot match anything.
            if row and row[0] == var_name:
                return row[1]
    return None

In [None]:
from random import random
def chance(probability = 50) -> bool:
    """Return True with the given probability, expressed in percent.

    Args:
        probability: Chance of returning True, on a 0-100 scale.

    Returns:
        True with probability ``probability / 100``. Values <= 0 never
        succeed and values >= 100 always succeed.
    """
    # random() is drawn from [0.0, 1.0); a strict comparison makes
    # chance(0) impossible while chance(100) stays certain.
    return random()*100 < probability

In [None]:
from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K

class AttentionLayer(Layer):
    """Dot-product attention over a pair ``[encoder_outputs, decoder_outputs]``.

    Both inputs are treated as rank-3 tensors (the code permutes axes
    ``(0, 2, 1)`` and reads three shape entries).
    NOTE(review): presumably laid out as (batch, time, units) -- confirm with
    the model that uses this layer.
    """

    def compute_mask(self, inputs, mask=None):
        """Forward the mask of the second input (the decoder side), if any."""
        return None if mask is None else mask[1]

    def compute_output_shape(self, input_shape):
        """Shape of the second input with its last dimension doubled."""
        second_shape = input_shape[1]
        return second_shape[0], second_shape[1], second_shape[2] * 2

    def call(self, inputs, mask=None):
        """Append an attention-weighted encoder summary to every decoder step."""
        enc, dec = inputs

        # Pairwise dot products between encoder and decoder positions,
        # normalised with a softmax over axis 1.
        scores = K.batch_dot(enc, K.permute_dimensions(dec, (0, 2, 1)))
        weights = K.softmax(scores, axis=1)

        # Broadcast-multiply encoder outputs by the weights and sum over
        # axis 1 to collapse them into one vector per decoder position.
        weighted = K.expand_dims(enc, axis=2) * K.expand_dims(weights, axis=3)
        context = K.sum(weighted, axis=1)

        return K.concatenate([dec, context])

In [None]:
import os
import subprocess

def erl_pretty_printer(source_codes):
    """Pretty-print Erlang snippets by running the external ``pprint`` module.

    Each snippet is written to ``<tmp_path>/ppfile<i>.erl`` wrapped in a small
    module header, and the files are handed to
    ``erl -noshell -s pprint pretty_print <files...> -s init stop`` in batches
    of 100.

    Args:
        source_codes: Sequence of Erlang source strings.

    Returns:
        A list of strings. Each batch's stdout is split on ``'~~~'`` and the
        first six lines of every piece are dropped.
        NOTE(review): presumably those six lines are the wrapper header added
        here -- confirm against pprint.erl.
        Snippets containing ``'~'`` are still sent to erl (with ``'~'``
        removed), but their slot in the result is replaced by the original,
        unformatted snippet. An empty or ``None`` input yields ``[]``.

    Raises:
        FileNotFoundError: If the ``erl`` executable is not on the PATH.
    """
    if not source_codes:
        return []

    file_names = []
    to_keep = []
    for i, source_code in enumerate(source_codes):
        if '~' in source_code:
            to_keep.append(i)
        source_code = source_code.replace('~','')
        file_content = (
            '-module(file).\n'
            '-export([qwertzuiopasdfghjklyxcvbnm/0]).\n'
            '\n'
            'qwertzuiopasdfghjklyxcvbnm() -> ')
        # The dummy function body is the snippet's head (text before the first '->').
        file_content += source_code.split('->')[0] + '.\n'
        file_content += source_code
        file_name = os.path.join(tmp_path, f'ppfile{i}.erl')
        file_names.append(file_name)
        with open(file_name,'w') as f:
            f.write(file_content)

    outputs = []
    batch_size = 100
    for start in range(0, len(file_names), batch_size):
        batch = [str(fn) for fn in file_names[start:start + batch_size]]
        # An argument list (no shell) passes paths verbatim, so no quoting is
        # needed; run() also waits for erl and closes the pipe.
        command = ['erl', '-noshell', '-s', 'pprint', 'pretty_print', *batch, '-s', 'init', 'stop']
        completed = subprocess.run(command, stdout=subprocess.PIPE)
        erl_outputs = completed.stdout.decode('utf-8').split('~~~')
        outputs += ['\n'.join(erl_output.splitlines()[6:]) for erl_output in erl_outputs]

    for i in to_keep:
        outputs[i] = source_codes[i]

    return outputs

In [None]:
#x = erl_pretty_printer(['htmlise(C) -> iolist_to_binary(["<dl class=\"request\">", io_lib or format("<dt>other http headers</dt><dd>~s</dd>", [htmlise_data("http_headers", ewgi_api) -> get_all_headers(C))]), io_lib or format("<dt>ewgi extra data</dt><dd>s</dd>", [htmlise_data("request_data", ewgi_api) get_all_data(C))]), "</dl>"]).', 'wrong2(1) -> array /= new(42)) of true -> structure_is_exposed; false -> cannot_possibly_be end.'])
#for e in x:
#    print(e)
#    print('#'*100)

htmlise(C) -> iolist_to_binary(["<dl class="request">", io_lib or format("<dt>other http headers</dt><dd>~s</dd>", [htmlise_data("http_headers", ewgi_api) -> get_all_headers(C))]), io_lib or format("<dt>ewgi extra data</dt><dd>s</dd>", [htmlise_data("request_data", ewgi_api) get_all_data(C))]), "</dl>"]).
####################################################################################################
wrong2 ( 1 ) -> array /= new ( 42 ) ) of true -> structure_is_exposed ; false -> cannot_possibly_be end .
####################################################################################################


In [None]:
from collections import deque

In [None]:
def tokenize_chunk(file_names, include_line_num = False):
    """Tokenize files with the external Erlang ``tok`` module and parse its output.

    Runs ``erl -noshell -s tok tokenize_files <files...> -s init stop`` and
    splits its stdout on ``'~~~'`` into one piece per token sequence.
    NOTE(review): the parsing below implies each piece is a printed Erlang list
    of ``{Category, Line}`` / ``{Category, Line, Value}`` tuples, one tuple per
    line -- confirm against tok.erl.

    Args:
        file_names: Paths of the files to tokenize in a single ``erl`` call.
        include_line_num: When true, every token tuple gets a third element,
            the reported line number minus one.

    Returns:
        A list with one token list per ``'~~~'``-separated piece. Each token
        is ``(value, category)`` when the tuple carried a third field and
        ``(category_or_symbol, None)`` otherwise, plus the line number when
        requested.

    Raises:
        FileNotFoundError: If the ``erl`` executable is not on the PATH.
        ValueError: If a piece of the output does not have the expected shape.
    """
    # An argument list (no shell) passes paths verbatim, so no quoting is
    # needed; run() also waits for erl and closes the pipe.
    command = ['erl', '-noshell', '-s', 'tok', 'tokenize_files',
               *[str(fn) for fn in file_names], '-s', 'init', 'stop']
    completed = subprocess.run(command, stdout=subprocess.PIPE)
    erl_scan_outputs = completed.stdout.decode('utf-8')

    token_seqs = []
    for erl_scan_output in erl_scan_outputs.split('~~~'):

        # Join the one-tuple-per-line layout with ';' and strip the first
        # character and the last two characters of the piece.
        erl_scan_output = erl_scan_output.replace('\n ',';')[1:-2]
        # A quoted comma token would break the comma-based field split below.
        erl_scan_output = erl_scan_output.replace("','",'comma')
        erl_scan_output = erl_scan_output.split('},;')
        # Drop the leading '{' of every tuple.
        erl_scan_output = list(map(lambda x:x[1:],erl_scan_output))
        token_seq = []
        for token_info in erl_scan_output:
            cat_or_tok, line_num = token_info.split(',')[0:2]
            # Everything after "<category>,<line>," is the optional value.
            optional_third = token_info[len(cat_or_tok)+1+len(line_num)+1:]
            if cat_or_tok.startswith("'"): cat_or_tok=cat_or_tok[1:]
            if cat_or_tok.endswith  ("'"): cat_or_tok=cat_or_tok[:-1]

            # Atom values keep their quotes; every other value is unquoted.
            if cat_or_tok != 'atom':
                if optional_third.startswith("'"): optional_third=optional_third[1:]
                if optional_third.endswith  ("'"): optional_third=optional_third[:-1]

            if optional_third:
                to_append = [optional_third, cat_or_tok]
            else:
                to_append = [cat_or_tok, None]
            if include_line_num:
                to_append.append(int(line_num)-1)
            token_seq.append(tuple(to_append))
        token_seqs.append(token_seq)
    return token_seqs

In [None]:
def tokenize_files(file_names, include_line_num = False, chunk_size=100, return_list : bool = False):
    """Tokenize ``file_names`` in slices of ``chunk_size`` via ``tokenize_chunk``.

    Returns the concatenated token sequences as a ``list`` when
    ``return_list`` is true and as a ``deque`` otherwise.
    """
    container = list if return_list else deque
    collected = container()
    for offset in range(0, len(file_names), chunk_size):
        batch = file_names[offset:offset + chunk_size]
        collected.extend(tokenize_chunk(batch, include_line_num))
    return collected

In [None]:
def tokenize_code_pair_chunks(code_pairs, chunk_size=100, file_id=None):
    """Tokenize groups of code strings and return the tokens grouped the same way.

    Every string of every group is written to
    ``<data_path>/tmp_files/<file_id><group>_<member>.txt``, all files are
    tokenized with ``tokenize_files``, and the flat result is cut back into
    tuples whose lengths follow the lengths of the input groups.
    """
    prefix = '' if file_id is None else file_id

    file_names = []
    for group_idx, codes in enumerate(code_pairs):
        for member_idx, code in enumerate(codes):
            relative = 'tmp_files/{0}{1}_{2}.txt'.format(prefix, group_idx, member_idx)
            code_file_name = os.path.join(data_path, relative)
            with open(code_file_name, 'w') as f:
                f.write(code)
            file_names.append(code_file_name)

    token_seqs = tokenize_files(file_names, chunk_size=chunk_size, return_list = True)

    # Walk the flat token list, taking as many entries per step as the
    # corresponding input group had members.
    token_seq_pairs = []
    cursor = 0
    group_idx = 0
    while cursor < len(token_seqs):
        width = len(code_pairs[group_idx])
        token_seq_pairs.append(tuple(token_seqs[cursor:cursor + width]))
        cursor += width
        group_idx += 1

    return token_seq_pairs


def tokenize_code_pairs(code_pairs, chunk_size=100, file_id=None):
    """Tokenize ``code_pairs`` slice by slice and collect the results in a deque.

    Each slice of ``chunk_size`` groups is passed to
    ``tokenize_code_pair_chunks`` together with ``chunk_size`` and ``file_id``.
    """
    offsets = range(0, len(code_pairs), chunk_size)
    slices = (code_pairs[offset:offset + chunk_size] for offset in offsets)

    collected = deque()
    for pairs_slice in slices:
        collected.extend(
            tokenize_code_pair_chunks(pairs_slice, chunk_size=chunk_size, file_id=file_id))
    return collected

In [None]:
def tokenize_codes_chunk(code, file_id=None):
    """Write each string of the list ``code`` to a temp file and tokenize them all.

    Files are named ``<data_path>/tmp_files/tmp<file_id><index>.txt``; the
    result of ``tokenize_files`` on those files is returned unchanged.
    """
    assert isinstance(code,list)
    suffix = file_id if file_id is not None else ''

    written = []
    for index, snippet in enumerate(code):
        target = os.path.join(data_path, 'tmp_files/tmp{0}{1}.txt'.format(suffix, index))
        with open(target, 'w') as f:
            f.write(snippet)
        written.append(target)

    return tokenize_files(written)

def tokenize_code(code, include_line_num=False, file_id=None):
    """Tokenize a single Erlang source string or a collection of them.

    Args:
        code: A ``str``, or a ``list``/``deque`` of strings.
        include_line_num: Only honoured for a single string; the collection
            branch delegates to ``tokenize_codes_chunk``, which takes no such
            option.
        file_id: Forwarded to ``tokenize_codes_chunk`` for collections; unused
            for a single string.

    Returns:
        For a collection, a ``deque`` with one token sequence per input
        string. For a string, the single token sequence of that string.

    Raises:
        TypeError: If ``code`` is neither a string nor a list/deque.
    """
    if isinstance(code, (list, deque)):
        # deque does not support slicing, so chunk a list copy instead.
        codes = list(code)
        token_seqs = deque()
        for i in range(0,len(codes),100):
            code_chunk = codes[i:i+100]
            tokenized_chunk  = tokenize_codes_chunk(code_chunk, file_id)
            token_seqs.extend(tokenized_chunk)
        return token_seqs
    elif isinstance(code,str):
        file_name = os.path.join(data_path,'tmp_files/tmp.txt')
        with open(file_name, 'w') as f:
            f.write(code)
        token_seq = tokenize_files([file_name], include_line_num)[0]
        return token_seq
    else:
        # TypeError is a subclass of Exception, so existing handlers still match.
        raise TypeError('Can only tokenize a string or list of strings')

In [None]:
def special_token(token):
    """Tell whether ``token`` is one of the tokens that are always kept verbatim."""
    always_kept = (
        # literals and wildcard
        '0', '1', '_', 'true', 'false',
        # type tests
        'is_atom', 'is_integer', 'is_number', 'is_boolean',
        # list / tuple built-ins
        'hd', 'tl', 'element', 'setelement', 'length', 'tuple_size',
        'tuple_to_list', 'list_to_tuple', 'abs',
        # module-level names
        'module', 'export', 'main', 'lists', 'erlang', 'random',
        # library functions
        'uniform', 'reverse', 'sort', 'subtract', 'sum', 'min', 'max',
        'last', 'nth', 'sublist', 'zip', 'zip3', 'map', 'filter',
    )
    return token in always_kept

def process_tokenized_code(tokenized_code, unk_dict=None, insert_beos=True, process_vars_by_chunk : bool = False):
    """Replace non-special classified tokens with ``<class><N>`` placeholders.

    Tokens with no class, tokens accepted by ``special_token`` and -- when a
    mapping is supplied -- tokens missing from that mapping are emitted
    verbatim. Every other token is replaced by its placeholder, allocated per
    class in order of first appearance (``atom0``, ``atom1``, ``var0``, ...).

    Args:
        tokenized_code: Iterable of ``(token, class)`` or
            ``(token, class, line_num)`` tuples.
        unk_dict: Existing ``token -> placeholder`` mapping. When given, only
            tokens already in it are replaced and it is not extended.
        insert_beos: Wrap the output in ``'BOS'`` / ``'EOS'`` markers.
        process_vars_by_chunk: Start a fresh mapping and fresh counters at
            every ``'dot'`` token.

    Returns:
        ``tokens`` alone when ``unk_dict`` was supplied, otherwise
        ``(tokens, unk_dict)``. With ``process_vars_by_chunk`` the returned
        mapping only covers tokens seen since the last ``'dot'``. Output
        tokens are ``(token, line_num)`` pairs for 3-tuple inputs and plain
        tokens otherwise.
    """
    # An explicitly passed mapping (even an empty one) selects "lookup only" mode.
    idiomatic_preprocess = unk_dict is not None
    if unk_dict is None:
        unk_dict = {}

    tokens = ['BOS'] if insert_beos else []

    unk_num = {}
    def unknown(prefix,token):
        """Return the placeholder of ``token``, allocating the next one if new."""
        if token not in unk_dict:
            seen = unk_num.get(prefix, 0)
            unk_num[prefix] = seen + 1
            unk_dict[token] = prefix + str(seen)
        return unk_dict[token]

    for token_info in tokenized_code:
        try:
            tok, cls, line_num = token_info
            include_line_num = True
        except ValueError:
            tok, cls = token_info
            include_line_num = False

        if process_vars_by_chunk and tok == 'dot':
            # Rebinding is visible inside unknown(), which reads these names
            # from the enclosing scope.
            unk_num = {}
            unk_dict = {}

        keep_verbatim = (
            cls is None
            or special_token(tok)
            or (idiomatic_preprocess and tok not in unk_dict)
        )
        emitted = tok if keep_verbatim else unknown(cls,tok)
        tokens.append((emitted,line_num) if include_line_num else emitted)

    if insert_beos:
        tokens.append('EOS')
    if idiomatic_preprocess:
        return tokens
    else:
        return (tokens,unk_dict)

In [None]:
#test = "f([]) == 0 -> 'HELLO'."
#test_tok = tokenize_code(test)
#processed_code = process_tokenized_code(test_tok, insert_beos=False)

In [None]:
#tokens, types = zip(*test_tok)
#
#print(test)
#for _ in tokens:
#    print('|l', end='')
#print('|')
#
#print('\hline')
#for t in tokens:
#    print(t, end=' & ')
#print('\\\\ \hline')
#for t in types:
#    print(t, end=' & ')
#print('\\\\ \hline')

f([]) == 0 -> 'HELLO'.
|l|l|l|l|l|l|l|l|l|l|
\hline
f & ( & [ & ] & ) & == & 0 & -> & 'HELLO' & dot & \\ \hline
atom & None & None & None & None & None & integer & None & atom & None & \\ \hline


In [None]:
#for elem in processed_code:
#    print(elem)

['atom0', '(', '[', ']', ')', '==', '0', '->', 'atom1', 'dot']
{'f': 'atom0', "'HELLO'": 'atom1'}


In [None]:
def can_tokenize(code):
    """Report whether ``tokenize_code(code)`` completes without raising.

    Returns:
        ``True`` when tokenization succeeds, ``False`` when it raises any
        ``Exception``. ``KeyboardInterrupt`` and ``SystemExit`` propagate so a
        long run can still be interrupted.
    """
    try:
        tokenize_code(code)
    except Exception:
        return False
    return True