In [72]:
import datasets
import ast
from nova_py import TACO
import tensorflow as tf
import re
import pickle
import string
import itertools

# Overall Vocabulary build

In [11]:
class Vocabulary:
    """Plain container for the token tables this notebook builds and pickles.

    Attributes (both are ordinary dicts, created fresh per instance):
        taco: holds three regex strings under "patterns", the two quote
            characters under "splits", and an initially empty "tokens" table.
            Presumably consumed by the TACO tokenizer -- confirm against nova_py.
        performer: holds the initially empty "in_tokens" and "out_tokens"
            tables that later cells fill with id -> token mappings.
    """

    def __init__(self):
        # Regex sources are stored as strings (not compiled) so the object
        # stays trivially picklable.
        patterns = dict(
            quote_search=r'"[^"]*"|[^"\s]+',
            word_match=r'\w+|[^\w\s]',
            quote_split=r'\"+|.+(?<!\")',
        )
        self.taco = dict(
            patterns=patterns,
            splits=["\"", "'"],
            tokens={},
        )
        self.performer = dict(in_tokens={}, out_tokens={})

In [12]:
# Single instance that the cells below fill in and the save cell pickles.
v = Vocabulary()

# Main (Output vocabulary for performer)

In [13]:
# Hugging Face dataset id; its records are parsed as Python source further down.
dataset_name = "codeparrot/codeparrot-clean"

# streaming=True yields records lazily instead of downloading the full split up front.
vocab_set = datasets.load_dataset(dataset_name, 'default', split='train', streaming=True)

Resolving data files:   0%|          | 0/54 [00:00<?, ?it/s]

In [14]:
# Materialise the 'content' field of the first 300 streamed records.
vocab_text = [record['content'] for record in vocab_set.take(300)]

In [15]:
# Parse every script and keep the textual dump of its AST.
# Scripts that the running interpreter cannot parse are skipped (best effort),
# and one "Escaped" line is printed per skipped script.
trees = []
for script in vocab_text:
    try:
        trees.append(ast.dump(ast.parse(script)))
    except (SyntaxError, ValueError, RecursionError, MemoryError):
        # SyntaxError: invalid source for this Python version;
        # ValueError: source contains null bytes;
        # RecursionError / MemoryError: source too deeply nested to parse or dump.
        # Deliberately NOT a bare `except:` so Ctrl-C (KeyboardInterrupt) and
        # SystemExit still stop the cell instead of being counted as "Escaped".
        print("Escaped")

Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped
Escaped


In [16]:
# One alternation, tried left to right at each position:
#   triple-double-quoted string | triple-single-quoted string |
#   double-quoted string | single-quoted string |
#   a run of characters containing no quote and no digit.
# Digits and unmatched quote characters match none of the branches, so
# findall silently drops them.
fragment_pattern = re.compile(r"(?sx)(?:\"\"\"(?:\\.|(?!\"\"\").)*?\"\"\"|'''(?:\\.|(?!''').)*?'''|\"(?:(?:[^\"\\]|\\.)*)\"|'(?:(?:[^'\\]|\\.)*)'|[^\"'\d]+)")

# splits[k] is the list of fragments found in trees[k].
splits = [fragment_pattern.findall(tree) for tree in trees]

In [17]:
# Flatten the per-tree fragment lists, keeping only fragments that contain at
# most one quote character; fragments with two or more (quoted literals) are
# discarded.
#
# The loop variable is deliberately NOT called `string`: at notebook scope that
# would rebind the imported `string` module to a str and break the later
# `string.printable` cell on a fresh Restart & Run All.
tokens = []
for split in splits:
    for fragment in split:
        num_quotes = len(re.findall(r"[\"\']", fragment))
        if num_quotes <= 1:
            tokens.append(fragment)

In [18]:
# A token is either a run of word characters or one single non-word character
# (so every space and punctuation mark becomes its own token).
cleaning_pattern = r'\w+|\W'

# Flat, order-preserving list of the pieces of every kept fragment.
clean_tokens = [
    piece
    for fragment in tokens
    for piece in re.findall(cleaning_pattern, fragment)
]

In [19]:
# tf.unique returns (unique_values, index_of_each_input); only the unique
# values are kept, in order of first appearance.
vocabulary, _ = tf.unique(tf.constant(clean_tokens))

In [20]:
# Display the deduplicated word-level vocabulary tensor.
vocabulary

<tf.Tensor: shape=(157,), dtype=string, numpy=
array([b'Module', b'(', b'body', b'=', b'[', b'ImportFrom', b'module',
       b',', b' ', b'names', b'alias', b'name', b')', b']', b'level',
       b'Import', b'ClassDef', b'bases', b'Name', b'id', b'ctx', b'Load',
       b'keywords', b'FunctionDef', b'args', b'arguments', b'posonlyargs',
       b'arg', b'kwonlyargs', b'kw_defaults', b'defaults', b'Expr',
       b'value', b'Call', b'func', b'Attribute', b'Constant', b'attr',
       b'decorator_list', b'While', b'test', b'True', b'kind', b'b',
       b'keyword', b'YieldFrom', b'orelse', b'If', b'Compare', b'left',
       b'ops', b'Eq', b'comparators', b'Assign', b'targets', b'Store',
       b'False', b'type_ignores', b'vararg', b'kwarg', b'None', b'IfExp',
       b'Subscript', b'slice', b'Return', b'AugAssign', b'target', b'op',
       b'Add', b'Tuple', b'elts', b'List', b'IsNot', b'ListComp', b'elt',
       b'generators', b'comprehension', b'iter', b'ifs', b'UnaryOp',
       b'Not', b'oper

In [21]:
# Deduplicate the raw fragments from before the word-level cleaning step.
# reshape(..., [-1]) turns the Python list into the 1-D tensor tf.unique requires.
flat_tokens, _ = tf.unique(tf.reshape(tokens, [-1]))

In [22]:
# Display the unique raw fragments.
flat_tokens

<tf.Tensor: shape=(7104,), dtype=string, numpy=
array([b'Module(body=[ImportFrom(module=', b', names=[alias(name=',
       b'), alias(name=', ...,
       b', ctx=Load()), args=[ListComp(elt=Tuple(elts=[Name(id=',
       b', ctx=Load()), ctx=Load())], ctx=Load()), generators=[comprehension(target=Name(id=',
       b', ctx=Load())], keywords=[])), body=[Return(value=Constant(value=False))], orelse=[]), Return(value=Compare(left=Call(func=Attribute(value=Name(id='],
      dtype=object)>

In [57]:
# The string tensor comes back from .numpy() as bytes objects; decode each to str.
text_dump = [raw_token.decode('utf-8') for raw_token in vocabulary.numpy()]

In [58]:
# Number of distinct output tokens.
len(text_dump)

157

In [59]:
# Number the output tokens from 1 upward, in vocabulary order (id 0 is left unused).
v.performer['out_tokens'] = dict(enumerate(text_dump, start=1))

In [60]:
# Extra entries appended after the learned vocabulary. The two quote characters
# are listed here explicitly; presumably because quoted fragments are filtered
# out upstream and so never reach the learned vocabulary -- TODO confirm.
# NOTE(review): the meaning of "#MEM" and "#PAD" is not visible in this notebook.
special_tokens = ["#MEM", "\'", "\"", "#PAD"]

In [61]:
# Append the special tokens, numbering them directly after the current last id.
# NOTE: not idempotent -- running this cell again appends the same four tokens
# once more under new ids.
v.performer['out_tokens'] = {
    **v.performer['out_tokens'],
    **dict(enumerate(special_tokens, start=len(v.performer['out_tokens']) + 1)),
}

In [62]:
# Inspect the performer token tables built so far.
v.performer

{'in_tokens': {},
 'out_tokens': {1: 'Module',
  2: '(',
  3: 'body',
  4: '=',
  5: '[',
  6: 'ImportFrom',
  7: 'module',
  8: ',',
  9: ' ',
  10: 'names',
  11: 'alias',
  12: 'name',
  13: ')',
  14: ']',
  15: 'level',
  16: 'Import',
  17: 'ClassDef',
  18: 'bases',
  19: 'Name',
  20: 'id',
  21: 'ctx',
  22: 'Load',
  23: 'keywords',
  24: 'FunctionDef',
  25: 'args',
  26: 'arguments',
  27: 'posonlyargs',
  28: 'arg',
  29: 'kwonlyargs',
  30: 'kw_defaults',
  31: 'defaults',
  32: 'Expr',
  33: 'value',
  34: 'Call',
  35: 'func',
  36: 'Attribute',
  37: 'Constant',
  38: 'attr',
  39: 'decorator_list',
  40: 'While',
  41: 'test',
  42: 'True',
  43: 'kind',
  44: 'b',
  45: 'keyword',
  46: 'YieldFrom',
  47: 'orelse',
  48: 'If',
  49: 'Compare',
  50: 'left',
  51: 'ops',
  52: 'Eq',
  53: 'comparators',
  54: 'Assign',
  55: 'targets',
  56: 'Store',
  57: 'False',
  58: 'type_ignores',
  59: 'vararg',
  60: 'kwarg',
  61: 'None',
  62: 'IfExp',
  63: 'Subscript',
  6

In [73]:
# Every printable ASCII character as its own one-character string.
chars = [*string.printable]

In [75]:
# Single characters first, then every ordered pair of characters.
# NOTE(review): itertools.product yields 2-tuples such as ('a', 'b'), not the
# string 'ab', so this list mixes str and tuple entries -- confirm that is intended.
tokenizer_vocab = [*chars, *itertools.product(chars, repeat=2)]

In [77]:
v.taco['tokens'] = {i+1: tokenizer_vocab[i] for 

{'patterns': {'quote_search': '"[^"]*"|[^"\\s]+',
  'word_match': '\\w+|[^\\w\\s]',
  'quote_split': '\\"+|.+(?<!\\")'},
 'splits': ['"', "'"],
 'tokens': {}}

# Save Vocabulary

In [64]:
# Save to a pickle file
# NOTE(review): the destination is an absolute path on one developer's machine;
# this cell fails anywhere that directory does not exist.
# NOTE(review): pickle stores a reference to the Vocabulary class, not its code,
# so whatever loads vocab.pkl must be able to resolve the class under the same
# module path it had here.
with open("/Users/joericks/Desktop/nova/nova-py/src/nova_py/model/vocab.pkl", "wb") as f:
    pickle.dump(v, f)

# Regex Testing

In [239]:
def word_split(string):
    """Split text into tokens, keeping double-quoted phrases in one piece.

    The text is first cut into chunks: a balanced ``"..."`` phrase is one
    chunk, and any other run of characters containing no whitespace and no
    double quote is one chunk. Whitespace outside quotes and unbalanced quote
    characters are dropped.

    A quoted chunk is emitted as the opening quote(s), the inner text, and the
    closing quote(s). Any other chunk is broken into runs of word characters
    and individual punctuation characters.

    Args:
        string: the text to tokenize.

    Returns:
        A list of token strings. Empty or whitespace-only input gives ``[]``,
        so the result can always be iterated or concatenated.
    """
    quote_match = r'"[^"]*"|[^"\s]+'
    custom_match = r'\w+|[^\w\s]'
    quote_split = r'\"+|.+(?<!\")'

    tokens = []
    # An empty input simply produces no chunks, so no special case is needed.
    for chunk in re.findall(quote_match, string):
        pattern = quote_split if "\"" in chunk else custom_match
        tokens.extend(re.findall(pattern, chunk))
    return tokens

In [242]:
# Sample input mixing bare words, an operator, a quoted phrase and a digit.
test = "hello =\"good world!\" goodbye = 0"

In [243]:
# The quoted phrase should come back as one token between two quote tokens.
word_split(test)

['hello', '=', '"', 'good world!', '"', 'goodbye', '=', '0']