In [1]:
import tensorflow_decision_forests as tfdf
import tensorflow as tf
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split

import re

tf.experimental.numpy.experimental_enable_numpy_behavior()

2024-01-29 19:39:15.203366: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-29 19:39:15.203455: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-29 19:39:15.258304: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-29 19:39:15.384168: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
data = pd.read_csv(
    "./1-100 plus a and b.csv",
    names=["Code", "Assembly"])

start_char = "Ø"
end_char = "⁂"
numerical_char = "✦"

# Constrain data to constants or functions on a single variable, using the variable once
data["Code"] = data["Code"].apply(lambda x: x.replace("int func(int a, int b)","int func()"))
data = data[~data["Code"].str.contains(r' b |b;|a . a',regex=True)]
# Fix function headers
data["Code"] = data["Code"].apply(lambda x: x.replace("int func()","int func()") if re.search(r' a |a;',x) else x)
# Add spaces around punctuation
data["Code"] = data["Code"].apply(lambda x: re.sub(r'([\{\};\(\)\,])', r' \1 ', x))
# normalize variable name to "variable"
data["Code"] = data["Code"].apply(lambda x: x.replace(" a "," variable "))
# Whitelist certain operators from the training set
data = data[data["Code"].str.contains(r' \+ | \- | \* ', regex=True)]
# pull digits for training
data["Code Digits"] = data["Code"].apply(lambda x: re.findall(r'\d+', x))
# Remove features present in every program. There is not enough data for the model to understand what these features should mean
data["Code"] = data["Code"].apply(lambda x: re.sub(r' func| \{| \}| \(| \)|int| ;| return',"",x))

# Uses heuristics to create the operator lookup table
data["Operator"] = data["Code"].apply(lambda x: re.findall(r' [\+\-%*\/] ',x)[0])
# Creates the lookup table from the Code templates and the processed assembly
operator_lookup = data["Operator"].drop_duplicates().values.tolist()
# Gets the operator index for each code sample
data["Mapped Operator"] = data["Operator"].apply(lambda x: operator_lookup.index(x))

# Strip the excess
data["Code"] = data["Code"].apply(lambda x: x.replace("variable variable","variable"))
# Adds positional data to the encodings
def add_positiong_to_tokens(code):
    tokens = code.split()
    for i in range(len(tokens)):
        tokens[i] = f"{i}_{tokens[i]}"
    return " ".join(tokens)
data["Code"] = data["Code"].apply(lambda x: add_positiong_to_tokens(x))

# Convert assembly to "templates" which don't contain constant numbers derived from the code.
# This vastly reduces the number of possible outputs for a given code line.
# The model will manually fill in the template using data from the code after it has compiled it
r_assembly_digit = r'(?<= )[\-]?\d+'
data["Assembly Digits"] = data["Assembly"].apply(lambda x: re.findall(r_assembly_digit, x))
data["Assembly Templates"] = data["Assembly"].apply(lambda x: re.sub(r_assembly_digit, numerical_char, x)) + f"\n{end_char}"

data

Unnamed: 0,Code,Assembly,Code Digits,Operator,Mapped Operator,Assembly Digits,Assembly Templates
0,0_1 1_+ 2_1,"func(int, int):\npush rbp\nmov rbp, rsp\nmov D...","[1, 1]",+,0,[2],"func(int, int):\npush rbp\nmov rbp, rsp\nmov D..."
1,0_1 1_- 2_1,"func(int, int):\npush rbp\nmov rbp, rsp\nmov D...","[1, 1]",-,1,[0],"func(int, int):\npush rbp\nmov rbp, rsp\nmov D..."
2,0_1 1_* 2_1,"func(int, int):\npush rbp\nmov rbp, rsp\nmov D...","[1, 1]",*,2,[1],"func(int, int):\npush rbp\nmov rbp, rsp\nmov D..."
5,0_1 1_+ 2_2,"func(int, int):\npush rbp\nmov rbp, rsp\nmov D...","[1, 2]",+,0,[3],"func(int, int):\npush rbp\nmov rbp, rsp\nmov D..."
6,0_1 1_- 2_2,"func(int, int):\npush rbp\nmov rbp, rsp\nmov D...","[1, 2]",-,1,[-1],"func(int, int):\npush rbp\nmov rbp, rsp\nmov D..."
...,...,...,...,...,...,...,...
50479,0_variable 1_- 2_98,"func(int, int):\npush rbp\nmov rbp, rsp\nmov D...",[98],-,1,[98],"func(int, int):\npush rbp\nmov rbp, rsp\nmov D..."
50480,0_variable 1_* 2_98,"func(int, int):\npush rbp\nmov rbp, rsp\nmov D...",[98],*,2,[98],"func(int, int):\npush rbp\nmov rbp, rsp\nmov D..."
50483,0_variable 1_+ 2_99,"func(int, int):\npush rbp\nmov rbp, rsp\nmov D...",[99],+,0,[99],"func(int, int):\npush rbp\nmov rbp, rsp\nmov D..."
50484,0_variable 1_- 2_99,"func(int, int):\npush rbp\nmov rbp, rsp\nmov D...",[99],-,1,[99],"func(int, int):\npush rbp\nmov rbp, rsp\nmov D..."


In [4]:
gen_data = pd.concat([data["Code"],data["Assembly Templates"]],axis=1)
gen_data = gen_data.drop_duplicates()

gen_data.reset_index(drop=True)

gen_data

Unnamed: 0,Code,Assembly Templates
0,0_1 1_+ 2_1,"func(int, int):\npush rbp\nmov rbp, rsp\nmov D..."
1,0_1 1_- 2_1,"func(int, int):\npush rbp\nmov rbp, rsp\nmov D..."
2,0_1 1_* 2_1,"func(int, int):\npush rbp\nmov rbp, rsp\nmov D..."
5,0_1 1_+ 2_2,"func(int, int):\npush rbp\nmov rbp, rsp\nmov D..."
6,0_1 1_- 2_2,"func(int, int):\npush rbp\nmov rbp, rsp\nmov D..."
...,...,...
50479,0_variable 1_- 2_98,"func(int, int):\npush rbp\nmov rbp, rsp\nmov D..."
50480,0_variable 1_* 2_98,"func(int, int):\npush rbp\nmov rbp, rsp\nmov D..."
50483,0_variable 1_+ 2_99,"func(int, int):\npush rbp\nmov rbp, rsp\nmov D..."
50484,0_variable 1_- 2_99,"func(int, int):\npush rbp\nmov rbp, rsp\nmov D..."


In [5]:
set_length = 0
for a in gen_data["Assembly Templates"]:
    set_length += len(a.split("\n"))

code_context = np.empty(shape=(set_length),dtype=object)
gen_context = np.empty(shape=(set_length),dtype=object)
labels = np.empty(shape=(set_length),dtype=int)
gen_data.reset_index(drop=True)

assembly_lookup = []

data_i = 0
for ri, row in gen_data.iterrows():
    assembly = row["Assembly Templates"]
    code = row["Code"]

    tokenized_code = code.split()
    tokenized = assembly.split("\n")

    for ti in range(len(tokenized)):
        if (tokenized[ti] != end_char):
            tokenized[ti] = f"{ti}_{tokenized[ti]}"

        t = tokenized[ti]

        if (not (t in assembly_lookup)):
            assembly_lookup += [t]

        code_context[data_i] = code
        gen_context[data_i] = "\n".join(tokenized[:ti])
        labels[data_i] = assembly_lookup.index(tokenized[ti])
        data_i += 1

gen_dataset = tf.data.Dataset.from_tensor_slices(({"code": code_context, "assembly": gen_context},labels)).batch(1000)

2024-01-29 19:40:23.680927: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:3b:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-01-29 19:40:23.681121: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:3b:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-01-29 19:40:23.681167: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:3b:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-01-29 19:40:27.836414: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:3b:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-01-29 19:40:27.836634: I external/local_xla/xla/stream_executor

In [7]:
def prepare_dataset(features, labels):
  features = {"code": tf.strings.split(features["code"]),"assembly": tf.strings.split(features["assembly"],sep="\n")}
  return features, labels

gen_dataset = gen_dataset.map(prepare_dataset)

In [13]:
#tuner = tfdf.tuner.RandomSearch(num_trials=50, use_predefined_hps=True)
# We want to overfit, since this is a compilation problem and we are training on all the inputs.
gen_model = tfdf.keras.GradientBoostedTreesModel(validation_ratio=0.0)
gen_model.fit(gen_dataset, num_trees = 90, verbose=2)

Use /tmp/tmp63wmb2k3 as temporary training directory








Reading training dataset...
Training tensor examples:
Features: {'code': tf.RaggedTensor(values=Tensor("data:0", shape=(None,), dtype=string), row_splits=Tensor("data_1:0", shape=(None,), dtype=int64)), 'assembly': tf.RaggedTensor(values=Tensor("data_2:0", shape=(None,), dtype=string), row_splits=Tensor("data_3:0", shape=(None,), dtype=int64))}
Label: Tensor("data_4:0", shape=(None,), dtype=int64)
Weights: None
Normalized tensor features:
 {'code': SemanticTensor(semantic=<Semantic.CATEGORICAL_SET: 4>, tensor=tf.RaggedTensor(values=Tensor("data:0", shape=(None,), dtype=string), row_splits=Tensor("data_1:0", shape=(None,), dtype=int64))), 'assembly': SemanticTensor(semantic=<Semantic.CATEGORICAL_SET: 4>, tensor=tf.RaggedTensor(values=Tensor("data_2:0", shape=(None,), dtype=string), row_splits=Tensor("data_3:0", shape=(None,), dtype=int64)))}
Training dataset read in 0:00:01.044290. Found 270757 examples.
Training model...


[INFO 24-01-29 19:48:10.2072 EST kernel.cc:771] Start Yggdrasil model training
[INFO 24-01-29 19:48:10.2072 EST kernel.cc:772] Collect training examples
[INFO 24-01-29 19:48:10.2072 EST kernel.cc:785] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: CATEGORICAL
  categorial {
    min_vocab_frequency: 0
    max_vocab_count: -1
  }
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

[INFO 24-01-29 19:48:10.2073 EST kernel.cc:391] Number of batches: 271
[INFO 24-01-29 19:48:10.2073 EST kernel.cc:392] Number of examples: 270757
[INFO 24-01-29 19:48:10.4100 EST kernel.cc:792] Training dataset:
Number of records: 270757
Number of columns: 3

Number of columns by type:
	CATEGORICAL_SET: 2 (66.6667%)
	CATEGORICAL: 1 (33.3333%)

Columns:

CATEGORICAL_SET: 2 (66.6667%)
	1: "assembly" CATEGORICAL_SET num-nas:29

Model trained in 1:27:18.539697
Compiling model...
Model compiled.


<keras.src.callbacks.History at 0x7f27424f6560>

In [63]:
def generate_line(code, context = ""):
    return np.argmax(gen_model({"code":tf.strings.split([code]), "assembly":tf.strings.split([context],sep="\n")})[0])

def generate_template(code, sanity=50):
    code = re.sub(r'([\{\};\(\)\,])', r' \1 ', code)
    interim = ""
    while not interim.endswith(end_char + "\n") and sanity > 0:
        interim += assembly_lookup[generate_line(code,interim)] + "\n"
        sanity -= 1
    return interim

In [23]:
def sample_and_save_dataset(percentage = 1):
    sampled_data = data[::round(1/percentage)]
    time_str = datetime.now().strftime('%m-%d-%Y--%H-%M-%S')
    file_loc = f"datasets/{time_str}.pkl"
    sampled_data.to_pickle(file_loc)
    print(f"Saved to '{file_loc}'")
    return sampled_data

def load_dataset(location):
    return pd.read_pickle(location)

# sampled_data = sample_and_save_dataset(0.5)

# train/validation = 80%, test = 20%
train_data, test_data = train_test_split(data, test_size=0.2)

In [24]:
#x_input = tf.keras.Input(shape=(1,),dtype=tf.string)
#x = code_vectorizer(x_input)
#x = tf.keras.layers.Embedding(label_length,class_count)(x)
#x = tf.keras.layers.GRU(class_count)(x)
#x = tf.keras.layers.Dense(class_count, activation='softmax')(x)

#model = tf.keras.Model(inputs=x_input,outputs=x)

pf = pd.concat([data["Code"],data["Mapped Operator"]],axis=1)
pf = pf.rename(columns={"Code": "features", "Mapped Operator": "labels"})
tf_dataset = tfdf.keras.pd_dataframe_to_tf_dataset(pf, label="labels")

def prepare_dataset(features, labels):
  features = {"features": tf.strings.split(features["features"])}
  return features, labels

tf_dataset = tf_dataset.map(prepare_dataset)

op_model = tfdf.keras.RandomForestModel(num_trees=50,verbose=2)
op_history = op_model.fit(tf_dataset)

Use 8 thread(s) for training
Use /tmp/tmp1uxwmslg as temporary training directory
Reading training dataset...
Training tensor examples:
Features: {'features': tf.RaggedTensor(values=Tensor("data:0", shape=(None,), dtype=string), row_splits=Tensor("data_1:0", shape=(None,), dtype=int64))}
Label: Tensor("data_2:0", shape=(None,), dtype=int64)
Weights: None
Normalized tensor features:
 {'features': SemanticTensor(semantic=<Semantic.CATEGORICAL_SET: 4>, tensor=tf.RaggedTensor(values=Tensor("data:0", shape=(None,), dtype=string), row_splits=Tensor("data_1:0", shape=(None,), dtype=int64)))}
Training dataset read in 0:00:00.787228. Found 29997 examples.
Training model...


[INFO 24-01-30 14:23:31.5866 EST kernel.cc:771] Start Yggdrasil model training
[INFO 24-01-30 14:23:31.5889 EST kernel.cc:772] Collect training examples
[INFO 24-01-30 14:23:31.5896 EST kernel.cc:785] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: CATEGORICAL
  categorial {
    min_vocab_frequency: 0
    max_vocab_count: -1
  }
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

[INFO 24-01-30 14:23:31.5975 EST kernel.cc:391] Number of batches: 30
[INFO 24-01-30 14:23:31.5975 EST kernel.cc:392] Number of examples: 29997
[INFO 24-01-30 14:23:31.6127 EST kernel.cc:792] Training dataset:
Number of records: 29997
Number of columns: 2

Number of columns by type:
	CATEGORICAL_SET: 1 (50%)
	CATEGORICAL: 1 (50%)

Columns:

CATEGORICAL_SET: 1 (50%)
	1: "features" CATEGORICAL_SET has-dict vocab-size:204 zero

Model trained in 0:00:01.318909
Compiling model...
Model compiled.


In [25]:
def classify_operator(code):
    return np.argmax(op_model.call({"features": tf.strings.split([code])})[0])

In [49]:
### OPTIMIZATION ###

### Here, it generates a model per operator. These train off the data as well
### This number is data dependent

operator_models = [None] * len(operator_lookup)

early_stopping = tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)

for ri in range(len(operator_lookup)):

    operator_models[ri] = tf.keras.Sequential([
        tf.keras.layers.Dense(256),
        tf.keras.layers.Dense(256),
        tf.keras.layers.Dense(256),
        tf.keras.layers.Dense(3)
    ])
    operator_models[ri].compile(loss="mse",optimizer="adam")

    relevant_data = data[data["Mapped Operator"] == ri]

    relevant_data["Code Digits"] = relevant_data["Code Digits"].apply(lambda x: x + ([0] * (3-len(x))))
    relevant_data["Assembly Digits"] = relevant_data["Assembly Digits"].apply(lambda x: x + ([0] * (3-len(x))))

    inputs = np.stack(relevant_data["Code Digits"].to_numpy()).astype(int)
    outputs = np.stack(relevant_data["Assembly Digits"].to_numpy()).astype(int)

    operator_models[ri].fit(x=inputs, y=outputs, validation_split=0.1, epochs=100, callbacks=[early_stopping])


Epoch 1/100


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_data["Code Digits"] = relevant_data["Code Digits"].apply(lambda x: x + ([0] * (3-len(x))))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_data["Assembly Digits"] = relevant_data["Assembly Digits"].apply(lambda x: x + ([0] * (3-len(x))))


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 1/100


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_data["Code Digits"] = relevant_data["Code Digits"].apply(lambda x: x + ([0] * (3-len(x))))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_data["Assembly Digits"] = relevant_data["Assembly Digits"].apply(lambda x: x + ([0] * (3-len(x))))


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 1/100


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_data["Code Digits"] = relevant_data["Code Digits"].apply(lambda x: x + ([0] * (3-len(x))))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_data["Assembly Digits"] = relevant_data["Assembly Digits"].apply(lambda x: x + ([0] * (3-len(x))))


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100


In [19]:
def compile_numbers(code_numbers, operator):
    output = operator_models[operator](np.array([code_numbers]).astype(int))[0][0]
    return round(output)

In [20]:
def splice_numbers_into_assembly(template,digit):
    return template.replace(numerical_char,str(digit.numpy().astype(int)))

In [42]:
def remove_prefixes(template):
    lines = template.split('\n')
    lines = [(line.split('_')[1] if ("_" in line) else line) for line in lines]
    return "\n".join(lines)

In [64]:
def compile_code(code):
    digits = re.findall(r'\d+', code)
    digits = digits + ([0] * (3 - len(digits)))

    tokens = code.split()
    for i in range(len(tokens)):
        tokens[i] = f"{i}_{tokens[i]}"
    code = " ".join(tokens)

    operator_n = classify_operator(code)
    final_digit = compile_numbers(digits,operator_n)
    assembly_template = generate_template(code)
    spliced_assembly = splice_numbers_into_assembly(assembly_template, final_digit)
    return remove_prefixes(spliced_assembly)

In [78]:
print(compile_code("384"))

push rbp
mov rbp, rsp
mov DWORD PTR [rbp-4], edi
mov DWORD PTR [rbp-8], esi
mov eax, 384
pop rbp
ret
⁂

