In [18]:
# %env TOKENIZERS_PARLLELISM=false
%env WANDB_PROJECT=O4

env: WANDB_PROJECT=O4


In [19]:
import gym
import compiler_gym                      # imports the CompilerGym environments
from compiler_gym.envs.llvm.datasets import CBenchDataset

import numpy as np
import pandas as pd
from torch import nn
import torch

from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification, AutoModelForPreTraining, RobertaForSequenceClassification
from transformers import Trainer
from transformers import PreTrainedTokenizerFast, BertTokenizerFast, RobertaTokenizerFast

import tokenizers
from tokenizers import ByteLevelBPETokenizer, BertWordPieceTokenizer
from tokenizers.processors import BertProcessing, RobertaProcessing

from datasets import Dataset

Compiler gym comes with many environments.

In [20]:
compiler_gym.COMPILER_GYM_ENVS

['gcc-v0',
 'llvm-v0',
 'llvm-ic-v0',
 'llvm-codesize-v0',
 'llvm-autophase-ic-v0',
 'llvm-autophase-codesize-v0',
 'llvm-ir-ic-v0',
 'llvm-ir-codesize-v0',
 'loop_tool-v0']

We are solving phase ordering.

In [21]:
env = gym.make("llvm-ic-v0")

The actions you can take are applying one among many different optimization passes.

In [22]:
env.reset()

In [31]:
env.benchmark

benchmark://cbench-v1/qsort

In [7]:
env.action_space

Commandline([-add-discriminators -adce -aggressive-instcombine -alignment-from-assumptions -always-inline -argpromotion -attributor -barrier -bdce -break-crit-edges -simplifycfg -callsite-splitting -called-value-propagation -canonicalize-aliases -consthoist -constmerge -constprop -coro-cleanup -coro-early -coro-elide -coro-split -correlated-propagation -cross-dso-cfi -deadargelim -dce -die -dse -reg2mem -div-rem-pairs -early-cse-memssa -early-cse -elim-avail-extern -ee-instrument -flattencfg -float2int -forceattrs -inline -insert-gcov-profiling -gvn-hoist -gvn -globaldce -globalopt -globalsplit -guard-widening -hotcoldsplit -ipconstprop -ipsccp -indvars -irce -infer-address-spaces -inferattrs -inject-tli-mappings -instsimplify -instcombine -instnamer -jump-threading -lcssa -licm -libcalls-shrinkwrap -load-store-vectorizer -loop-data-prefetch -loop-deletion -loop-distribute -loop-fusion -loop-guard-widening -loop-idiom -loop-instsimplify -loop-interchange -loop-load-elim -loop-predicati

At each step, you "observe" a string which contains the IR.

In [8]:
env.observation_space

Check which benchmark (program) is being used.

In [9]:
env.benchmark

benchmark://cbench-v1/qsort

In [10]:
env.reset()                              # starts a new compilation session
# env.render()                             # prints the IR of the program
env.step(env.action_space.sample())      # applies a random optimization, updates state/reward/actions
# env.close()                              # closes the environment, freeing resources
# env.observation["Ir"]

(None,
 0.07430340557275542,
 False,
 {'action_had_no_effect': False, 'new_action_space': False})

In [11]:
env.observation

ObservationView[Autophase, AutophaseDict, Bitcode, BitcodeFile, Buildtime, CpuInfo, Inst2vec, Inst2vecEmbeddingIndices, Inst2vecPreprocessedText, InstCount, InstCountDict, InstCountNorm, InstCountNormDict, Ir, IrInstructionCount, IrInstructionCountO0, IrInstructionCountO3, IrInstructionCountOz, IrSha1, IsBuildable, IsRunnable, ObjectTextSizeBytes, ObjectTextSizeO0, ObjectTextSizeO3, ObjectTextSizeOz, Programl, ProgramlJson, Runtime]

## Dataset

In [12]:
SAMPLES = 10
PHASES = 5

In [13]:
def single_sampler(samples=SAMPLES, phases=PHASES):
    for _ in range(samples):
        env.reset()
        for phase in range(phases):
            action = env.action_space.sample()
            _, reward, done, info = env.step(action)
            env.action_space.to_string(action)
            if done: break
            yield "\n".join(env.observation['Inst2vecPreprocessedText'])

def sampler(samples=SAMPLES, phases=PHASES):
    for benchmark in env.datasets["cbench-v1"].benchmarks():
        print(benchmark)
        for _ in range(samples):
            env.reset(benchmark=benchmark)
            for phase in range(phases):
                action = env.action_space.sample()
                _, reward, done, info = env.step(action)
                env.action_space.to_string(action)
                if done: break
                yield env.observation['Inst2vecPreprocessedText']

In [14]:
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import CharDelimiterSplit, Sequence, Split
from tokenizers.processors import RobertaProcessing

tokenizer = Tokenizer(WordPiece(vocab=env.inst2vec.vocab))
tokenizer.pre_tokenizer = CharDelimiterSplit("\n")
# tokenizer.pre_tokenizer = Split(pattern="[INST]", behavior="removed")
tokenizer.enable_truncation(1024)

In [15]:
base_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "[INST]"]
inst_tokens = list(env.inst2vec.vocab.keys())
special_tokens = base_tokens + inst_tokens

In [16]:
trainer = WordPieceTrainer(
    special_tokens=base_tokens ,
    vocab_size = len(env.inst2vec.vocab)
)

In [17]:
# tokenizer.train_from_iterator(single_sampler(), trainer=trainer)
tokenizer.post_processor = RobertaProcessing(
    cls=("[CLS]", tokenizer.token_to_id("[CLS]")),
    sep=("[SEP]", tokenizer.token_to_id("[SEP]"))
)

TypeError: 'NoneType' object cannot be interpreted as an integer

In [110]:
sample = env.observation['Inst2vecPreprocessedText']
joined = "\n".join(sample)
tokens = tokenizer.encode(joined).tokens
len(joined), len(tokens), len(sample), 

(30470, 565, 563)

In [120]:
tokenizer.save("tokenizer.json")

## Load into Transformers

In [154]:
from transformers import PreTrainedTokenizerFast

In [250]:
fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file='tokenizer.json', max_len_single_sentence=1024)
fast_tokenizer.add_tokens(env.action_space.names)
fast_tokenizer.add_special_tokens({
    'cls_token': '[CLS]',
    'pad_token': '[PAD]',
    'sep_token': '[SEP]',
})
# fast_tokenizer.set_truncation_and_padding(
#     padding_strategy='longest',
#     truncation_strategy='longest_first',
#     stride=0,
#     max_length=1024,
#     pad_to_multiple_of=8
# )

0

In [251]:
act = env.action_space.names[env.action_space.sample()]

In [252]:
fast_tokenizer.convert_tokens_to_ids(act)

1089

In [253]:
tokenized = fast_tokenizer([act], env.observation['Inst2vecPreprocessedText'],
          is_split_into_words=True,
          padding=True,
          truncation=True,
          max_length=1024,
          return_tensors="pt")
tokenized

{'input_ids': tensor([[   1, 1089,    2,    2,  656,  656,  656,    0,  856,  855,  631,  895,
          858,  631,  828,  860,  859,  631,  857,  827,  863,  387,  387,  740,
          740,  336,  336,  582,  292,  582,  292,  294,  742,    0,  547,  645,
          294,  742,    0,  547,  645,  547,  547,  874,  330,  222,  298,  222,
          547,  547,  875,  921,  892,  298,  339,  907,  582,  292,  582,  292,
          723,  862,  366,  711,  366,    0,  889,    0,  366,  366,  366,  366,
          366,  383,  738,  383,    0,  292,    0,  753,  292,    0,  292,  436,
          292,  436,  292,  452,  436,  292,  436,  292,  436,  292,  304,  727,
          330,  222,    0,    0,   62,   89,   79,   94,   75,  514,   94,  572,
          682,   99,  747,  222,  757,  891,  294,    0,    0,  298,  339,    0,
            0,  562,  330,  222,    0,    0,  562,  330,  222,    0,    0,  562,
          330,  222,  304,  727,  298,  752,  902,  330,  222,  304,  304,  448,
            0,

## Preprocessing

In [160]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [263]:
def dataset_sampler(samples=SAMPLES, phases=PHASES):
    # for benchmark in env.datasets["cbench-v1"].benchmarks():
    #     print(benchmark)
        # for _ in range(samples):
        #     env.reset(benchmark=benchmark)
    for _ in range(samples):
        env.reset()
        for phase in range(phases):
            action = env.action_space.sample()
            _, reward, done, info = env.step(action)
            env.action_space.to_string(action)
            if done: break
            action = env.action_space.to_string(action)
            text = env.observation['Inst2vecPreprocessedText']
            label = reward
            yield  [action] + text, reward

In [264]:
# train_samples = [{"action": a, "text": t, "reward": r} for a, t, r in dataset_sampler(64, 8)]
# eval_samples = [{"action": a, "text": t, "reward": r} for a, t, r in dataset_sampler(8, 8)]

train_samples = [{"text": x, "label": y} for x, y in dataset_sampler(64, 8)]
eval_samples = [{"text": x, "label": y} for x, y in dataset_sampler(8, 8)]

In [265]:
train_dict = {k: v for d in train_samples for k, v in d.items()}
eval_dict = {k: v for d in eval_samples for k, v in d.items()}

In [266]:
train_df = pd.DataFrame(train_samples).astype('object')
eval_df = pd.DataFrame(eval_samples).astype('object')
train_df.head()

Unnamed: 0,text,label
0,"[-callsite-splitting, opaque = type opaque, op...",0.0
1,"[-prune-eh, opaque = type opaque, opaque = typ...",0.0
2,"[-loop-unroll-and-jam, opaque = type opaque, o...",-0.003096
3,"[-loop-instsimplify, opaque = type opaque, opa...",0.0
4,"[-inline, opaque = type opaque, opaque = type ...",-0.498452


In [267]:
# train_ds = Dataset.from_dict(train_dict)
# eval_ds = Dataset.from_dict(eval_dict)

train_ds = Dataset.from_pandas(train_df)
eval_ds = Dataset.from_pandas(eval_df)

In [277]:
def preprocess(example):
    return fast_tokenizer(example['text'],
                          is_split_into_words=True,
                          padding=True,
                          truncation=True,
                          max_length=1024,
                         )

tokenized_train = train_ds.map(preprocess, batched=True)
tokenized_valid = eval_ds.map(preprocess, batched=True)

columns = ['input_ids', 'token_type_ids', 'label']
tokenized_train.set_format(type='torch', columns=columns)
tokenized_valid.set_format(type='torch', columns=columns)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [271]:
type(tokenized_train)

datasets.arrow_dataset.Dataset

In [276]:
tokenized_train[0]['input_ids'].shape

torch.Size([1024])