In [None]:
%env TOKENIZERS_PARLLELISM=false
%env WANDB_PROJECT=O4

run_name="bert-base-high-lr"

In [None]:
import gym
import compiler_gym                      # imports the CompilerGym environments

import pandas as pd
# from torch.utils.data import Dataset, DataLoader
from torch import nn

from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification, AutoModelForPreTraining, RobertaForSequenceClassification
from transformers import Trainer
from transformers import PreTrainedTokenizerFast, BertTokenizerFast, RobertaTokenizerFast
from tokenizers import ByteLevelBPETokenizer, BertWordPieceTokenizer
from tokenizers.processors import BertProcessing
from datasets import Dataset

CompilerGym comes with many environments.

In [None]:
# Bare expression: the value is shown as the cell output.
compiler_gym.COMPILER_GYM_ENVS

We are solving phase ordering.

In [None]:
# Module-level environment shared by every cell and sampler function below.
env = gym.make("llvm-ic-v0")

Each action applies one of the many available optimization passes.

In [None]:
# Inspect the action space; actions are later drawn from it with `.sample()`.
env.action_space

At each step, you "observe" a string which contains the IR.

In [None]:
# Inspect the environment's default observation space.
env.observation_space

Check which benchmark (program) is being used.

In [None]:
# Show the benchmark currently attached to the environment.
env.benchmark

In [None]:
# Smoke test: start a session and apply one randomly sampled action.
# The result of `env.step(...)` is the last expression, so it is displayed
# as the cell output.
env.reset()                              # starts a new compilation session
# env.render()                             # prints the IR of the program
env.step(env.action_space.sample())      # applies a random optimization, updates state/reward/actions
# env.close()                              # closes the environment, freeing resources
# env.observation["Ir"]

In [None]:
# Observation accessor; individual observations are read by key,
# e.g. env.observation['Ir'] as done in the sampler functions below.
env.observation

## Dataset

In [None]:
# Default number of episodes (environment resets) used by the sampler functions.
SAMPLES = 10
# Default maximum number of random actions applied within one episode.
PHASES = 5

In [None]:
def sampler(samples=SAMPLES, phases=PHASES):
    """Yield IR strings observed while applying random actions.

    Uses the module-level ``env``. For each of ``samples`` episodes the
    environment is reset and up to ``phases`` randomly sampled actions are
    applied. After every step that does not end the episode, the current
    ``env.observation['Ir']`` value is yielded.

    Args:
        samples: Number of episodes (environment resets) to run.
        phases: Maximum number of actions applied per episode.

    Yields:
        The ``'Ir'`` observation after each non-terminal step.
    """
    for _ in range(samples):
        env.reset()
        for _ in range(phases):
            action = env.action_space.sample()
            # Only the `done` flag of the step result is needed here.
            _, _, done, _ = env.step(action)
            if done:
                # The episode ended; no observation is emitted for this step.
                break
            yield env.observation['Ir']

In [None]:
# tokenizer = ByteLevelBPETokenizer()
# # tokenizer = AutoTokenizer.from_pretrained("huggingface/CodeBERTa-small-v1")
# tokenizer.train_from_iterator(sampler(), special_tokens=["[SEP]", "[CLS]"])
# tokenizer.save("vocab/tokenizer.json")
# tokenizer.save_model("vocab/")

In [None]:
# # Prepare the tokenizer
# tokenizer._tokenizer.post_processor = BertProcessing(("[SEP]", tokenizer.token_to_id("[SEP]")), ("[CLS]", tokenizer.token_to_id("[CLS]")),)
# tokenizer.enable_truncation(max_length=512)

In [None]:
# Load the fast RoBERTa tokenizer from the 'microsoft/codebert-base-mlm' checkpoint.
tokenizer = RobertaTokenizerFast.from_pretrained('microsoft/codebert-base-mlm')

## Preprocessing

In [None]:
from transformers import DataCollatorWithPadding
# Collator built around the tokenizer; it is passed to the trainer as
# `data_collator` in the Training section.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
def dataset_sampler(samples=SAMPLES, phases=PHASES):
    """Generate ``(text, label)`` pairs from random action sequences.

    Runs ``samples`` episodes on the module-level ``env``; each episode is
    reset and then driven by up to ``phases`` randomly sampled actions.
    For every step that does not end the episode, a pair is produced where
    ``text`` is the action's string form immediately followed by the
    ``'Ir'`` observation, and ``label`` is the reward returned by the step.

    NOTE(review): the IR is read after the step, so the text pairs the
    action with the post-action IR, and nothing separates the action name
    from the IR — confirm both are intended.

    Args:
        samples: Number of episodes (environment resets) to run.
        phases: Maximum number of actions applied per episode.

    Yields:
        Tuples ``(text, label)`` as described above.
    """
    for _episode in range(samples):
        env.reset()
        for _step in range(phases):
            chosen = env.action_space.sample()
            _observation, reward, done, _info = env.step(chosen)
            if done:
                # Stop this episode without emitting a pair for the final step.
                break
            yield env.action_space.to_string(chosen) + env.observation['Ir'], reward

In [None]:
# Roll out random episodes: 1000 for training and 200 for validation,
# each capped at 20 steps. Training is sampled first, then validation.
train_samples = [dict(text=text, label=reward) for text, reward in dataset_sampler(1000, 20)]
valid_samples = [dict(text=text, label=reward) for text, reward in dataset_sampler(200, 20)]

# One row per (text, label) pair.
train_df = pd.DataFrame(data=train_samples)
valid_df = pd.DataFrame(data=valid_samples)

# Preview the first rows as the cell output.
train_df.head()

In [None]:
# Convert the pandas frames into `datasets.Dataset` objects for tokenization.
train_ds = Dataset.from_pandas(train_df)
valid_ds = Dataset.from_pandas(valid_df)

In [None]:
def preprocess(example):
    """Tokenize the ``'text'`` field with the module-level ``tokenizer``.

    The tokenizer is called with padding and truncation both enabled, and
    its result is returned unchanged.

    Args:
        example: Mapping with a ``'text'`` entry (a batch of texts when
            used with ``map(..., batched=True)``).

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    text = example['text']
    encoded = tokenizer(text, truncation=True, padding=True)
    return encoded

# Columns exposed as torch tensors; any other column is hidden by set_format.
columns = ['input_ids', 'label', 'attention_mask']

# Training split: tokenize in batches, then switch the output format to torch.
tokenized_train = train_ds.map(preprocess, batched=True)
tokenized_train.set_format(type='torch', columns=columns)

# Validation split: same treatment.
tokenized_valid = valid_ds.map(preprocess, batched=True)
tokenized_valid.set_format(type='torch', columns=columns)

## Model

In [None]:
# model = AutoModelForSequenceClassification.from_pretrained("huggingface/CodeBERTa-small-v1")
# model.resize_token_embeddings(len(tokenizer))

In [None]:
# tokenizer = AutoTokenizer.from_pretrained("huggingface/CodeBERTa-small-v1")
# tokenizer = PreTrainedTokenizerFast(tokenizer_file="vocab/tokenizer.json")
# tokenizer = RobertaTokenizerFast.from_pretrained('vocab', max_len=512)
# tokenizer = BertTokenizerFast(vocab_file="vocab/vocab.json")
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Single output (num_labels=1): CostModelTrainer below fits it with an MSE
# loss against the float-cast `label` values.
model = RobertaForSequenceClassification.from_pretrained('microsoft/codebert-base-mlm', num_labels=1)

## Training

In [None]:
class CostModelTrainer(Trainer):
    """Trainer that fits the model as a regressor using mean-squared-error loss."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the MSE between the model's logits and the float labels.

        Args:
            model: The model to run the forward pass on.
            inputs: Batch mapping; must contain ``"labels"`` alongside the
                model's input tensors. It is not modified.
            return_outputs: When true, return ``(loss, outputs)`` instead
                of just the loss.
            **kwargs: Extra keyword arguments are accepted and ignored, so
                the override keeps working with ``Trainer`` versions that
                pass additional keywords (e.g. ``num_items_in_batch``).

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.get("labels")
        # Run the forward pass without the labels: Hugging Face
        # classification heads compute their own loss when labels are
        # passed, and that loss would be discarded here anyway. A filtered
        # copy is used so the caller's `inputs` is left untouched.
        features = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**features)
        logits = outputs.get('logits')
        # Read the config from self.model, as the original code did, rather
        # than from the `model` argument.
        num_labels = self.model.config.num_labels
        loss_fct = nn.MSELoss()
        loss = loss_fct(logits.view(-1, num_labels),
                        labels.float().view(-1, num_labels))
        return (loss, outputs) if return_outputs else loss

In [None]:
from transformers import TrainingArguments, Trainer

# Optimisation hyper-parameters first, then output and reporting options.
# NOTE(review): the `run_name` variable from the first cell is not used
# here; this literal is what gets passed — confirm which is intended.
# NOTE(review): no evaluation schedule is configured, so `eval_dataset` is
# presumably only used if `trainer.evaluate()` is called — confirm.
training_args = TrainingArguments(
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    output_dir="./results",
    report_to="wandb",
    run_name="codebert-llvm-ic-v0",
)

# Custom trainer (MSE loss) wired to the tokenized splits and the padding collator.
trainer = CostModelTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
)

trainer.train()
