In [None]:
# %env TOKENIZERS_PARALLELISM=false
# Set the WANDB_PROJECT environment variable for this kernel (the Trainer below reports to wandb).
%env WANDB_PROJECT=O4

In [None]:
import gym
import compiler_gym                      # imports the CompilerGym environments
from compiler_gym.envs.llvm.datasets import CBenchDataset

import numpy as np
import pandas as pd
from torch import nn
import torch

from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification, AutoModelForPreTraining, RobertaForSequenceClassification
from transformers import Trainer
from transformers import PreTrainedTokenizerFast, BertTokenizerFast, RobertaTokenizerFast
from transformers import DataCollatorWithPadding

from tokenizers import ByteLevelBPETokenizer, BertWordPieceTokenizer
from tokenizers.processors import BertProcessing

from datasets import Dataset

CompilerGym comes with many environments.

In [None]:
# Display the environments that CompilerGym provides.
compiler_gym.COMPILER_GYM_ENVS

We are solving the phase-ordering problem.

In [None]:
# Create the environment used throughout the notebook.
# NOTE(review): "llvm-ic-v0" is presumably the LLVM environment with an
# instruction-count reward — confirm against the CompilerGym docs.
env = gym.make("llvm-ic-v0")

Each action applies one of the many available optimization passes.

In [None]:
# Start a new compilation session.
env.reset()

In [None]:
# Display the benchmark (program) the environment is currently using.
env.benchmark

In [None]:
# Display the action space (the optimization passes that can be applied).
env.action_space

At each step, you "observe" a string which contains the IR.

In [None]:
# Display the environment's default observation space.
env.observation_space

Check which benchmark (program) is being used.

In [None]:
# Display the benchmark (program) the environment is currently using.
env.benchmark

In [None]:
# Walk through one interaction: reset the environment, then take a single random action.
env.reset()                              # starts a new compilation session
# env.render()                             # prints the IR of the program
env.step(env.action_space.sample())      # applies a random optimization, updates state/reward/actions
# env.close()                              # closes the environment, freeing resources
# env.observation["Ir"]

In [None]:
# Display the observation view; later cells index it by name,
# e.g. env.observation['Inst2vecPreprocessedText'].
env.observation

## Tokenizer

In [None]:
# Load the pre-built tokenizer from tokenizer.json.
# NOTE(review): max_len_single_sentence is normally a derived property of the
# tokenizer rather than a constructor argument, so this value may have no
# effect; the intended argument is probably model_max_length — confirm.
tokenizer = PreTrainedTokenizerFast(tokenizer_file='tokenizer.json', max_len_single_sentence=1024)
# Register every action name as its own token so that an action prepended to
# the program text is encoded as a single token.
tokenizer.add_tokens(env.action_space.names)
# Declare the special tokens; pad_token is required for the padding done later.
tokenizer.add_special_tokens({
    'cls_token': '[CLS]',
    'pad_token': '[PAD]',
    'sep_token': '[SEP]',
})

In [None]:
# Smoke test: encode a randomly chosen action name together with the current
# Inst2vec-preprocessed program text.
# NOTE(review): here the action and the text are passed as two separate
# arguments with max_length=1024, whereas dataset_sampler/preprocess below
# concatenate them into one sequence and truncate at 512 — confirm which
# encoding is intended.
act = env.action_space.names[env.action_space.sample()]
tokenized = tokenizer([act], env.observation['Inst2vecPreprocessedText'],
          is_split_into_words=True,
          padding=True,
          truncation=True,
          max_length=1024,
          return_tensors="pt")

## Datasets

In [None]:
# Defaults for dataset_sampler: number of episodes to roll out ...
SAMPLES = 10
# ... and the maximum number of random actions applied per episode.
PHASES = 5

In [None]:
def dataset_sampler(samples=SAMPLES, phases=PHASES):
    """Yield ``(tokens, reward)`` pairs from random rollouts of the global ``env``.

    Runs ``samples`` episodes. Each episode resets ``env`` and applies up to
    ``phases`` randomly sampled actions. After every step that does not end
    the episode, one pair is yielded:

    * ``tokens`` -- the action's name (``env.action_space.to_string(action)``)
      prepended to ``env.observation['Inst2vecPreprocessedText']``.
    * ``reward`` -- the reward returned by ``env.step`` for that action.

    A step that reports ``done`` ends the episode without yielding a pair.

    Args:
        samples: number of episodes to roll out.
        phases: maximum number of actions applied per episode.
    """
    for _ in range(samples):
        env.reset()
        for _ in range(phases):
            action = env.action_space.sample()
            _obs, reward, done, _info = env.step(action)
            if done:
                break
            action_name = env.action_space.to_string(action)
            # Read the observation after the step so it reflects the applied pass.
            tokens = env.observation['Inst2vecPreprocessedText']
            yield [action_name] + tokens, reward

In [None]:
# Earlier schema with a separate "action" column, kept for reference:
# train_samples = [{"action": a, "text": t, "reward": r} for a, t, r in dataset_sampler(64, 8)]
# eval_samples = [{"action": a, "text": t, "reward": r} for a, t, r in dataset_sampler(8, 8)]

# Materialize the rollouts as lists of {"text", "label"} records:
# 64 episodes for training and 8 for evaluation, each with up to 8 steps.
train_samples = [{"text": x, "label": y} for x, y in dataset_sampler(64, 8)]
eval_samples = [{"text": x, "label": y} for x, y in dataset_sampler(8, 8)]

In [None]:
# Column-oriented view of the samples: {"text": [...], "label": [...]}, one
# list entry per sample. Flattening the records key by key would keep only the
# last sample, because every record shares the same two keys.
train_dict = {key: [sample[key] for sample in train_samples] for key in ("text", "label")}
eval_dict = {key: [sample[key] for sample in eval_samples] for key in ("text", "label")}

In [None]:
# Build one DataFrame per split from the sample records, with every column
# cast to the generic object dtype.
# NOTE(review): the object cast is presumably there for the list-valued
# "text" column — confirm it is still needed.
train_df = pd.DataFrame(train_samples).astype('object')
eval_df = pd.DataFrame(eval_samples).astype('object')
# Preview the first rows.
train_df.head()

In [None]:
# Alternative construction from the column dicts, currently disabled:
# train_ds = Dataset.from_dict(train_dict)
# eval_ds = Dataset.from_dict(eval_dict)

# Convert the DataFrames into Hugging Face Datasets.
train_ds = Dataset.from_pandas(train_df)
eval_ds = Dataset.from_pandas(eval_df)

In [None]:
def preprocess(example):
    """Tokenize the ``'text'`` field of a batch with the global ``tokenizer``.

    The text is passed as already-split words, padded, and truncated to at
    most 512 tokens. Returns whatever the tokenizer call returns.
    """
    encode_options = dict(
        is_split_into_words=True,
        padding=True,
        truncation=True,
        max_length=512,
    )
    return tokenizer(example['text'], **encode_options)

# Tokenize both splits in batches.
tokenized_train = train_ds.map(preprocess, batched=True)
tokenized_eval = eval_ds.map(preprocess, batched=True)

# Expose as torch tensors only the columns the model consumes. attention_mask
# must be included: preprocess pads each batch, and without the mask the model
# would attend to the padding positions as if they were real tokens.
columns = ['input_ids', 'token_type_ids', 'attention_mask', 'label']
tokenized_train.set_format(type='torch', columns=columns)
tokenized_eval.set_format(type='torch', columns=columns)

In [None]:
# Sanity check: shape of the first training example's input_ids tensor.
tokenized_train[0]['input_ids'].shape

## Model

In [None]:
# Load the CodeBERT MLM checkpoint with a sequence-classification head that has
# a single output (num_labels=1); CostModelTrainer trains it against the
# reward with an MSE loss.
model = RobertaForSequenceClassification.from_pretrained('microsoft/codebert-base-mlm', num_labels=1)
# Resize the embedding matrix to the tokenizer's vocabulary size, which
# includes the action-name tokens added to the tokenizer.
model.resize_token_embeddings(len(tokenizer))

## Training

In [None]:
# Collator that pads each training batch using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
class CostModelTrainer(Trainer):
    """Trainer that fits the model's logits to the reward labels with an MSE loss."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the mean-squared error between the model's logits and the labels.

        Args:
            model: the model to run; called as ``model(**inputs)``.
            inputs: batch mapping that contains a ``"labels"`` entry.
            return_outputs: when true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: accepted for compatibility with Trainer
                versions that pass this keyword to ``compute_loss``; it is not
                used, because the loss is a plain mean over the batch.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits')
        # Read num_labels from self.model rather than from the `model`
        # argument, as the original code did.
        num_labels = self.model.config.num_labels
        loss_fct = nn.MSELoss()
        loss = loss_fct(logits.view(-1, num_labels),
                        labels.float().view(-1, num_labels))
        return (loss, outputs) if return_outputs else loss

In [None]:
# NOTE(review): Trainer is already imported at the top of the notebook;
# TrainingArguments could be imported there as well.
from transformers import TrainingArguments, Trainer

# Hyperparameters for fine-tuning the cost model; metrics are reported to
# Weights & Biases under the run name below.
# NOTE(review): no evaluation schedule is configured here, so eval_dataset is
# presumably only used when evaluate() is called explicitly — confirm.
training_args = TrainingArguments(
    output_dir="../results/cost",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=5,
    weight_decay=0.01,
    report_to="wandb",
    run_name="codebert-llvm-ic-v0",
)

# Use the custom trainer so the MSE loss from CostModelTrainer.compute_loss is applied.
trainer = CostModelTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Run fine-tuning.
trainer.train()