In [1]:
import pandas as pd
import json
from transformers.models.deberta_v2.modeling_deberta_v2 import (
    DebertaV2PreTrainedModel,
    DebertaV2Model,
    SequenceClassifierOutput,
)
from transformers import AutoModel, PretrainedConfig
from transformers import AutoTokenizer

import torch
from transformers import DebertaV2Tokenizer

In [2]:
class CFG:
    TEST_CSV = "../data/test.csv"
    TEST_CSV = "../data/train.csv"
    SUB_CSV = "../data/sample_submission.csv"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_name = "llm-blender/PairRM-hf"

### Prepare data

In [3]:
df = pd.read_csv(CFG.TEST_CSV)
df

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,30192,gpt-4-1106-preview,gpt-4-0613,"[""Is it morally right to try to have a certain...","[""The question of whether it is morally right ...","[""As an AI, I don't have personal beliefs or o...",1,0,0
1,53567,koala-13b,gpt-4-0613,"[""What is the difference between marriage lice...","[""A marriage license is a legal document that ...","[""A marriage license and a marriage certificat...",0,1,0
2,65089,gpt-3.5-turbo-0613,mistral-medium,"[""explain function calling. how would you call...","[""Function calling is the process of invoking ...","[""Function calling is the process of invoking ...",0,0,1
3,96401,llama-2-13b-chat,mistral-7b-instruct,"[""How can I create a test set for a very rare ...","[""Creating a test set for a very rare category...","[""When building a classifier for a very rare c...",1,0,0
4,198779,koala-13b,gpt-3.5-turbo-0314,"[""What is the best way to travel from Tel-Aviv...","[""The best way to travel from Tel Aviv to Jeru...","[""The best way to travel from Tel-Aviv to Jeru...",0,1,0
...,...,...,...,...,...,...,...,...,...
57472,4294656694,gpt-4-0613,claude-1,"[""A simple mnemonic for \u03c0:\n\""How I wish ...","[""Sure, let's break it down:\n\n1. \""How\"" has...","[""Here is how that mnemonic represents the dig...",1,0,0
57473,4294692063,claude-2.0,llama-2-13b-chat,"[""In python, implement a naive Bayes with gaus...","[""Here is an implementation of a naive Bayes c...","[""Sure! Here's an implementation of a naive Ba...",1,0,0
57474,4294710549,claude-1,alpaca-13b,"[""is it unethical to work on building weapons?...","[""Working on weapons technology raises some et...","[""It depends on the context. Weapons can be us...",1,0,0
57475,4294899228,palm-2,tulu-2-dpo-70b,"[""If a bait contains 0,0025% bromadiolon then ...","[""Bromadiolone is a rodenticide which is most ...","[""As an AI language model, I do not promote or...",0,1,0


In [4]:
def safe_eval(s):
    try:
        return json.loads(s.replace("null", '""'))
    except (ValueError, SyntaxError, json.JSONDecodeError):
        # If parsing fails, return the input itself if it's already a list or return an empty list
        if isinstance(s, list):
            return s
        elif isinstance(s, int):
            return [str(s)]
        else:
            return []


# Convert each element to string before joining
def join_elements(elements):
    return " ".join(map(str, elements))


df["prompt"] = df.prompt.map(lambda x: join_elements(safe_eval(x)))
df["response_a"] = df.response_a.map(
    lambda x: join_elements(safe_eval(x.replace("null", "''")))
)
df["response_b"] = df.response_b.map(
    lambda x: join_elements(safe_eval(x.replace("null", "''")))
)

In [5]:
def make_pairs(row):
    prompt = row.prompt.encode("utf-8", errors="ignore").decode("utf-8")
    response_a = row.response_a.encode("utf-8", errors="ignore").decode("utf-8")
    response_b = row.response_b.encode("utf-8", errors="ignore").decode("utf-8")
    row["left"] = f"<|source|> {prompt}\n\n\n\n <|candidate|> {response_a}"
    row["right"] = f"<|source|> {prompt}\n\n\n\n <|candidate|> {response_b}"

    return row

In [6]:
df = df.apply(make_pairs, axis=1)

In [7]:
df

Unnamed: 0,id,prompt,response_a,response_b,left,right
0,136060,"I have three oranges today, I ate an orange ye...",You have two oranges today.,You still have three oranges. Eating an orange...,"<|source|> I have three oranges today, I ate a...","<|source|> I have three oranges today, I ate a..."
1,211333,You are a mediator in a heated political debat...,Thank you for sharing the details of the situa...,Mr Reddy and Ms Blue both have valid points in...,<|source|> You are a mediator in a heated poli...,<|source|> You are a mediator in a heated poli...
2,1233961,How to initialize the classification head when...,When you want to initialize the classification...,To initialize the classification head when per...,<|source|> How to initialize the classificatio...,<|source|> How to initialize the classificatio...


In [8]:
class TransformerDataset(torch.utils.data.Dataset):
    def __init__(self, ids, left, right, tokenizer, max_length):
        self.ids = ids
        self.left = left
        self.right = right
        self.tokenizer = tokenizer
        self.max_length = max_length

    def get_encoded(self, text):
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
        )

        ids = inputs["input_ids"]
        token_type_ids = inputs["token_type_ids"]
        mask = inputs["attention_mask"]
        padding_lenght = self.max_length - len(ids)

        ids = ids + ([0] * padding_lenght)
        mask = mask + ([0] * padding_lenght)
        token_type_ids = token_type_ids + ([0] * padding_lenght)

        return {
            "input_ids": torch.tensor(ids, dtype=torch.long),
            "attention_mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
        }

    def __getitem__(self, idx):
        left = self.left[idx]
        right = self.right[idx]
        ids = self.ids[idx]

        return {
            "left": self.get_encoded(left),
            "right": self.get_encoded(right),
            "ids": ids,
        }

    def __len__(self):
        return len(self.ids)

### Load model

In [10]:
class TransformerModel(torch.nn.Module):
    def __init__(self, n_classes):
        super().__init__()
        default_deberta_config = {
            "model_type": "deberta-v2",
            "attention_probs_dropout_prob": 0.1,
            "hidden_act": "gelu",
            "hidden_dropout_prob": 0.1,
            "hidden_size": 1024,
            "initializer_range": 0.02,
            "intermediate_size": 4096,
            "max_position_embeddings": 512,
            "relative_attention": True,
            "position_buckets": 256,
            "norm_rel_ebd": "layer_norm",
            "share_att_key": True,
            "pos_att_type": "p2c|c2p",
            "layer_norm_eps": 1e-7,
            "max_relative_positions": -1,
            "position_biased_input": True,
            "num_attention_heads": 16,
            "num_hidden_layers": 24,
            "type_vocab_size": 0,
            "vocab_size": 128005,
            "n_tasks": 1,
            "drop_out": 0,
            "sep_token_id": 2,
            "source_prefix_id": 0,
            "cand_prefix_id": 3,
            "cand1_prefix_id": 3,
            "cand2_prefix_id": 4,
        }
        config = PretrainedConfig.from_dict(default_deberta_config)
        self.backbone = DebertaV2Model(config=config)

        # get the hidden size of the transformer
        self.hidden_size = self.backbone.config.hidden_size

        self.head_layer = torch.nn.Sequential(
            torch.nn.Dropout(0),
            torch.nn.Linear(2 * self.hidden_size, 1 * self.hidden_size),
            torch.nn.Tanh(),
            torch.nn.Dropout(0),
            torch.nn.Linear(1 * self.hidden_size, n_classes),
        )

    def forward(self, left, right):
        l_pooled_output = self.backbone(
            left["input_ids"],
            attention_mask=left["attention_mask"].cuda(),
            token_type_ids=left["token_type_ids"].cuda(),
            return_dict=False,
        )

        r_pooled_output = self.backbone(
            right["input_ids"],
            attention_mask=right["attention_mask"].cuda(),
            token_type_ids=right["token_type_ids"].cuda(),
            return_dict=False,
        )

        embeds = torch.cat((l_pooled_output[0], r_pooled_output[0]), dim=-1)
        embeds = torch.mean(embeds, dim=1).type(torch.float32)

        output = self.head_layer(embeds)

        return output

In [11]:
model = TransformerModel(3).cuda().eval()

In [12]:
# load microsoft/deberta-v3-large without loading weights

model.load_state_dict(
    torch.load("../logs/20240628-010538-backend-None-lr-1e-06/model.pth"),
    strict=False,
)

_IncompatibleKeys(missing_keys=['backbone.embeddings.position_embeddings.weight'], unexpected_keys=[])

In [23]:
tokenizer = AutoTokenizer.from_pretrained(CFG.model_name)

# save tokenizer
tokenizer.save_pretrained("../models/pairrm-hf-tokenizer")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
dataset = TransformerDataset(
    df["id"].to_list(),
    df["left"].to_list(),
    df["right"].to_list(),
    tokenizer,
    512,
)

dataloader_train = torch.utils.data.DataLoader(
    dataset, batch_size=16, shuffle=False, num_workers=4
)

In [15]:
outputs = []

In [16]:
for batch in dataloader_train:
    ids = batch["ids"]
    left = batch["left"]
    right = batch["right"]
    output = model(
        {
            "input_ids": batch["left"]["input_ids"].to(CFG.device),
            "attention_mask": batch["left"]["attention_mask"].to(CFG.device),
            "token_type_ids": batch["left"]["token_type_ids"].to(CFG.device),
        },
        {
            "input_ids": batch["right"]["input_ids"].to(CFG.device),
            "attention_mask": batch["right"]["attention_mask"].to(CFG.device),
            "token_type_ids": batch["right"]["token_type_ids"].to(CFG.device),
        },
    )
    output = torch.nn.functional.softmax(output, dim=1)

    for i, o in zip(ids, output):
        outputs.append([i.item(), o[0].item(), o[1].item(), o[2].item()])

In [17]:
outputs

[[136060, 0.012044858187437057, 0.8154296875, 0.17252545058727264],
 [211333, 0.00034956890158355236, 0.023133503273129463, 0.976516842842102],
 [1233961, 0.9975916147232056, 2.8982574804103933e-05, 0.002379420679062605]]

In [18]:
submit_df = pd.read_csv(CFG.SUB_CSV)

In [19]:
# delete all rows
submit_df.drop(submit_df.index, inplace=True)

In [20]:
submit_df

Unnamed: 0,id,winner_model_a,winner_model_b,winner_tie


In [21]:
# put the outputs into the submit_df
submit_df = pd.DataFrame(outputs, columns=submit_df.columns)

In [22]:
submit_df

Unnamed: 0,id,winner_model_a,winner_model_b,winner_tie
0,136060,0.012045,0.81543,0.172525
1,211333,0.00035,0.023134,0.976517
2,1233961,0.997592,2.9e-05,0.002379
