In [1]:
import glm3_tokenizer
import torch

from typing import Any, Union, List

class CMMLUTokenizer:
    """Thin adapter around a ``GLM3Tokenizer``.

    It exposes the three entry points this notebook's evaluation code calls
    on a tokenizer -- ``encode``, ``convert_tokens_to_ids`` and ``__call__`` --
    and forwards each of them to the wrapped tokenizer.
    """

    def __init__(self, real_tokenizer: glm3_tokenizer.GLM3Tokenizer) -> None:
        # Every method below delegates to this wrapped instance.
        self._tokenizer = real_tokenizer

    def encode(self, string: str, return_tensors="pt"):
        """Return the ids of ``string`` as a plain Python list.

        ``return_tensors`` is accepted for call compatibility only; it is
        never read.
        """
        token_ids = self._tokenizer.encode_one_no_attn_no_special(string)
        return token_ids.tolist()

    def convert_tokens_to_ids(self, tokens: Union[str, List[str]]):
        """Map each token to the first id produced when encoding it.

        A single string is treated as a one-element list, so the result is
        always a list of ``int`` (one entry per input token).
        """
        token_list = [tokens] if isinstance(tokens, str) else tokens

        first_ids = []
        for piece in token_list:
            encoded = self._tokenizer.encode_one_no_attn_no_special(piece)
            # Only the leading id is kept, even if the piece encodes to several.
            first_ids.append(int(encoded[0]))
        return first_ids

    def __call__(self, string, return_tensors="pt") -> Any:
        """Forward to the wrapped tokenizer's ``encode`` with
        ``parse_special_tokens=False``; ``return_tensors`` is ignored."""
        encoded = self._tokenizer.encode(string, parse_special_tokens=False)
        return encoded

# Build the underlying GLM3 tokenizer once and wrap it in the CMMLU adapter.
# Both names are reused by later cells (`_glm3_tokenizer` for the model config,
# `cmmlu_tokenizer` for the evaluation run).
_glm3_tokenizer = glm3_tokenizer.GLM3Tokenizer()
cmmlu_tokenizer = CMMLUTokenizer(_glm3_tokenizer)
# Sanity check shown as the cell output: the id the adapter returns for the
# answer letter "A".
cmmlu_tokenizer.convert_tokens_to_ids(["A"])
# cmmlu_tokenizer(["a b c", "1 2 3"])

[320]

In [2]:
import custom_model
import os
import safetensors.torch
import torch

# Location of the checkpoint that is evaluated in this notebook.
model_dir = r"/mnt/sata2tb/leo/NLP/small_pretrained_model/saves_medium/checkpoint-751346922-751.35M"
model_path = os.path.join(model_dir, "model.safetensor")

# Architecture hyper-parameters; vocabulary size and padding id are taken from
# the tokenizer built in the first cell so that model and tokenizer agree.
# NOTE(review): `training=True` is set here although the model is switched to
# eval mode below -- confirm CustomLLamaModel does not key dropout off this flag.
config = custom_model.CustomModelConfig(
    vocab_size=_glm3_tokenizer.vocab_size(),
    padding_token_id=_glm3_tokenizer.token_pad_id,
    max_position_embeddings=4096,
    hidden_size=704,
    num_heads=16,
    MLP_intermediate=5000,
    num_layers=28,
    attention_dropout=0.1,
    dtype=torch.bfloat16,
    training=True,
    linear_imp=torch.nn.Linear,
)

# Instantiate on the GPU, then fill in the weights stored in the checkpoint.
model = custom_model.CustomLLamaModel(config)
model = model.to(device='cuda')
safetensors.torch.load_model(model, model_path)
model.eval()

In [3]:
# Scratch check of the indexing used later for the answer-letter logits:
# a list index on the last axis keeps just those columns for every row.
a = torch.tensor([
    [0.1, 0.2, 0.3, 0.4],
    [0.1, 0.3, 0.3, 0.4],
    [0.5, 0.2, 0.3, 0.4],
])
a[:, [0, 1]]

tensor([[0.1000, 0.2000],
        [0.1000, 0.3000],
        [0.5000, 0.2000]])

In [4]:
import CMMLU.mp_utils
from CMMLU.mp_utils import choices, format_example, gen_prompt, softmax
# from CMMLU.hf_causal_model import eval
from typing import Any
from pandas import DataFrame
import numpy as np

#def eval(model, tokenizer, subject, dev_df, test_df, num_few_shot, max_length, cot):
# def eval(*args, **kwargs):
#     print(repr(args))
#     #print(repr(kwargs))
#     for k, v in kwargs.items():
#         print(f"{k}: {type(v)}")

# model: <class 'NoneType'>
# tokenizer: <class 'NoneType'>
# subject: <class 'str'>
# dev_df: <class 'pandas.core.frame.DataFrame'>
# test_df: <class 'pandas.core.frame.DataFrame'>
# num_few_shot: <class 'int'>
# max_length: <class 'int'>
# cot: <class 'bool'>
def eval(
        model: Any,
        tokenizer: Any,
        subject: str,
        dev_df: DataFrame,
        test_df: DataFrame,
        num_few_shot: int,
        max_length: int,
        cot: bool, # not used
        batch_size: int = 20,
        device: Any = "cuda",
        ):
    """Score one subject by comparing the logits of the answer letters.

    For every row of ``test_df`` a prompt is built with ``format_example`` and
    ``gen_prompt``; prompts are then scored in batches. For each prompt the
    logits at the last sequence position are restricted to the token ids of
    ``choices``. The arg-max letter is the prediction, and the softmax
    probability assigned to the *gold label* (not to the prediction) is
    recorded as the confidence.

    Args:
        model: Callable invoked as ``model(**inputs)``; its result is indexed
            as ``[batch, position, vocab]``.
        tokenizer: Must provide ``convert_tokens_to_ids(token)`` returning a
            sequence whose first element is the id, and be callable on a list
            of prompts, returning a mapping with a ``.to(device)`` method.
        subject: Subject name, forwarded to the prompt helpers and printed.
        dev_df: Few-shot examples, forwarded to ``gen_prompt``.
        test_df: Questions to score; the label is read from the last column.
        num_few_shot: Forwarded to ``gen_prompt``.
        max_length: Forwarded to ``gen_prompt``.
        cot: Unused; kept so the signature matches what the caller passes.
        batch_size: Number of prompts scored per forward pass (default 20).
        device: Device the tokenized batch is moved to (default ``"cuda"``).

    Returns:
        ``(acc, all_preds, all_conf)``: mean accuracy, the predicted letter
        per question, and the gold-label probability per question.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1, got {}".format(batch_size))

    # Vocabulary id of every answer letter, in the same order as `choices`.
    choice_ids = [tokenizer.convert_tokens_to_ids(choice)[0] for choice in choices]
    cors = []       # per-question correctness flags
    all_conf = []   # confidence
    all_preds = []  # predictions

    def score_batch(batch) -> None:
        """Run one forward pass over ``batch`` and append its results."""
        if not batch:
            return
        prompts = [prompt for prompt, _ in batch]
        labels = [label for _, label in batch]
        inputs = tokenizer(prompts).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
            # NOTE(review): position -1 is only each prompt's final token if
            # the tokenizer left-pads the batch (or all prompts have equal
            # length) -- confirm against the tokenizer's batch encoding.
            last_token_logits = outputs[:, -1, :].to(dtype=torch.float32) # shape: [bz, vocab_size]
            choice_logits: torch.Tensor = last_token_logits[:, choice_ids]
            # Column index of the gold letter for each row, created directly
            # on the logits' device so `gather` needs no extra copy.
            index_of_labels = torch.tensor(
                [choices.index(label) for label in labels],
                dtype=torch.int64,
                device=choice_logits.device,
            )[:, None]
            confidences = torch.nn.functional.softmax(choice_logits, dim=-1)
            confidences = confidences.gather(dim=-1, index=index_of_labels).view(-1)
            predictions = [choices[pred_index] for pred_index in choice_logits.argmax(dim=-1).tolist()]
        all_preds.extend(predictions)
        all_conf.extend(confidences.tolist())
        cors.extend(pred == label for pred, label in zip(predictions, labels))

    bucket = []
    for i in range(test_df.shape[0]):
        prompt_end = format_example(test_df, i, subject, include_answer=False)
        prompt = gen_prompt(dev_df=dev_df,
                            subject=subject,
                            prompt_end=prompt_end,
                            num_few_shot=num_few_shot,
                            tokenizer=tokenizer,
                            max_length=max_length)
        label = test_df.iloc[i, test_df.shape[1] - 1]
        bucket.append((prompt, label))

        if len(bucket) >= batch_size:
            score_batch(bucket)
            bucket = []

    # Flush the final, possibly partial, batch.
    score_batch(bucket)

    acc = np.mean(cors)
    print("Average accuracy {:.3f} - {}".format(acc, subject))
    return acc, all_preds, all_conf

In [5]:
import CMMLU.hf_causal_model
import argparse
import random

# Build the `args` namespace that is handed to `CMMLU.mp_utils.run_eval`.
parser = argparse.ArgumentParser()

# Options that carry a value: (flag, type, default).
for _flag, _type, _default in (
    ("--model_name_or_path", str, ""),
    ("--lora_weights", str, ""),
    ("--data_dir", str, "CMMLU/data"),
    ("--save_dir", str, "CMMLU/results/not_specified"),
    ("--num_few_shot", int, 0),
    ("--max_length", int, 2048),
):
    parser.add_argument(_flag, type=_type, default=_default)

# Boolean switches, all off unless given.
for _flag in ("--load_in_8bit", "--with_conf", "--cot"):
    parser.add_argument(_flag, action='store_true')

# An explicit empty argv keeps argparse away from the kernel's own command
# line, so every option takes its default.
args = parser.parse_args([])

random.seed(233)

# Evaluate with the model and tokenizer adapter built in earlier cells, using
# the batched `eval` defined above as the per-subject scoring function.
CMMLU.mp_utils.run_eval(model, cmmlu_tokenizer, eval, args)

  from .autonotebook import tqdm as notebook_tqdm


Average accuracy 0.249 - agronomy
Average accuracy 0.270 - anatomy
Average accuracy 0.238 - ancient_chinese
Average accuracy 0.250 - arts
Average accuracy 0.261 - astronomy
Average accuracy 0.287 - business_ethics
Average accuracy 0.263 - chinese_civil_service_exam
Average accuracy 0.237 - chinese_driving_rule
Average accuracy 0.206 - chinese_food_culture
Average accuracy 0.252 - chinese_foreign_policy
Average accuracy 0.254 - chinese_history
Average accuracy 0.250 - chinese_literature
Average accuracy 0.257 - chinese_teacher_qualification
Average accuracy 0.228 - clinical_knowledge
Average accuracy 0.274 - college_actuarial_science
Average accuracy 0.355 - college_education
Average accuracy 0.321 - college_engineering_hydrology
Average accuracy 0.231 - college_law
Average accuracy 0.200 - college_mathematics
Average accuracy 0.189 - college_medical_statistics
Average accuracy 0.275 - college_medicine
Average accuracy 0.265 - computer_science
Average accuracy 0.240 - computer_security
