## Install

In [None]:
!pip3 install transformers datasets sentencepiece langchain peft trl -q
!pip install torch~=2.1.0 --index-url https://download.pytorch.org/whl/cpu -q #Updating torch since we need the latest version
!pip install torch_xla[tpu]~=2.1.0 -f https://storage.googleapis.com/libtpu-releases/index.html -q
!pip uninstall tensorflow -y #If we don't do this, TF will take over TPU and cause permission error for PT
!cp /kaggle/input/utils-xla/spmd_util.py . #From this repo: https://github.com/HeegyuKim/torch-xla-SPMD

In [None]:
!git clone https://github.com/HeegyuKim/torch-xla-SPMD

In [None]:
!cp /kaggle/working/torch-xla-SPMD/spmd_util.py . #From this repo: https://github.com/HeegyuKim/torch-xla-SPMD

## Import

In [1]:
import os
import pandas as pd
import numpy as np
import datasets
import torch.optim as optim
import torch_xla.debug.profiler as xp
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp #We also import mp modules if we wanna use that for some reason
import torch_xla.distributed.parallel_loader as pl
import torch_xla.test.test_utils as test_utils
import torch
import torch.nn as nn
import re
import torch_xla.experimental.xla_sharding as xs
import torch_xla.core.xla_model as xm
from trl import DataCollatorForCompletionOnlyLM
from transformers import (
    GPTNeoXConfig, T5Config, LlamaConfig, AutoTokenizer, AutoModelForCausalLM, DataCollatorWithPadding, AutoConfig
) # You can use any of models with those configs (even flan T5 xxl!). Other models are not supported.

from transformers import logging as hf_logging
import torch.nn.functional as F
import torch_xla.runtime as xr

xr.use_spmd()

import torch_xla.experimental.xla_sharding as xs # "experimental" prefix always means you're gonna have a good time LMAO
from torch_xla.experimental.xla_sharded_tensor import XLAShardedTensor
from torch_xla.experimental.xla_sharding import Mesh

from peft import LoraConfig, TaskType, get_peft_model # If we wanna use peft. Quantazation requiers GPU though.
# from spmd_util import partition_module                # You could experiment with using already quantazed models like 4bit/Llama-2-7b-Chat-GPTQ if you're feeling funny
from langchain.prompts import PromptTemplate          # Please share your experiements if you find something :)
from datasets import Dataset, load_dataset, concatenate_datasets
from dataclasses import dataclass
from tqdm import tqdm

!export USE_TORCH=True #If we don't do this, transformers will seemingly bork the session upon import. Really weird error.
os.environ["PJRT_DEVICE"] = "TPU"
os.environ.pop('TPU_PROCESS_ADDRESSES')
os.environ.pop('CLOUD_TPU_TASK_ID')
hf_logging.set_verbosity_error() # It can still display warnings which is a bit annoying but whatever

  from .autonotebook import tqdm as notebook_tqdm


In [40]:
import torch
import torch.nn as nn
import re
import torch_xla.experimental.xla_sharding as xs
import torch_xla.core.xla_model as xm
from transformers import (
    GPTNeoXConfig, T5Config, LlamaConfig, MistralConfig
)

# ends with $ to prevent sharding lora parameters
GPTNEOX_RULES = (
    # embeddings
    ("gpt_neox\\.embed_in", ("mp", "fsdp")),
    # atention
    ("attention\\.query_key_value$", ("fsdp", "mp")),
    ("attention\\.dense$", ("mp", "fsdp")),
    # mlp
    ("mlp\\.dense_h_to_4h$", ("fsdp", "mp")),
    ("mlp\\.dense_4h_to_h$", ("mp", "fsdp")),
    # output
    ("embed_out", ("fsdp", "mp")),
)

T5_RULES = (
    # embeddings
    ("shared$", ("mp", "fsdp")),
    ("embed_tokens$", ("mp", "fsdp")),
    
    # attention
    ("q$", ("fsdp", "mp")),
    ("k$", ("fsdp", "mp")),
    ("v$", ("fsdp", "mp")),
    ("o$", ("mp", "fsdp")),

    # mlp
    ("w$", ("fsdp", "mp")),
    ("wi_0$", ("fsdp", "mp")),
    ("wi_1$", ("fsdp", "mp")),
    ("wo$", ("mp", "fsdp")),

    # seq2seq lm head
    ("lm_head", ("fsdp", "mp")),
)

LLAMA_RULES = (
    ("model\\.embed_tokens", ("mp", "fsdp")),
    ("self_attn\\.(q_proj|k_proj|v_proj)", ("fsdp", "mp")),
    ("self_attn\\.o_proj", ("mp", "fsdp")),
    ("mlp\\.gate_proj", ("fsdp", "mp")),
    ("mlp\\.down_proj", ("mp", "fsdp")),
    ("mlp\\.up_proj", ("fsdp", "mp")),
    ("lm_head", ("fsdp", "mp")),
    )
    
ALL_RULES = [
    (GPTNeoXConfig, GPTNEOX_RULES),
    (T5Config, T5_RULES),
    (LlamaConfig, LLAMA_RULES),
    (MistralConfig, LLAMA_RULES)
]

def find_rule(model):
    for config, rule in ALL_RULES:
        if model.config.__class__ == config:
            return rule
    raise Exception("unsupported model to partitioning")

strkey2id = {
    "dp": 0,
    "fsdp": 1,
    "mp": 2
}

def partition_module(model, mesh, device=xm.xla_device(), verbose=False):
    partition_specs = find_rule(model)
    rule = [(k, tuple([strkey2id[x] for x in v])) for k, v in partition_specs]
        
    # print(rule)

    for name, module in model.named_modules():
        module.to(device)
        # print(name, module.__class__.__name__)
        if isinstance(module, (nn.Embedding, nn.Linear)):
            for rule_pattern, spec in rule:
                if re.findall(rule_pattern, name):
                    if verbose:
                        print("match", rule_pattern, name)
                    
                    xs.mark_sharding(module.weight, mesh, spec)
                    break
        
def partition_module_dp(model, mesh, device=xm.xla_device(), verbose=False):
    spec = (1, 2)

    for name, module in model.named_modules():
        module.to(device)
        if isinstance(module, (nn.Embedding, nn.Linear)):
            xs.mark_sharding(module.weight, mesh, spec)

In [None]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_hf = user_secrets.get_secret("HUGGINGFACE_TOKEN")
# secret_wandb = user_secrets.get_secret("wandb")

In [None]:
!huggingface-cli login --token $secret_hf

In [3]:
base_model = "meta-math/MetaMath-Mistral-7B"
# base_model = "EleutherAI/gpt-neo-2.7B"
# base_model = "gpt2-xl"
# base_model = "gpt2-large"
# base_model = "gpt2"
# base_model = "openchat/openchat_3.5"
new_model = "BK-BigAI-Math"
model_hotamath_path = "/kaggle/working/BK-BigAI-Math"

## Download dataset

In [None]:
!mkdir dataset

In [None]:
from huggingface_hub import hf_hub_download
hf_hub_download(repo_id="hotamago/ZAIC-2023", filename="Elementary Maths Solving/test.zip", revision="main", repo_type="dataset", local_dir="dataset", local_dir_use_symlinks=False)
hf_hub_download(repo_id="hotamago/ZAIC-2023", filename="Elementary Maths Solving/train.zip", revision="main", repo_type="dataset", local_dir="dataset", local_dir_use_symlinks=False)

In [None]:
!sudo apt-get install unzip

In [None]:
!mkdir datasetRaw
!unzip -q -o "dataset/Elementary Maths Solving/test.zip" -d "datasetRaw"
!unzip -q -o "dataset/Elementary Maths Solving/train.zip" -d "datasetRaw"

## Load dataset

In [4]:
import os
import json
import re
import time

In [5]:
train_data = None
test_data = None
with open(os.path.join("datasetRaw", "train", "/kaggle/working/datasetRaw/math_train.json"), "r") as f:
    train_data = json.loads(f.read())['data']
with open(os.path.join("datasetRaw", "test", "/kaggle/working/datasetRaw/math_test.json"), "r") as f:
    test_data = json.loads(f.read())['data']

In [6]:
train_data[0]

{'id': '1',
 'question': 'Một người bán hàng bỏ ra 80,000 đồng tiền vốn và bị lỗ 6%. Để tính số tiền lỗ ta phải tính',
 'choices': ['A. 80,000 : 6',
  'B. 80,000 x 6',
  'C. 80,000 : (6 x 100)',
  'D. (80,000 x 6) : 100'],
 'explanation': 'Theo đề bài, số tiền lỗ bằng 6% của 80 000 đồng . Để tìm số tiền lỗ ta có thể lấy 80 000 chia cho 100 rồi nhân với 6 (tức là 80 000 : 100 × 6) hoặc lấy 80000 nhân với 6 rồi chia cho 100 (tức là 80 000 × 6 : 100).',
 'answer': 'D. (80,000 x 6) : 100'}

In [7]:
test_data[0]

{'id': '01-0203',
 'question': 'Một cửa hàng đã bán 30% số hàng hiện có và thu được 15 000 000 đồng. Hỏi nếu bán hết hàng thì cửa hàng thu được bao nhiêu tiền?',
 'choices': ['A. 4 500 000 đồng',
  'B. 45 000 000 đồng',
  'C. 50 000 000 đồng',
  'D. 450 000 000 đồng']}

In [8]:
MAX_TOKEN_MODEL = 512

In [9]:
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>" # "<|end_of_turn|>" # "</s>" "<|endoftext|>"
DEFAULT_BOS_TOKEN = "<s>" # "<s>" "<|endoftext|>"
DEFAULT_UNK_TOKEN = "<unk>" # "<unk>" "<|endoftext|>"
DEFAULT_BOI_TOKEN = "[INST]" # "Human:" # "[INST]" "<|human|>"
DEFAULT_EOI_TOKEN = "[/INST]" # "Assistant:" # "[/INST]" "<|human|>"
PROMPT_DICT = {
    "prompt_input": (
        "Below is an instruction that describes a task, paired with the choices, one of the choices is the correct answer to the request. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Choices:\n{choices}"
    ),
    "prompt_input_run": (
        DEFAULT_BOI_TOKEN + " Below is an instruction that describes a task. paired with the choices, one of the choices is the correct answer to the request. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Choices:\n{choices}"
#         "\n" + DEFAULT_EOI_TOKEN + " \n\n"
#         "### Explanation:\n Let's think step by step.\n"
#         "### Explanation:\n Hãy suy nghĩ từng bước một.\n"
    ),
}

In [10]:
timeGlobal = 0
def startTime():
    global timeGlobal
    timeGlobal = time.time()
def getTime():
    return (time.time() - timeGlobal)

In [11]:
def ApplyPromptTemplate(instruction, choices, typeP = "prompt_input"):
    return PROMPT_DICT[typeP].format(instruction = instruction, choices = "\n".join(choices))

### Preprocess data

In [12]:
from datasets import Dataset
import random

In [13]:
num_train_dataset = len(train_data)
valition_radio = 0.1
tokenized_train_dataset_raw = train_data[:-int(num_train_dataset*valition_radio)]
tokenized_val_dataset_raw = train_data[-int(num_train_dataset*valition_radio):]

In [14]:
datasetStruct = {"input":[], "output":[]}
dataset = {"text":[]}
num_train_dataset = len(tokenized_train_dataset_raw)
for i in range(num_train_dataset):
    ttdro = tokenized_train_dataset_raw[i]
    
    if "explanation" not in ttdro.keys():
        continue
    
    input_content = "{0} {1}".format(
        DEFAULT_BOI_TOKEN,
        ApplyPromptTemplate(ttdro['question'], ttdro['choices']),
    )
    datasetStruct["input"].append(input_content)
    
    if "explanation" not in ttdro.keys():
        output_content = "\n{0} \n\n{1}\n\n{2} {3}".format(
            DEFAULT_EOI_TOKEN,
            "### Explanation:\nNo explanation",
            "### Answer:\n{0}".format(ttdro['answer']),
            DEFAULT_EOS_TOKEN,
        )
    else:
        output_content = "\n{0} \n\n{1}\n\n{2} {3}".format(
            DEFAULT_EOI_TOKEN,
            "### Explanation:\n{0}".format(ttdro['explanation']),
            "### Answer:\n{0}".format(ttdro['answer']),
            DEFAULT_EOS_TOKEN,
        )
    datasetStruct["output"].append(output_content)
    
    dataset["text"].append(input_content + output_content)
    
    # No explantion
#     output_content = "\n{0} \n\n{1}\n\n{2} {3}".format(
#             DEFAULT_EOI_TOKEN,
#             "### Explanation:\nNo explanation",
#             "### Answer:\n{0}".format(ttdro['answer']),
#             DEFAULT_EOS_TOKEN,
#         )
#     datasetStruct["output"].append(output_content)
    
#     dataset["text"].append(input_content + output_content)
#     <s>[INST][/INST] </s>

In [15]:
print(dataset["text"][33])

[INST] Below is an instruction that describes a task, paired with the choices, one of the choices is the correct answer to the request. Write a response that appropriately completes the request.

### Instruction:
Nhà Hiền có 7 con lợn. Số ga nhiều hơn số lợn 63 con. Vậy số gà gấp số lợn là:

### Choices:
A. 7 lần
B. 8 lần
C. 9 lần
D. 10 lần
[/INST] 

### Explanation:
Nhà Hiền có số con gà là: 63 + 7 = 70 (con)
 Số gà gấp số lợn là: 70 : 7 = 10 (lần)

### Answer:
D. 10 lần </s>


In [16]:
print(len(dataset["text"]))

1080


## Train model

### Config

In [17]:
FLAGS = {'MAX_INPUT': MAX_TOKEN_MODEL,
         'LOGGING_STEPS': 100,
         'NUM_EPOCHS': 5,
         'BATCH_SIZE': 8, #Making batch_size lower then 8 will result in slower training, but will take more memory. Fortunately, we have 128GBs. Setting higher batch_size doesn't seem to improve time.
          'NUM_STEPS': len(dataset['text'])} 

### Load model

In [18]:
# bnb_config = BitsAndBytesConfig(  
#     load_in_4bit= True,
#     bnb_4bit_quant_type= "nf4",
#     bnb_4bit_compute_dtype= torch.bfloat16,
#     bnb_4bit_use_double_quant= False,
# )
model = AutoModelForCausalLM.from_pretrained(
    base_model,
#         model_hotamath_path,
#     load_in_4bit=True,
#     load_in_8bit= True,
#     quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
#     torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

cnt = 0
for param in model.parameters():
    cnt += 1
    param.requires_grad = False
    if cnt > 270:
        param.requires_grad = True

Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00,  4.99s/it]


In [19]:
tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    model_max_length=MAX_TOKEN_MODEL,
    padding_side="right",
    use_fast=False,
)
# tokenizer.pad_token = tokenizer.eos_token

In [20]:
special_tokens_dict = {
    'additional_special_tokens': [DEFAULT_BOI_TOKEN, DEFAULT_EOI_TOKEN],
    'pad_token': DEFAULT_PAD_TOKEN,
    'bos_token': DEFAULT_BOS_TOKEN,
    'eos_token': DEFAULT_EOS_TOKEN,
    'unk_token': DEFAULT_UNK_TOKEN,
}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

Embedding(32003, 4096)

In [21]:
print(tokenizer.encode("{0} Hello, how are you? \n{1} I'm fine, thank you!{2}".format(
    DEFAULT_BOI_TOKEN,
    DEFAULT_EOI_TOKEN,
    DEFAULT_EOS_TOKEN,
)))

[1, 32001, 28705, 22557, 28725, 910, 460, 368, 28804, 28705, 13, 32002, 28705, 315, 28742, 28719, 4433, 28725, 6979, 368, 28808, 2]


### Check model

In [22]:
print(model)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32003, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRM

In [23]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )
print_trainable_parameters(model)

trainable params: 698400768 || all params: 7241756672 || trainable%: 9.644079463486287


### Check testcase max token

In [24]:
max_token_of_dataset = 0
listLongToken = []
for i in range(len(dataset["text"])):
    text = dataset["text"][i]
    token_len = len(tokenizer.encode(text))
    max_token_of_dataset = max(token_len, max_token_of_dataset)
    if token_len > MAX_TOKEN_MODEL:
        print(i, token_len, "\n")
        listLongToken.append(i)
#         print(text)
print(max_token_of_dataset)

71 560 

239 530 

361 618 

397 650 

399 554 

400 599 

435 594 

458 574 

463 763 

467 552 

483 515 

484 590 

486 546 

487 650 

494 691 

536 755 

539 697 

540 751 

546 611 

570 659 

582 536 

591 730 

592 546 

603 580 

604 525 

617 522 

622 606 

660 1050 

724 677 

742 584 

754 769 

761 800 

764 563 

766 729 

802 659 

814 789 

847 575 

854 790 

864 752 

873 553 

965 551 

976 626 

998 724 

1000 528 

1013 617 

1024 630 

1026 882 

1028 848 

1044 755 

1051 709 

1053 541 

1057 520 

1073 600 

1050


### Remove long token testcase

In [25]:
print(len(listLongToken))

53


In [26]:
dataset["text"] = [dataset["text"][i] for i in range(len(dataset["text"])) if i not in listLongToken]

In [27]:
# Check
max_token_of_dataset = 0
for i in range(len(dataset["text"])):
    text = dataset["text"][i]
    token_len = len(tokenizer.encode(text))
    max_token_of_dataset = max(token_len, max_token_of_dataset)
    if token_len > MAX_TOKEN_MODEL:
        print(i, token_len, "\n")
print(max_token_of_dataset)

511


### Nice stuct dataset

In [28]:
dataset = Dataset.from_dict(dataset)

In [29]:
def preprocess_dataset_function(example):
    text_tokens = tokenizer(example["text"], truncation=True, max_length=MAX_TOKEN_MODEL, padding='max_length').input_ids #You can try 'padding_to_multiple_of=128'
    return {
        "input_ids": text_tokens,
    }

dataset = dataset.map(preprocess_dataset_function, batched=False, remove_columns=['text'], num_proc=96)
dataset

Map (num_proc=96): 100%|██████████| 1027/1027 [00:00<00:00, 1039.59 examples/s]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


Dataset({
    features: ['input_ids'],
    num_rows: 1027
})

### DataCollator

In [30]:
from trl import DataCollatorForCompletionOnlyLM

In [31]:
instruction_template = DEFAULT_BOI_TOKEN
response_template = DEFAULT_EOI_TOKEN
collator = DataCollatorForCompletionOnlyLM(instruction_template=instruction_template, response_template=response_template, tokenizer=tokenizer, mlm=False)

### XLA Trick

In [32]:
training_loader = torch.utils.data.DataLoader(dataset, batch_size=FLAGS['BATCH_SIZE'], collate_fn=collator)
device = xm.xla_device()

In [33]:
model.config.__class__

transformers.models.mistral.configuration_mistral.MistralConfig

In [41]:
config = AutoConfig.from_pretrained(base_model)
num_devices = xr.global_runtime_device_count()
mesh_shape = (1, num_devices, 1)
device_ids = np.array(range(num_devices))
mesh = Mesh(device_ids, mesh_shape, ('dp', 'fsdp', 'mp'))
partition_module(model, mesh) # After this, the model is sharded between cores but still has the same API as if it was on single device. Neat.

In [42]:
!export XLA_USE_BF16=1 #I'm not even sure that exporting does anything
def train(FLAGS):
    num_iterations = int(FLAGS['NUM_STEPS'] / FLAGS['BATCH_SIZE'])
    lr = 1e-5
    optimizer = optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=1.0, end_factor=0.1, total_iters=FLAGS['NUM_STEPS'] * FLAGS['BATCH_SIZE']) #You would probably wanna use cosine scheduler or something it's really easy to change
    for epoch in range(1, FLAGS['NUM_EPOCHS'] + 1):
        model.train()
        xm.master_print('Epoch {} train begin {}'.format(epoch, test_utils.now())) # master print is meant to be used inside xmp function to not have it printed 8 times but whatever
        for step, batch in enumerate(training_loader):
            optimizer.zero_grad()
            input_ids, attention_mask, labels = batch.input_ids.to(device), batch.attention_mask.to(device), batch.labels.to(device)
            xs.mark_sharding(input_ids, mesh, (0, 1))
            xs.mark_sharding(attention_mask, mesh, (0, 1))
            xs.mark_sharding(labels, mesh, (0, 1))
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            xm.mark_step()
            if (step + 1) % FLAGS['LOGGING_STEPS'] == 0:
                print(f'loss: {loss.item()}, time: {test_utils.now()}, step: {step}')
            scheduler.step()
        xm.master_print('Epoch {} train end {}'.format(epoch, test_utils.now()))

### Train

In [43]:
train(FLAGS) # "unlimited power" palpatine meme

Epoch 1 train begin 21:08:47
loss: 0.39653581380844116, time: 21:11:45, step: 99
Epoch 1 train end 21:12:39
Epoch 2 train begin 21:12:39
loss: 0.22950127720832825, time: 21:13:38, step: 99
Epoch 2 train end 21:13:46
Epoch 3 train begin 21:13:46
loss: 0.14491340517997742, time: 21:14:54, step: 99
Epoch 3 train end 21:15:02
Epoch 4 train begin 21:15:02
loss: 0.08223885297775269, time: 21:16:10, step: 99
Epoch 4 train end 21:16:18
Epoch 5 train begin 21:16:18
loss: 0.058491867035627365, time: 21:17:27, step: 99
Epoch 5 train end 21:17:34


In [None]:
model = model.cpu()

In [None]:
model.save_pretrained(new_model)
model.config.to_json_file(os.path.join(new_model, "config.json"))
tokenizer.save_pretrained(new_model)

In [None]:
!zip -r BK-BigAI-Math.zip BK-BigAI-Math

In [None]:
from IPython.display import FileLink
FileLink(r'BK-BigAI-Math.zip')

## Evalution

### load model by GPU

In [None]:
dataset = Dataset.from_dict(dataset)

In [None]:
model_name_or_path = "/kaggle/working/BK-BigAI-Math"

model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
#     torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
#     load_in_4bit=True,
)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    model_max_length=512,
    padding_side="right",
    use_fast=False,
)

In [None]:
model.resize_token_embeddings(len(tokenizer))

### Init function evalution

In [47]:
from transformers import pipeline

In [48]:
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.01,
    top_p=0.3,
    top_k=5,
    repetition_penalty=1.1,
    pad_token_id=tokenizer.eos_token_id
)

In [46]:
import random
# globalRegxCompire = "0-9a-zA-Z\.\:\-\^\! "
def niceValueToCompire(x):
#     x = re.sub("[^{0}]".format(globalRegxCompire), "", x)
    x = re.sub("[ \t\n]", "", x)
    return x
def autoLLMFormat(question, choises = None, debug=False):
    prompt_template = ApplyPromptTemplate(question, choises, "prompt_input_run")
    res = pipe(prompt_template)[0]['generated_text']
    if debug:
        print(res)
    x = re.findall("### Answer:[\n ](.+)", res)
    
    if choises == None:
        return x
    
    choises_compare = [niceValueToCompire(choise_pred) for choise_pred in choises]

    if len(x) == 0:
        return choises[random.randrange(0, len(choises))]
    
    x = niceValueToCompire(x[0])
    
    if (x not in choises_compare):
        return choises[random.randrange(0, len(choises))]
    
    for i in range(len(choises_compare)):
        if x == choises_compare[i]:
            return choises[i]
    
    return "wtf"

### Run

In [49]:
count_proc_testcase = 0
count_pass_testcase = 0

In [50]:
while count_proc_testcase < len(tokenized_val_dataset_raw):
    tvdo = tokenized_val_dataset_raw[count_proc_testcase]
    startTime()
    answer = autoLLMFormat(tvdo['question'], tvdo['choices'], True)
    deltaTime = getTime()

    if answer == tvdo['answer']:
        count_pass_testcase += 1
    
    count_proc_testcase += 1
    print("Testcase {0}, time: {1}, answer: {2} | {3}, Passed: {4}, IsPass: {5}".format(
        count_proc_testcase,
        deltaTime,
        answer,
        tvdo['answer'],
        count_pass_testcase,
        (answer == tvdo['answer'])
    ))



KeyboardInterrupt: 

## Run public test dataset

In [None]:
result_test = []
if os.path.exists(os.path.join("result", "result.txt")):
    with open(os.path.join("result", "result.txt"), "r") as f:
        result_test = f.read().split("\n")
count_id = len(result_test)

In [None]:
!mkdir result

In [None]:
print(len(test_data))

In [None]:
while count_id < len(test_data):
    one_test_data = test_data[count_id]
    startTime()
    answer = autoLLMFormat(one_test_data['question'], one_test_data['choices'], False)
    deltaTime = getTime()
    result_test.append("{0}".format(answer))
    count_id += 1
    if count_id%10 == 0:
        with open(os.path.join("result", "result.txt"), "w", encoding='utf-8') as f:
            f.write("\n".join(result_test))
    print("Testcase {0}, time: {1}, answer: {2}".format(count_id, deltaTime, answer))

In [None]:
with open(os.path.join("result", "result.txt"), "w", encoding='utf-8') as f:
    f.write("\n".join(result_test))

In [None]:
print("\n".join(result_test))

## Convert json to csv

In [None]:
!pip install pandas

In [None]:
import pandas as pd

In [None]:
json_result = []
for i in range(len(test_data)):
    one_test_data = test_data[i]
    json_result.append({
        "id": one_test_data["id"],
        "answer": result_test[i]
    })

In [None]:
import json
json_result_str = json.dumps(json_result)

In [None]:
df = pd.read_json(json_result_str)
df.to_csv(os.path.join("result", "result.csv"))