In [1]:
import os
import re
import torch
from torch import nn
from tqdm import tqdm
from transformers import (
    Trainer,
    TrainingArguments,
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    DataCollatorWithPadding,
    BitsAndBytesConfig,
)
from tokenizers.processors import TemplateProcessing
from trl import SFTConfig, SFTTrainer
from peft import (
    prepare_model_for_kbit_training, 
    LoraConfig, 
    get_peft_model,
    TaskType,
)
from datasets import load_dataset
from accelerate import Accelerator

  from .autonotebook import tqdm as notebook_tqdm


# Îç∞Ïù¥ÌÑ∞ÏÖã Ï†ïÏùò

In [6]:
repo = 'KorQuAD/squad_kor_v2'
tokenizer = AutoTokenizer.from_pretrained(repo)
dataset = load_dataset("csv", data_files="/home/jovyan/work/prj_data/open/train.csv")
max_length = 1280

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
tokenizer.backend_tokenizer.post_processor = TemplateProcessing(
    single="<s> $A </s>",
    pair="<s> $A <s> $B </s>",
    special_tokens=[
        ("<s>", tokenizer.convert_tokens_to_ids("<s>")),
        ("</s>", tokenizer.convert_tokens_to_ids("</s>"))
    ],
)

def preprocess_function(examples):
    question, context, answer = examples["question"], examples["context"], examples["answer"]
    def preprocess_text(text):
        text = text.replace('\n', ' ')
        text = re.sub(r'\s+', ' ', text).strip()
        return text
    question = preprocess_text(question)
    context = preprocess_text(context)
    answer = preprocess_text(answer)
    
    inputs = tokenizer(
        question,
        context,
        return_offsets_mapping=True,
        truncation=False, 
        # truncation=True,
        # max_length=max_length, 
        # padding="max_length",
    )
    
    start_char = context.find(answer)
    end_char = start_char + len(answer)

    # offset_mapping: [(token1 start, token1 end), (token2 ~, ), ...]
    offset= inputs.pop("offset_mapping")
    
    # sequence_ids: (token=None, question=0, context=1)
    sequence_ids = inputs.sequence_ids(0)

    # Ïª®ÌÖçÏä§Ìä∏Ïùò ÏãúÏûë Î∞è ÎßàÏßÄÎßâÏùÑ Ï∞æÎäîÎã§.
    idx = 0
    while sequence_ids[idx] != 1:
        idx += 1
    context_start = idx
    while sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    # ÎßåÏùº Ï†ïÎãµÏù¥ Ïª®ÌÖçÏä§Ìä∏Ïóê ÏôÑÏ†ÑÌûà Ìè¨Ìï®ÎêòÏßÄ ÏïäÎäîÎã§Î©¥, Î†àÏù¥Î∏îÏùÄ (0, 0)ÏûÑ
    if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
        start_position = 0
        end_position = 0
    else:
        # Í∑∏Î†áÏßÄ ÏïäÏúºÎ©¥ Ï†ïÎãµÏùò ÏãúÏûë Î∞è ÎßàÏßÄÎßâ Ïù∏Îç±Ïä§
        idx = context_start
        while idx <= context_end and offset[idx][0] <= start_char:
            idx += 1
        start_position = idx - 1

        idx = context_end
        while idx >= context_start and offset[idx][1] >= end_char:
            idx -= 1
        end_position = idx + 1

    inputs["start_positions"] = start_position
    inputs["end_positions"] = end_position
    return inputs

# Îç∞Ïù¥ÌÑ∞ ÌîÑÎ†àÏûÑÏùÑ Ï†ÑÏ≤òÎ¶¨Ìï©ÎãàÎã§
preprocess_function(dataset["train"][0])
# train_dataset = dataset["train"].map(preprocess_function)
train_dataset = train_dataset.remove_columns(['id', 'context', 'question', 'answer'])

Map:   0%|          | 0/33716 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (779 > 512). Running this sequence through the model will result in indexing errors
Map:   3%|‚ñé         | 999/33716 [00:01<00:53, 614.59 examples/s]


TypeError: 'NoneType' object is not subscriptable

In [11]:
test_idx = 15
print(tokenizer.decode(train_dataset[test_idx]["input_ids"][train_dataset[test_idx]["start_positions"] : train_dataset[test_idx]["end_positions"] + 1]))

NameError: name 'train_dataset' is not defined

# Î™®Îç∏ Ï†ïÏùò

In [5]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
print("start")
model = AutoModelForQuestionAnswering.from_pretrained(
        repo,
        quantization_config=quantization_config,
        device_map={"":0},
        torch_dtype="auto",
)
print("end")

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=['up_proj', 
                    'down_proj', 
                    'gate_proj', 
                    'k_proj', 
                    'q_proj', 
                    'v_proj', 
                    'o_proj'],
    task_type="QUESTION_ANSWERING"
)

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

accelerater = Accelerator()
model, tokenizer = accelerater.prepare(model, tokenizer)
model.config.use_cache = True

start


Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9/9 [00:11<00:00,  1.22s/it]
Some weights of LlamaForQuestionAnswering were not initialized from the model checkpoint at charlieCs/Open-Solar-ko-10B-dacon-qa and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


end


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


# Loss Ï†ïÏùò

# ÌïôÏäµ

In [7]:
import wandb
wandb.login()
os.environ["TOKENIZERS_PARALLELISM"] = "false"

training_args = TrainingArguments(
    output_dir="test",
    num_train_epochs=1,
    evaluation_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=1,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    fp16=True
    save_steps=0.1,
)

# Trainer ÏÑ§Ï†ï
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33muijinkim[0m. Use [1m`wandb login --relogin`[0m to force relogin
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [8]:
trainer.train()





Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

# Inference

In [2]:
import torch
import pandas as pd
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
from tqdm import tqdm

CHECK_POINT = "/home/jovyan/work/ai_chat_qa_task/code/huggingface/SOLAR_QA/checkpoint-4215"
TEST_fOLDER = '/home/jovyan/work/prj_data/open/test.csv'
OUTPUT = "test"

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
csv = pd.read_csv(TEST_fOLDER)

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Î™®Îç∏ Î∞è ÌÜ†ÌÅ¨ÎÇòÏù¥Ï†Ä Î°úÎìú
config = PeftConfig.from_pretrained(CHECK_POINT)
model = AutoModelForQuestionAnswering.from_pretrained(
    config.base_model_name_or_path,
    quantization_config=quantization_config,
    device_map="auto",
    torch_dtype=torch.float16
)
model = PeftModel.from_pretrained(model, CHECK_POINT)
model.eval()

tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

ValueError: Can't find 'adapter_config.json' at 'KorQuAD/squad_kor_v2'

In [5]:
TEST_fOLDER = '/home/jovyan/work/prj_data/open/train.csv'
csv = pd.read_csv(TEST_fOLDER)
idx = 4

def get_prediction(question, context):
    inputs = tokenizer(question, context, return_tensors="pt", truncation=False)
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)

    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits)
    print(answer_start, answer_end)
    
    answer = tokenizer.decode(inputs["input_ids"][0][answer_start:answer_end+1])
    return answer

predictions = []
i = 0
for _, row in tqdm(csv.iterrows(), total=len(csv)):
    if i == idx:
        answer = get_prediction(row['question'], row['context'])
        predictions.append(answer)
        print(row['question'])
        print("---")
        print(row['context'])
        print("---")
        print(answer)
        break
    i+=1

  0%|          | 4/33716 [00:00<1:02:25,  9.00it/s]

tensor(7, device='cuda:0') tensor(1, device='cuda:0')
PM9A3 E1.SÏùò Ïó∞ÏÜçÏì∞Í∏∞ ÏÜçÎèÑÎäî
---
 ÏÇºÏÑ±Ï†ÑÏûêÍ∞Ä OCP(Ïò§Ìîà Ïª¥Ìì®Ìä∏ ÌîÑÎ°úÏ†ùÌä∏)Ïùò Í∑úÍ≤©ÏùÑ ÎßåÏ°±ÌïòÎäî Îç∞Ïù¥ÌÑ∞ÏÑºÌÑ∞ Ï†ÑÏö© Í≥†ÏÑ±Îä• SSD ‚ÄòPM9A3 E1.S‚ÄôÎ•º ÏñëÏÇ∞ÌïúÎã§Í≥† 24Ïùº Î∞ùÌòîÎã§. 

 

 OCPÎäî Í∏ÄÎ°úÎ≤å Îç∞Ïù¥ÌÑ∞ÏÑºÌÑ∞ Í¥ÄÎ†® Í∏∞ÏóÖÎì§Ïù¥ Ìö®Ïú®Ï†ÅÏù∏ Îç∞Ïù¥ÌÑ∞ÏÑºÌÑ∞ Í∞úÎ∞úÍ≥º Ïö¥ÏòÅÏóê ÌïÑÏöîÌïú ÌïòÎìúÏõ®Ïñ¥ÏôÄ ÏÜåÌîÑÌä∏Ïõ®Ïñ¥Ïùò ÌëúÏ§ÄÏùÑ Ï†ïÎ¶ΩÌïòÎäî Í∏∞Íµ¨Îã§. 

 

 Ïù¥Î≤à Ï†úÌíàÏùÄ ÏóÖÍ≥ÑÏµúÏ¥à 6ÏÑ∏ÎåÄ VÎÇ∏ÎìúÎ•º Í∏∞Î∞òÏúºÎ°ú Ìïú Îç∞Ïù¥ÌÑ∞ÏÑºÌÑ∞ Ï†ÑÏö© SSDÎ°ú, OCPÏùò NVMe Cloud SSD ÌëúÏ§ÄÏùÑ ÏßÄÏõêÌïòÎ©∞, Îç∞Ïù¥ÌÑ∞ÏÑºÌÑ∞ÏóêÏÑú ÏöîÍµ¨ÌïòÎäî ÏÑ±Îä•, Ï†ÑÎ†• Ìö®Ïú®, Î≥¥Ïïà Îì±ÏùÑ Í∞ÅÍ∞Å ÏµúÍ≥† ÏàòÏ§ÄÏùò ÏÜîÎ£®ÏÖòÏúºÎ°ú Ï†úÍ≥µÌïúÎã§. 

 

 ÌäπÌûà Ï†ÑÎ†• Ìö®Ïú®Ïù¥ ÏóÖÍ≥Ñ ÏµúÍ≥† ÏàòÏ§ÄÏúºÎ°ú ÎÜíÏïÑ Îç∞Ïù¥ÌÑ∞ÏÑºÌÑ∞ Ïö¥ÏòÅ ÎπÑÏö©ÏùÑ Ï†àÍ∞êÌï† Ïàò ÏûàÏúºÎ©∞, ÏµúÍ∑º ÌôîÎëêÍ∞Ä ÎêòÍ≥† ÏûàÎäî ÌÉÑÏÜå Ï†ÄÍ∞ê Ìö®Í≥ºÎèÑ Í∏∞ÎåÄÌï† Ïàò ÏûàÎã§. 

 

 PM9A3 E1.SÏùò Ï†ÑÎ†• Ìö®Ïú®ÏùÄ Ïó∞ÏÜçÏì∞Í∏∞ ÏÑ±Îä•ÏùÑ Í∏∞Ï§ÄÏúºÎ°ú Ìï†


