In [2]:
import os
import re
import torch
import pandas as pd
from torch import nn
from tqdm import tqdm
from transformers import (
    Trainer,
    TrainingArguments,
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    DataCollatorWithPadding,
    BitsAndBytesConfig,
)
from tokenizers.processors import TemplateProcessing
from trl import SFTConfig, SFTTrainer
from peft import (
    prepare_model_for_kbit_training, 
    LoraConfig, 
    get_peft_model,
    TaskType,
)
from datasets import load_dataset
from accelerate import Accelerator

  from .autonotebook import tqdm as notebook_tqdm


# Îç∞Ïù¥ÌÑ∞ÏÖã Ï†ïÏùò

In [2]:
repo = 'yjgwak/klue-bert-base-finetuned-squad-kor-v1'
tokenizer = AutoTokenizer.from_pretrained(repo)
dataset = load_dataset("csv", data_files="/home/jovyan/work/prj_data/open/train.csv")
max_length = 512
stride = 128

def preprocess_function(examples):
    questions, contexts, answers = examples["question"], examples["context"], examples["answer"]
    def preprocess_text(text):
        text = text.replace('\n', ' ')
        text = re.sub(r'\s+', ' ', text).strip()
        return text
    questions = list(map(preprocess_text, questions))
    contexts = list(map(preprocess_text, contexts))
    answers = list(map(preprocess_text, answers))

    inputs = tokenizer(
        questions,
        contexts,
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # offset_mapping: [(token1 start, token1 end), (token2 ~, ), ...]
    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs['overflow_to_sample_mapping']# inputs.pop("overflow_to_sample_mapping")
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]

        start_char = contexts[sample_idx].find(answer)
        end_char = start_char + len(answer)
        # sequence_ids: (token=None, question=0, context=1)
        sequence_ids = inputs.sequence_ids(i)

        # Ïª®ÌÖçÏä§Ìä∏Ïùò ÏãúÏûë Î∞è ÎßàÏßÄÎßâÏùÑ Ï∞æÎäîÎã§.
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # ÎßåÏùº Ï†ïÎãµÏù¥ Ïª®ÌÖçÏä§Ìä∏Ïóê ÏôÑÏ†ÑÌûà Ìè¨Ìï®ÎêòÏßÄ ÏïäÎäîÎã§Î©¥, Î†àÏù¥Î∏îÏùÄ (0, 0)ÏûÑ
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Í∑∏Î†áÏßÄ ÏïäÏúºÎ©¥ Ï†ïÎãµÏùò ÏãúÏûë Î∞è ÎßàÏßÄÎßâ Ïù∏Îç±Ïä§
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)
    
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# Îç∞Ïù¥ÌÑ∞ ÌîÑÎ†àÏûÑÏùÑ Ï†ÑÏ≤òÎ¶¨Ìï©ÎãàÎã§
# preprocess_function(dataset["train"][6])# [15281]
train_dataset = dataset["train"].map(
    preprocess_function,
    batched = True,
    remove_columns=dataset["train"].column_names,
)

# roBERTaÏóêÏÑúÎäî ÏÇ≠Ï†ú, BERTÏóêÏÑúÎäî Ï§ëÏöîÌïú Ïó≠Ìï†
# train_dataset = train_dataset.remove_columns("token_type_ids")

In [3]:
idx = 9
sample_idx = train_dataset["overflow_to_sample_mapping"][idx]
answer = dataset['train']['answer'][sample_idx]

start = train_dataset['start_positions'][idx]
end = train_dataset['end_positions'][idx]
labeled_answer = tokenizer.decode(train_dataset["input_ids"][idx][start : end + 1])

print(f"Theoretical answer: {answer}, labels give: {labeled_answer}")
print(tokenizer.decode(train_dataset["input_ids"][idx]))

Theoretical answer: 6ÏÑ∏ÎåÄ, labels give: 6ÏÑ∏ÎåÄ
[CLS] PM9A3 E1. SÍ∞Ä Í∏∞Î∞òÌïú VÎÇ∏ÎìúÎäî Î™á ÏÑ∏ÎåÄÏù∏Í∞Ä [SEP] ÏÇºÏÑ±Ï†ÑÏûêÍ∞Ä OCP ( Ïò§Ìîà Ïª¥Ìì®Ìä∏ ÌîÑÎ°úÏ†ùÌä∏ ) Ïùò Í∑úÍ≤©ÏùÑ ÎßåÏ°±ÌïòÎäî Îç∞Ïù¥ÌÑ∞ÏÑºÌÑ∞ Ï†ÑÏö© Í≥†ÏÑ±Îä• SSD ‚Äò PM9A3 E1. S ‚Äô Î•º ÏñëÏÇ∞ÌïúÎã§Í≥† 24Ïùº Î∞ùÌòîÎã§. OCPÎäî Í∏ÄÎ°úÎ≤å Îç∞Ïù¥ÌÑ∞ÏÑºÌÑ∞ Í¥ÄÎ†® Í∏∞ÏóÖÎì§Ïù¥ Ìö®Ïú®Ï†ÅÏù∏ Îç∞Ïù¥ÌÑ∞ÏÑºÌÑ∞ Í∞úÎ∞úÍ≥º Ïö¥ÏòÅÏóê ÌïÑÏöîÌïú ÌïòÎìúÏõ®Ïñ¥ÏôÄ ÏÜåÌîÑÌä∏Ïõ®Ïñ¥Ïùò ÌëúÏ§ÄÏùÑ Ï†ïÎ¶ΩÌïòÎäî Í∏∞Íµ¨Îã§. Ïù¥Î≤à Ï†úÌíàÏùÄ ÏóÖÍ≥ÑÏµúÏ¥à 6ÏÑ∏ÎåÄ VÎÇ∏ÎìúÎ•º Í∏∞Î∞òÏúºÎ°ú Ìïú Îç∞Ïù¥ÌÑ∞ÏÑºÌÑ∞ Ï†ÑÏö© SSDÎ°ú, OCPÏùò NVMe Cloud SSD ÌëúÏ§ÄÏùÑ ÏßÄÏõêÌïòÎ©∞, Îç∞Ïù¥ÌÑ∞ÏÑºÌÑ∞ÏóêÏÑú ÏöîÍµ¨ÌïòÎäî ÏÑ±Îä•, Ï†ÑÎ†• Ìö®Ïú®, Î≥¥Ïïà Îì±ÏùÑ Í∞ÅÍ∞Å ÏµúÍ≥† ÏàòÏ§ÄÏùò ÏÜîÎ£®ÏÖòÏúºÎ°ú Ï†úÍ≥µÌïúÎã§. ÌäπÌûà Ï†ÑÎ†• Ìö®Ïú®Ïù¥ ÏóÖÍ≥Ñ ÏµúÍ≥† ÏàòÏ§ÄÏúºÎ°ú ÎÜíÏïÑ Îç∞Ïù¥ÌÑ∞ÏÑºÌÑ∞ Ïö¥ÏòÅ ÎπÑÏö©ÏùÑ Ï†àÍ∞êÌï† Ïàò ÏûàÏúºÎ©∞, ÏµúÍ∑º ÌôîÎëêÍ∞Ä ÎêòÍ≥† ÏûàÎäî ÌÉÑÏÜå Ï†ÄÍ∞ê Ìö®Í≥ºÎèÑ Í∏∞ÎåÄÌï† Ïàò ÏûàÎã§. PM9A3 E1. SÏùò Ï†ÑÎ†• Ìö®Ïú®ÏùÄ Ïó∞ÏÜçÏì∞Í∏∞ ÏÑ±Îä•ÏùÑ Í∏∞Ï§ÄÏúºÎ°ú

# Î™®Îç∏ Ï†ïÏùò

In [3]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
print("start")
model = AutoModelForQuestionAnswering.from_pretrained(
        repo,
        device_map="cuda:0",
        torch_dtype=torch.float32,
        quantization_config=quantization_config,
)
print("end")
model.resize_token_embeddings(len(tokenizer))

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["query", "key", "value"],
    task_type="QUESTION_ANSWERING"
)

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

accelerater = Accelerator()
model, tokenizer = accelerater.prepare(model, tokenizer)

start


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


end


# Loss Ï†ïÏùò

# ÌïôÏäµ

In [4]:
import wandb
wandb.login()

torch.cuda.empty_cache()
training_args = TrainingArguments(
    output_dir="BERT",
    evaluation_strategy="no",
    num_train_epochs=3,
    save_steps=0.1,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,
    max_grad_norm=1.0,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    fp16=False
)

# Trainer ÏÑ§Ï†ï
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33muijinkim[0m. Use [1m`wandb login --relogin`[0m to force relogin
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [5]:
trainer.train()





Step,Training Loss
10,4.4678
20,3.9352
30,3.9121
40,3.3505
50,3.7749
60,4.3121
70,3.7168
80,3.3343
90,3.3564
100,2.8424




TrainOutput(global_step=8991, training_loss=1.051508852569986, metrics={'train_runtime': 9374.7496, 'train_samples_per_second': 15.342, 'train_steps_per_second': 0.959, 'total_flos': 3.797296535699251e+16, 'train_loss': 1.051508852569986, 'epoch': 3.0})

# Inference

In [1]:
import torch
from torch import nn
import pandas as pd
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
from tqdm import tqdm

CHECK_POINT = "/home/jovyan/work/ai_chat_qa_task/code/huggingface/BERT/checkpoint-8991"
TEST_fOLDER = '/home/jovyan/work/prj_data/open/test.csv'
OUTPUT = "test"

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
csv = pd.read_csv(TEST_fOLDER)

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Î™®Îç∏ Î∞è ÌÜ†ÌÅ¨ÎÇòÏù¥Ï†Ä Î°úÎìú
config = PeftConfig.from_pretrained(CHECK_POINT)
model = AutoModelForQuestionAnswering.from_pretrained(
    config.base_model_name_or_path,
    # quantization_config=quantization_config,
    device_map="cuda:0",
    torch_dtype=torch.float16
)
# model = PeftModel.from_pretrained(model, CHECK_POINT)
model.eval()

tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
# tokenizer.add_tokens(['ÏúÑÏ±ó', 'ÎåÄÎßå ÏÑúÎ∂Ä', 'Î®ÄÏò§Î¶¨ÌòÑ', '(Áà≠Ë®ü)'])
# model.resize_token_embeddings(len(tokenizer))

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import re
TEST_fOLDER = '/home/jovyan/work/prj_data/open/test.csv'
csv = pd.read_csv(TEST_fOLDER)

def get_prediction(question, context):
    inputs = tokenizer(
        question,
        context,
        max_length=512,
        return_tensors="pt",
        truncation="only_second",
        stride=256,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    # inputs.pop("token_type_ids")
    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

    
    with torch.no_grad():
        outputs = model(**inputs)

    # Î™®Îì† Ï≤≠ÌÅ¨Ïóê ÎåÄÌïú start/end Î°úÏßì Í∞ÄÏ†∏Ïò§Í∏∞
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits
    
    # Í∞ÄÏû• ÎÜíÏùÄ Ï†êÏàòÏùò ÎãµÎ≥Ä Ï∞æÍ∏∞
    max_answer_score = -float('inf')
    best_answer = ""

    for i in range(len(start_logits)):
        start_indexes = torch.argsort(start_logits[i], descending=True)[:20]
        end_indexes = torch.argsort(end_logits[i], descending=True)[:20]
        
        for start_index in start_indexes:
            for end_index in end_indexes:
                # ÎãµÎ≥ÄÏùò Í∏∏Ïù¥Î•º 50 ÌÜ†ÌÅ∞ÏúºÎ°ú Ï†úÌïú
                if end_index < start_index or end_index - start_index + 1 > 50 or end_index - start_index <= 1:
                    continue
                # ÎãµÎ≥ÄÏù¥ CLSÏùºÎïåÎäî Ï†úÏô∏
                if start_index==0 or end_index==0:
                    continue
                
                answer_score = start_logits[i][start_index] + end_logits[i][end_index]
                if answer_score > max_answer_score:
                    max_answer_score = answer_score
                    best_answer = tokenizer.decode(inputs["input_ids"][i][start_index:end_index+1])
                answer = tokenizer.decode(inputs["input_ids"][i][start_index:end_index+1])
    return best_answer
    
submission_dict = {}
for _, row in tqdm(csv.iterrows(), total=len(csv)):
    answer = get_prediction(row['question'], row['context'])
    
    def clean_prediction(text):
        special_tokens = list(tokenizer.special_tokens_map.values())
        pattern = '|'.join(map(re.escape, special_tokens))
        cleaned_text = re.sub(pattern, '', text)
        cleaned_text = ' '.join(cleaned_text.split())
        return cleaned_text

    answer = clean_prediction(answer)
    submission_dict[row['id']] = answer
    print(f"ID: {row['id']} Question: {row['question']} Generated answer: {answer}")
    break
    
df = pd.DataFrame(list(submission_dict.items()), columns=['id', 'answer'])

  0%|          | 0/1507 [00:00<?, ?it/s]

ID: TEST_0000 Question: Ïñ¥Îñ§ Í∏∞Í¥ÄÏù¥ ÏßÄÏó≠ Ï§ëÏÜåÍ∏∞ÏóÖÍ≥º ÏÜåÏÉÅÍ≥µÏù∏Îì§ÏóêÍ≤å ÌÅ∞ ÌûòÏù¥ ÎêòÎäî ÌÜµÏÉÅÏßÑÌù•ÏõêÏù∏Í∞ÄÏöî Generated answer: Í≤ΩÏ†úÌÜµÏÉÅÏßÑÌù•Ïõê





In [4]:
df.to_csv('bert_stride256.csv', index=False)

In [None]:
Ìï†Í±∞: roberta_v2 + answerÎ≥ÄÍ≤Ω(or Ï†úÏùº Ï¢ãÏïòÎçò Î™®Îç∏) -> illuni/illuni-llama-2-ko-7b -> guided text generation