In [None]:
# Attach Google Drive; the dataset, tokenizer and output paths used by this
# notebook all live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/gdrive/'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/gdrive/


In [None]:
!pip install torch transformers datasets

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/b5/d5/c6c23ad75491467a9a84e526ef2364e523d45e2b0fae28a7cbe8689e7e84/transformers-4.8.1-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 5.7MB/s 
[?25hCollecting datasets
[?25l  Downloading https://files.pythonhosted.org/packages/08/a2/d4e1024c891506e1cee8f9d719d20831bac31cb5b7416983c4d2f65a6287/datasets-1.8.0-py3-none-any.whl (237kB)
[K     |████████████████████████████████| 245kB 51.0MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 64.0MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_1

In [None]:
import json
from pathlib import Path
from tqdm import tqdm
from transformers.data.processors.squad import SquadExample, squad_convert_examples_to_features

def create_examples(input_data, set_type):
    """Flatten SQuAD-style articles into a list of ``SquadExample`` objects.

    Args:
        input_data: Iterable of article dicts. Each article is read for a
            ``"title"`` and a ``"paragraphs"`` list; each paragraph for a
            ``"context"`` and a ``"qas"`` list; each QA entry for ``"id"``,
            ``"question"``, an optional ``"is_impossible"`` flag and (when
            the question is answerable) ``"answers"``.
        set_type: ``"train"`` selects training mode; any other value is
            handled as evaluation.

    Returns:
        A list with one ``SquadExample`` per QA entry, in input order.
    """
    training = set_type == "train"
    collected = []

    for article in tqdm(input_data):
        article_title = article["title"]
        for para in article["paragraphs"]:
            passage = para["context"]
            for qa in para["qas"]:
                question_id = qa["id"]
                question = qa["question"]
                unanswerable = qa.get("is_impossible", False)

                # Defaults describe a question that carries no answer data.
                gold_text, gold_start, reference_answers = None, None, []
                if not unanswerable and training:
                    # Training uses only the first listed answer span.
                    first_answer = qa["answers"][0]
                    gold_text = first_answer["text"]
                    gold_start = first_answer["answer_start"]
                elif not unanswerable:
                    # Evaluation keeps every reference answer.
                    reference_answers = qa["answers"]

                collected.append(
                    SquadExample(
                        qas_id=question_id,
                        question_text=question,
                        context_text=passage,
                        answer_text=gold_text,
                        start_position_character=gold_start,
                        title=article_title,
                        is_impossible=unanswerable,
                        answers=reference_answers,
                    )
                )
    return collected

In [None]:
# File name used when the caller passes ``filename=None``.
DEFAULT_SQUAD_FILENAME = "ko_nia_normal_squad_all.json"


def load_squad_data(data_dir, filename):
    """Read a SQuAD-format JSON file and return its top-level ``"data"`` list.

    Args:
        data_dir: Directory containing the file. ``None`` or ``""`` means the
            current working directory.
        filename: Name of the JSON file inside ``data_dir``. ``None`` falls
            back to ``DEFAULT_SQUAD_FILENAME``.

    Returns:
        The value stored under the ``"data"`` key of the parsed JSON.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If the JSON document has no ``"data"`` key.
    """
    if filename is None:
        filename = DEFAULT_SQUAD_FILENAME
    # Join with pathlib rather than string concatenation: `"" + "/" + name`
    # would yield "/name" (filesystem root) instead of a relative path.
    base_dir = Path(data_dir) if data_dir else Path()
    path = base_dir / filename

    with open(path, "rb") as f:
        return json.load(f)["data"]


def get_train_examples(data_dir, filename=None):
    """Load ``filename`` from ``data_dir`` and build training examples.

    Args:
        data_dir: Directory containing the file (``None`` = working directory).
        filename: JSON file name (``None`` = ``DEFAULT_SQUAD_FILENAME``).

    Returns:
        Whatever ``create_examples`` returns for ``set_type="train"``.
    """
    return create_examples(load_squad_data(data_dir, filename), "train")


def get_dev_examples(data_dir, filename=None):
    """Load ``filename`` from ``data_dir`` and build evaluation examples.

    Args:
        data_dir: Directory containing the file (``None`` = working directory).
        filename: JSON file name (``None`` = ``DEFAULT_SQUAD_FILENAME``).

    Returns:
        Whatever ``create_examples`` returns for ``set_type="dev"``.
    """
    return create_examples(load_squad_data(data_dir, filename), "dev")

In [None]:
# Parse the training split stored on Drive into example objects.
TRAIN_DATA_DIR = "/content/gdrive/MyDrive/Colab_Notebooks/NLP_QA/QA_Dataset/data3"
TRAIN_FILENAME = "train.json"
train_examples = get_train_examples(TRAIN_DATA_DIR, TRAIN_FILENAME)

100%|██████████| 9882/9882 [00:23<00:00, 413.55it/s]


In [None]:
cd /content/gdrive/MyDrive/Colab_Notebooks/NLP_QA/KB-ALBERT-KO/kb-albert-char

/content/gdrive/MyDrive/Colab_Notebooks/NLP_QA/KB-ALBERT-KO/kb-albert-char


In [None]:
from transformers import AlbertForQuestionAnswering, TrainingArguments, Trainer
from tokenization_kbalbert import KbAlbertCharTokenizer

# Checkpoint directory, relative to the current working directory. The same
# path is reused later to load the question-answering model weights.
MODEL_PATH = "./model"
# Load the tokenizer from the checkpoint directory.
tokenizer = KbAlbertCharTokenizer.from_pretrained(MODEL_PATH)  

In [None]:
# Sequence-length settings handed to the feature converter.
max_length, doc_stride, max_query_length = 384, 128, 64

# Convert the parsed examples into model inputs. The converter returns a
# (features, dataset) pair; the dataset is what the training DataLoader consumes.
conversion_result = squad_convert_examples_to_features(
    examples=train_examples,
    tokenizer=tokenizer,
    max_seq_length=max_length,
    doc_stride=doc_stride,
    max_query_length=max_query_length,
    is_training=True,
    return_dataset="pt",
)
train_features, train_dataset = conversion_result

# valid_features, valid_dataset = squad_convert_examples_to_features(
#     examples=train_examples,
#     tokenizer=tokenizer,
#     max_seq_length=max_length,
#     doc_stride=doc_stride,
#     max_query_length=max_query_length,
#     is_training=True,
#     return_dataset="pt",
# )

convert squad examples to features: 100%|██████████| 50023/50023 [25:41<00:00, 32.45it/s]
add example index and unique id: 100%|██████████| 50023/50023 [00:00<00:00, 317492.87it/s]


In [None]:
import time
import logging
import torch
from torch.utils.data import DataLoader, RandomSampler
from torch.utils.tensorboard import SummaryWriter
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import trange
logger = logging.getLogger(__name__)

# --- Hyperparameters ---------------------------------------------------------
train_batch_size = 16
gradient_accumulation_steps = 1
num_train_epochs = 3.0
max_grad_norm = 1.0
weight_decay = 0.0   # applied to every parameter not matched by `no_decay`
logging_steps = 50   # optimizer steps between TensorBoard scalar writes

tb_writer = SummaryWriter()

# --- Data and model ----------------------------------------------------------
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=train_batch_size)
model = AlbertForQuestionAnswering.from_pretrained(MODEL_PATH)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# --- Optimizer and schedule --------------------------------------------------
# Parameters whose name contains one of these substrings never get weight decay.
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": weight_decay,
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5, eps=1e-8)

# The epoch loop below runs int(num_train_epochs) epochs, so derive the
# scheduler horizon from that same integer to keep it an exact step count.
steps_per_epoch = len(train_dataloader) // gradient_accumulation_steps
t_total = steps_per_epoch * int(num_train_epochs)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=t_total
)

logger.info("***** Running training *****")
logger.info("  Num examples = %d", len(train_dataset))
logger.info("  Num Epochs = %d", num_train_epochs)
logger.info("  Gradient Accumulation steps = %d", gradient_accumulation_steps)
logger.info("  Total optimization steps = %d", t_total)

# --- Training loop -----------------------------------------------------------
global_step = 1
epochs_trained = 0
steps_trained_in_current_epoch = 0
tr_loss, logging_loss = 0.0, 0.0
model.zero_grad()
# Nothing in the loop switches the model to eval mode, so set train mode once.
model.train()

train_iterator = trange(epochs_trained, int(num_train_epochs), desc="Epoch")

try:
    for _ in train_iterator:
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)

            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "start_positions": batch[3],
                "end_positions": batch[4],
            }

            outputs = model(**inputs)

            loss = outputs[0]
            if gradient_accumulation_steps > 1:
                # Average over the accumulated micro-batches so the effective
                # gradient scale does not grow with the accumulation factor.
                loss = loss / gradient_accumulation_steps
            loss.backward()

            tr_loss += loss.item()
            if (step + 1) % gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1

                if global_step % logging_steps == 0:
                    tb_writer.add_scalar("lr", scheduler.get_last_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / logging_steps, global_step)
                    logging_loss = tr_loss
finally:
    # Close the writer exactly once, after training ends or is interrupted;
    # closing it inside the batch loop would shut it after the first step.
    tb_writer.close()

Some weights of the model checkpoint at ./model were not used when initializing AlbertForQuestionAnswering: ['predictions.dense.bias', 'sop_classifier.classifier.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.bias', 'sop_classifier.classifier.bias', 'predictions.decoder.bias', 'predictions.LayerNorm.weight', 'predictions.decoder.weight']
- This IS expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForQuestionAnswering were not initialized from the model checkpoint at ./model and are newly initialized: ['qa_outputs

In [None]:
# Persist the fine-tuned model to Google Drive.
MODEL_OUTPUT_DIR = "/content/gdrive/MyDrive/Colab_Notebooks/NLP_QA/KBALBERT_MODEL_V2"
model.save_pretrained(MODEL_OUTPUT_DIR)

In [None]:
# Store the tokenizer files in the same Drive directory the model is saved to,
# so the directory can be loaded as one self-contained checkpoint.
TOKENIZER_OUTPUT_DIR = "/content/gdrive/MyDrive/Colab_Notebooks/NLP_QA/KBALBERT_MODEL_V2"
tokenizer.save_pretrained(TOKENIZER_OUTPUT_DIR)