## BERT Training

- modeling.py
- optimization.py
- tokenization.py 
- run_squad.py
- run_classifier.py

`python run_squad.py \
  --vocab_file=$BERT_BASE_DIR/vocab.txt \
  --bert_config_file=$BERT_BASE_DIR/bert_config.json \
  --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
  --do_train=True \
  --train_file=$SQUAD_DIR/train-v1.1.json \
  --do_predict=True \
  --predict_file=$SQUAD_DIR/dev-v1.1.json \
  --train_batch_size=12 \
  --learning_rate=3e-5 \
  --num_train_epochs=2.0 \
  --max_seq_length=384 \
  --doc_stride=128 \
  --output_dir=/tmp/squad_base/`


`python run_classifier.py \
  --task_name=MRPC \
  --do_train=true \
  --do_eval=true \
  --data_dir=$GLUE_DIR/MRPC \
  --vocab_file=$BERT_BASE_DIR/vocab.txt \
  --bert_config_file=$BERT_BASE_DIR/bert_config.json \
  --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
  --max_seq_length=128 \
  --train_batch_size=32 \
  --learning_rate=2e-5 \
  --num_train_epochs=3.0 \
  --output_dir=/tmp/mrpc_output/`

In [0]:
import urllib.request
from google.colab import drive

drive.mount('/content/gdrive')

SAVE_DIR = '/content/gdrive/My Drive/ai-summit/'
def download(url, savedir=SAVE_DIR):
  file_name = url.split('/')[-1]
  download_file = savedir + file_name
  urllib.request.urlretrieve(url, download_file)

download('https://raw.githubusercontent.com/google-research/bert/master/modeling.py')
download('https://raw.githubusercontent.com/google-research/bert/master/optimization.py')
download('https://raw.githubusercontent.com/google-research/bert/master/tokenization.py')
download('https://raw.githubusercontent.com/google-research/bert/master/run_squad.py')
download('https://raw.githubusercontent.com/google-research/bert/master/run_classifier.py')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
!ls /content/gdrive/My\ Drive/ai-summit/

 BERT-training.ipynb   model	     optimization.py	 run_squad.py
'환경 설정.ipynb'      modeling.py   run_classifier.py	 tokenization.py


### merge train, dev files

In [0]:
import glob
import json
from collections import defaultdict

def jsonmerge(dir, output, pattern='*.json'):
  files = glob.glob(dir + pattern)
  merged = defaultdict(list)
  version = ''
  for f in files:
    print(f)
    with open(f, 'r', encoding='utf-8') as f:
      js = json.load(f)
      version = js['version']
      data = js['data']
      merged['data'].append(data)

  merged['version'] = version

  with open(output, 'w') as f:
    json.dump(merged, f)

    

jsonmerge(SAVE_DIR + 'model/train/', SAVE_DIR + 'train.json')
jsonmerge(SAVE_DIR + 'model/dev/', SAVE_DIR + 'dev.json')

!ls /content/gdrive/My\ Drive/ai-summit/


/content/gdrive/My Drive/ai-summit/model/train/korquad2.1_train_00.json
/content/gdrive/My Drive/ai-summit/model/train/korquad2.1_train_01.json
/content/gdrive/My Drive/ai-summit/model/train/korquad2.1_train_02.json
/content/gdrive/My Drive/ai-summit/model/train/korquad2.1_train_03.json
/content/gdrive/My Drive/ai-summit/model/train/korquad2.1_train_04.json
/content/gdrive/My Drive/ai-summit/model/train/korquad2.1_train_05.json
/content/gdrive/My Drive/ai-summit/model/train/korquad2.1_train_06.json
/content/gdrive/My Drive/ai-summit/model/train/korquad2.1_train_07.json
/content/gdrive/My Drive/ai-summit/model/train/korquad2.1_train_08.json
/content/gdrive/My Drive/ai-summit/model/train/korquad2.1_train_09.json
/content/gdrive/My Drive/ai-summit/model/train/korquad2.1_train_10.json
/content/gdrive/My Drive/ai-summit/model/train/korquad2.1_train_11.json
/content/gdrive/My Drive/ai-summit/model/train/korquad2.1_train_12.json
/content/gdrive/My Drive/ai-summit/model/train/korquad2.1_train_

In [0]:
## 기본 메모리 부족
import json
json_file = open('/content/gdrive/My Drive/ai-summit/train.json', encoding='utf-8')
json_data = json.load(json_file)

print(len(json_data['data']))

In [0]:
## SQuAD data

download('https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json', '/content/gdrive/My Drive/ai-summit/')

import json

def showJson(f, showcnt=0):
  json_file = open(f)
  json_data = json.load(json_file)
  cnt = len(json_data['data'])
  if showcnt >= cnt:
    showcnt = cnt
  print(json.dumps(json_data['data'][showcnt], indent=2))

In [0]:
showJson('/content/gdrive/My Drive/ai-summit/train-v2.0.json', 20)

{
  "title": "Wayback_Machine",
  "paragraphs": [
    {
      "qas": [
        {
          "question": "Where does the information stored on the Wayback Machine come from?",
          "id": "56ddb46c9a695914005b958e",
          "answers": [
            {
              "text": "World Wide Web",
              "answer_start": 48
            }
          ],
          "is_impossible": false
        },
        {
          "question": "Which company made the Wayback Machine?",
          "id": "56ddb46c9a695914005b958f",
          "answers": [
            {
              "text": "Internet Archive",
              "answer_start": 116
            }
          ],
          "is_impossible": false
        },
        {
          "question": "Where is Internet Archive headquartered?",
          "id": "56ddb46c9a695914005b9590",
          "answers": [
            {
              "text": "San Francisco",
              "answer_start": 169
            }
          ],
          "is_impossible": false
        

#### KorQuAD vs SQuAD

- KorQuAD
  1. ROOT : version, data(list)
  2. data : title, url, context, raw_html, qas
  3. qas(list) : answer(list), id, question
  4. answer : answer_start, html_answer_start, html_answer_text, text,

- SQuAD
  1. ROOT : version, data(list)
  2. data : title, paragraphs(list)
  3. paragraphs : context, qas
  4. qas(list) : answers, id, is_impossible, question
  5. answers : answer_start, text





In [0]:
def KorToStanford(dir, output, pattern='*.json', cnt=-1):
    files = glob.glob(dir + pattern)
    version = ''
    datalist = []
    for f in files:
        print(f)
        if cnt == 0:
          break
        json_file = open(f, encoding='utf-8')
        json_data = json.load(json_file)
        version = json_data['version']
        dataz = json_data['data']
        for data in dataz:
            if 'qas' in data.keys():
                paragraphs = {}
                qaslist = data['qas']
                for qas in qaslist:
                    qas['answer'].pop('html_answer_start')  # html_answer_start 삭제
                    qas['answer'].pop('html_answer_text')  # html_answer_text 삭제
                    qas['answers'] = [qas['answer']]  # answer --> answers
                    qas['is_impossible'] = False  # is_impossible 삽입
                    qas.pop('answer')  # answer 삭제
                paragraphs['title'] = data['title']
                paragraphs['paragraphs'] = [{'context': data['context'], 'qas': qaslist}]
                datalist.append(paragraphs)
                cnt -= 1

    changed = {'version': version, 'data': datalist}

    with open(output, 'w') as f:
        json.dump(changed, f)

In [0]:
KorToStanford('/content/gdrive/My Drive/ai-summit/model/train/', '/content/gdrive/My Drive/ai-summit/new_train.json', cnt=1000)
KorToStanford('/content/gdrive/My Drive/ai-summit/model/dev/', '/content/gdrive/My Drive/ai-summit/new_dev.json', cnt=300)

/content/gdrive/My Drive/ai-summit/model/train/korquad2.1_train_00.json
/content/gdrive/My Drive/ai-summit/model/train/korquad2.1_train_01.json
/content/gdrive/My Drive/ai-summit/model/dev/korquad2.1_dev_00.json
/content/gdrive/My Drive/ai-summit/model/dev/korquad2.1_dev_01.json
/content/gdrive/My Drive/ai-summit/model/dev/korquad2.1_dev_02.json
/content/gdrive/My Drive/ai-summit/model/dev/korquad2.1_dev_03.json
/content/gdrive/My Drive/ai-summit/model/dev/korquad2.1_dev_04.json


In [0]:
showJson('/content/gdrive/My Drive/ai-summit/new_train.json', 34)

{
  "title": "\uc778\uc81c\uc74d",
  "paragraphs": [
    {
      "context": "<!DOCTYPE html>\n<html>\n<head>\n<meta>\n<title>\uc778\uc81c\uc74d - \uc704\ud0a4\ubc31\uacfc, \uc6b0\ub9ac \ubaa8\ub450\uc758 \ubc31\uacfc\uc0ac\uc804</title>\n\n\n<link>\n\n<meta>\n<link>\n<meta>\n<meta>\n<meta>\n<meta>\n<meta>\n<link>\n<link>\n<link>\n<link>\n<link>\n<link>\n<link>\n<link>\n<link>\n<link>\n<link>\n\n</head>\n<body>\n<div></div>\n<div></div>\n<div>\n<a></a>\n<div></div>\n<div>\n</div>\n<h1>\uc778\uc81c\uc74d</h1>\n<div>\n<div>\uc704\ud0a4\ubc31\uacfc, \uc6b0\ub9ac \ubaa8\ub450\uc758 \ubc31\uacfc\uc0ac\uc804.</div>\n<div></div>\n<div></div>\n<a>\ub458\ub7ec\ubcf4\uae30\ub85c \uac00\uae30</a>\n<a>\uac80\uc0c9\ud558\ub7ec \uac00\uae30</a>\n<div><div><table>\n<caption><span><a>\uac15\uc6d0\ub3c4</a> <a>\uc778\uc81c\uad70</a></span>\n</caption>\n<tbody><tr>\n<th colspan=\"2\"><span><b>\uc778\uc81c\uc74d</b></span>\n</th></tr>\n<tr>\n<td colspan=\"2\">\u9e9f\u8e44\u9091 Inje-eup\n</td></tr>\n<tr>\

In [0]:
showJson('/content/gdrive/My Drive/ai-summit/new_dev.json', 20)

{
  "title": "\uae40\uc0c1\ud6c8_(1963\ub144)",
  "paragraphs": [
    {
      "context": "<!DOCTYPE html>\n<html>\n<head>\n<meta>\n<title>\uae40\uc0c1\ud6c8 (1963\ub144) - \uc704\ud0a4\ubc31\uacfc, \uc6b0\ub9ac \ubaa8\ub450\uc758 \ubc31\uacfc\uc0ac\uc804</title>\n\n\n<link>\n\n<meta>\n<link>\n<meta>\n<meta>\n<meta>\n<meta>\n<link>\n<link>\n<link>\n<link>\n<link>\n<link>\n<link>\n<link>\n<link>\n<link>\n<link>\n\n</head>\n<body>\n<div></div>\n<div></div>\n<div>\n<a></a>\n<div></div>\n<div>\n</div>\n<h1>\uae40\uc0c1\ud6c8 (1963\ub144)</h1>\n<div>\n<div>\uc704\ud0a4\ubc31\uacfc, \uc6b0\ub9ac \ubaa8\ub450\uc758 \ubc31\uacfc\uc0ac\uc804.</div>\n<div></div>\n<div></div>\n<a>\ub458\ub7ec\ubcf4\uae30\ub85c \uac00\uae30</a>\n<a>\uac80\uc0c9\ud558\ub7ec \uac00\uae30</a>\n<div><div><table><tbody><tr><th colspan=\"2\"><div><div></div><span>\uae40\uc0c1\ud6c8</span></div></th></tr><tr><th colspan=\"2\">\uae30\ubcf8 \uc815\ubcf4</th></tr><tr><th>\uad6d\uc801</th>\n<td>\n\ub300\ud55c\ubbfc\uad6d</td>

### Training

In [0]:
!python /content/gdrive/My\ Drive/ai-summit/run_squad.py \
--vocab_file=/content/gdrive/My\ Drive/ai-summit/model/multi_cased_L-12_H-768_A-12/vocab.txt \
--bert_config_file=/content/gdrive/My\ Drive/ai-summit/model/multi_cased_L-12_H-768_A-12/bert_config.json \
--init_checkpoint=/content/gdrive/My\ Drive/ai-summit/model/multi_cased_L-12_H-768_A-12/bert_model.ckpt \
--do_train=True \
--train_file=/content/gdrive/My\ Drive/ai-summit/new_train.json \
--do_predict=True \
--predict_file=/content/gdrive/My\ Drive/ai-summit/new_dev.json \
--train_batch_size=16 \
--learning_rate=3e-5 \
--num_train_epochs=2.0 \
--max_seq_length=256 \
--doc_stride=128 \
--output_dir=/content/gdrive/My\ Drive/ai-summit/tmp \
--do_lower_case=False

Output hidden; open in https://colab.research.google.com to view.