# Changelogs

| Version  | Loss | EM Score | F1 Score | Changes | Comment |
|----------|----------|----------|--------------|---------|---------|
| (1106)v5 | 1.5378 | 42.6 |  42.6  |  initial baseline using KLUE-Roberta-base | 데이터 3천개만 사용 |
| (1107)v1 | 0.679 | 0.221 | 0.6171 | data split ratio:20%, weight decay:2e-5, max answer len:200 추가 | |

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [5]:
!pwd

/home/ubuntu/kylee


# Load & Prepare(KLUE)

In [10]:
# PATH='/content/drive/MyDrive/ML/2021AiHubCompetition_1107'
PATH='/home/ubuntu/kylee'
KLUE_workspace = '/KLUE-baseline'
%cd $PATH

/home/ubuntu/kylee


In [None]:
# !git clone https://github.com/katie0809/KLUE-baseline.git
%cd $PATH$KLUE_workspace
!git pull

In [None]:
!git status

### Install libraries

In [None]:
!pip install -qr $PATH$KLUE_workspace'/requirements.txt'

In [None]:
!pip install -q transformers
!pip install -q torchtext==0.8.0 torch==1.7.1 pytorch-lightning==1.2.2

In [None]:
!pip install -q torchtext==0.8.0

In [6]:
import numpy as np
import pandas as pd
import os
from pathlib import Path
import seaborn as sns
from sklearn.model_selection import train_test_split
import json
from tqdm import tqdm
import re
import datetime
import sys
import hashlib
import pickle
import yaml

import torch
from torch.utils.data import Dataset, DataLoader
# import torchtext

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import *
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import *

import sklearn
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.model_selection import StratifiedKFold

import transformers
from transformers import BertTokenizer,AdamWeightDecay,TFRobertaModel,TFBertModel,BertConfig
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AdamW

# ignore warnings filter
from warnings import simplefilter
simplefilter('ignore')

global bert_inputs, docs, qnas

pd.set_option('display.max_colwidth', None)

2021-11-21 08:56:47.988755: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1


In [7]:
print(tf.__version__)
print(np.__version__)
print(pd.__version__)
print(keras.__version__)
print(sklearn.__version__)
print(transformers.__version__)
# print(pororo.__version__)
print(torch.__version__)

2.3.0
1.18.5
1.3.4
2.4.0
0.24.1
4.11.3
1.6.0


### Variables

In [11]:
model_name = 'klue/roberta-base'
pretrained_model_name = 'aihubresearch_bert_v1'
pretrained_dir = PATH+'/klue-baseline'

data_dir = PATH+'/qa_dataset'
train_file = data_dir + '/train.csv'
test_file = data_dir + '/test.csv'
samplesub_file = data_dir + '/sample_submission.csv'

build_dir = PATH+'/build'
# output_dir = build_dir / model_name
# trn_encoded_file = output_dir / 'trn.enc.joblib'
# val_predict_file = output_dir / f'{model_name}.val.txt'
submission_file = 'sub.csv'

id_col = 'id'
text_col = 'excerpt'
target_col = 'target'

max_len = 1024
ans_max = 200 # 답변 최대 글자수는 200
epochs = 1
n_fold = 5
n_est = 9
n_stop = 2
batch_size = 8
seed = 42

now = datetime.datetime.now()
year, month, day, hour, min = now.year, now.month, now.day, now.hour, now.minute
display(f'{month}월 {day}일 {hour}시 {min}분 실행')

version = f'{day}{hour}'

'11월 21일 8시 57분 실행'

In [12]:
version

'218'

### Check & Clear GPU Usage

In [None]:
!nvidia-smi

In [None]:
import torch
import gc

torch.cuda.empty_cache()
gc.collect()

torch.cuda.memory_summary(device=None, abbreviated=False)

# 데이터 전처리

In [None]:
def preprocess(sentence):
  ret = re.sub('[-=+,#/\:^$@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]', '', sentence) # 특수문자 제거
  ret = re.sub(r'[\n\r\t]+', ' ', sentence) # 개행문자 제거
  ret = ret.strip() # 양쪽공백 제거
  return ret

In [None]:
def createTrainSet(docs, qs):
    infos, errs = [], []
    numQ = len(qs)

    doc_ids = docs.index
    titles, contexts = docs['title'], docs['context']
    questions, answers = qs['question'], qs['answer']
    ans_starts = qs['answer_start']

    for idx in tqdm(range(numQ)):
      try:
        doc_id = qs['doc_id'][idx]
        if doc_id in doc_ids:
          curttl = preprocess(titles[doc_id])
          curctx = contexts[doc_id]
          curans = preprocess(curctx[int(ans_starts[idx]):int(ans_starts[idx])+len(answers[idx])])
          curctx = preprocess(curctx)
          ans_start = curctx.find(curans)

          # 논문 원문 자른다. 답변 구간을 포함한 1500자로 자른다.
          offset_st = 0 if ans_start < 300 else np.random.randint(ans_start-300, ans_start)
          offset_end = offset_st + 1200
          curctx = curctx[offset_st:offset_end]

          # 답변의 시작, 끝지점값을 조정된 논문에 맞춰 다시 계산한다.
          ans_start -= offset_st
          ans_end = ans_start+len(curans)

          infos.append({
              'title': curttl,
              'text': curctx,
              'question': questions[idx],
              'answer': curans[:ans_max],
              'answer_start': ans_start,
              'answer_end': ans_end
          })
          
        else:
          errs.append({'idx': idx, 'doc_id': doc_id})
      except Exception as e:
        errs.append({'idx': idx, 'err': e})

    return infos, errs

#### Create new train set

In [None]:
docs = pd.read_csv(PATH+'/documents.csv', index_col=0)
qnas = pd.read_csv(PATH+'/questions.csv')
train, errs = createTrainSet(docs, qnas)

In [None]:
errs

In [None]:
pd.DataFrame(train).to_csv('aihub-mrc-v'+version+'_train.csv', index=False)

#### Load saved train set

In [None]:
train = pd.read_csv('aihub-mrc-v1423_train.csv')
display(train)

In [None]:
del docs, qnas

### 데이터 확인

In [None]:
train.isnull().sum()

- 데이터셋의 answer문장이 context에 제대로 포함되는지 확인
- answer_start가 실제 answer시작점과 일치하는지 확인

In [None]:
for i, el in tqdm(enumerate(train.itertuples())):
  ii = str(el.text).find(el.answer)
  if ii == -1 or ii != el.answer_start:
    print(el)

검증셋 분리

In [None]:
x_train, x_valid = train_test_split(train, test_size=0.2, random_state=42)
print(len(x_train), len(x_valid))

In [None]:
x_train.to_csv(PATH+'/x_train.csv', index=False)
x_valid.to_csv(PATH+'/x_valid.csv', index=False)

In [None]:
x_train = pd.read_csv(PATH+'/x_train.csv')
x_valid = pd.read_csv(PATH+'/x_valid.csv')

In [None]:
display(x_train, x_valid)

In [None]:
x_train[:10000].to_csv(PATH+'/x_train_light.csv', index=False)
x_valid[:1000].to_csv(PATH+'/x_valid_light.csv', index=False)

In [None]:
# del x_train, x_valid

# Train(KoBigbird)

### Load & Prepare(KoBigbird)

In [None]:
Bigbird_workspace = '/KoBigBird'
model_name = "monologg/kobigbird-bert-base"

In [None]:
!git clone https://github.com/monologg/KoBigBird.git
%cd $PATH$Bigbird_workspace
!git pull

In [None]:
!git status

### Install libraries

In [None]:
!pip install transformers==4.11.3
!pip install sentencepiece
!pip install -qr $PATH$Bigbird_workspace'/finetune/requirements.txt'
# !pip install torch==1.8.1
# !pip install git+https://github.com/vasudevgupta7/transformers.git@add_big_bird # TODO: replace with new pip version eventually
# !sudo apt-get install liblzma-dev

In [None]:
import torch
import torch.utils.data as torch_data

### Train

In [None]:
!python $PATH$Bigbird_workspace'/finetune/run.py' \
--task 'qa' \
--dataset 'korquad_2' \
--do_train \
--data_dir $PATH \
--train_file 'x_train_light.csv' \
--predict_file 'x_valid_light.csv' \
--model_name_or_path 'monologg/kobigbird-bert-base' \
--output_dir $PATH'/output' \
--data_dir $PATH \
--learning_rate 3e-5 \
--weight_decay 2e-5 \
--num_train_epochs 1 \
--train_batch_size 1 \
--max_seq_length 2048 \
--doc_stride 128 \
--max_answer_length 2048 \
--gradient_accumulation_steps 4

In [13]:
OUTPUT_DIR = f"{PATH}/output/qa/korquad_2/0-0-ckpt"
OUTPUT_DIR = "/home/ubuntu/2021AiHub-ODQA/models/korquad_2/0-2-ckpt"

In [29]:
from transformers import BigBirdTokenizer, BigBirdForQuestionAnswering, AutoModelForQuestionAnswering, AutoTokenizer
import torch

# model = BigBirdForQuestionAnswering.from_pretrained('google/bigbird-base-trivia-itc')
model = AutoModelForQuestionAnswering.from_pretrained(OUTPUT_DIR)
# tokenizer = AutoTokenizer.from_pretrained('monologg/kobigbird-bert-base')
tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)

In [33]:
model = AutoModelForQuestionAnswering.from_pretrained(
    '/home/ubuntu/2021AiHub-ODQA/models/korquad_2/0-0-ckpt',
)
tokenizer = AutoTokenizer.from_pretrained(
    '/home/ubuntu/2021AiHub-ODQA/models/korquad_2/0-0-ckpt', 
    use_fast=False
)

# Train(Klue)

In [None]:
model_name = 'klue/roberta-base'
!python $PATH$KLUE_workspace'/run_klue.py' train --task 'klue-mrc' --data_split_ratio 10 --model_name_or_path $model_name --output_dir $PATH'/output' --data_dir $PATH --learning_rate 3e-5 --weight_decay 2e-5 --num_train_epochs 1 --train_batch_size 1 --eval_batch_size 1 --patience 100000 --max_seq_length 510 --max_answer_length 200 --metric_key rouge_w --gpus 0 --num_workers 4 --verbose_step_count 1000

In [None]:
OUTPUT_DIR = f"{PATH}/output/klue-mrc/version_23"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(OUTPUT_DIR+'/transformers/')

# Save trained pytorch model to a tensorflow model

In [None]:
!pip install onnx-tf

In [None]:
import onnx 
from onnx_tf.backend import prepare
from torch.autograd import Variable

print(onnx.__version__)

Pytorch model -> onnx model

In [None]:
# torch.save(model.state_dict(), PATH+'/output/mnist.pth')

In [None]:
model.load_state_dict(torch.load(PATH+'/output/mnist.pth'))

In [None]:
# Dummy inputs
encodings = tokenizer("이름은 무엇인가?", 
                      "안녕하세요 제 이름은 이경임입니다. 룰루랄라 랄라랄라", 
                      max_length=510, 
                      truncation=True,
                      padding="max_length", 
                      return_token_type_ids=False)

# Export the trained model to ONNX
torch.onnx.export(model, 
                  (encodings['input_ids'], encodings['attention_mask']), 
                  PATH+"/output/mnist.onnx", 
                  opset_version=11)

Onnx model -> Tensorflow model

In [None]:
model = onnx.load(PATH+'/output/mnist.onnx')

# Import the ONNX model to Tensorflow
tf_rep = prepare(model)

In [None]:
tf_rep.export_graph(PATH+'/output/mnist.pb')

# 성능 측정

예측함수

In [39]:
def _prediction(contexts, questions):
    result = []
    
    with torch.no_grad():
        for context, question in tqdm(zip(contexts, questions)):
            # print(context, question)
            encodings = tokenizer(question, 
                                  context, 
                                  max_length=510, 
                                  truncation=True,
                                  padding="max_length", 
                                  return_token_type_ids=False)
            encodings = {key: torch.tensor([val]) for key, val in encodings.items()}
            
            input_ids = encodings["input_ids"]
            attention_mask = encodings["attention_mask"]
            # print(input_ids, attention_mask)
            outputs = model(input_ids, attention_mask=attention_mask)
            start_logits, end_logits = outputs[0], outputs[1]
            token_start_index, token_end_index = start_logits.argmax(dim=-1), end_logits.argmax(dim=-1)
            pred_ids = input_ids[0][token_start_index: token_end_index + 1]
            pred = tokenizer.decode(pred_ids)
            # print(pred)
            result.append(pred)

    return result

def prediction(contexts, questions):
    result = []
    
    with torch.no_grad():
        for context, question in tqdm(zip(contexts, questions)):
            # print(context, question)
            encodings = tokenizer(question, 
                                  context, 
                                 return_tensors="pt",
                                 padding=True,
                                 truncation=True)
#             encodings = {key: torch.tensor([val]) for key, val in encodings.items()}
            
            input_ids = encodings["input_ids"]
            token_type_ids = encodings["token_type_ids"]
            attention_mask = encodings["attention_mask"]
            
            outputs = model(input_ids=input_ids,token_type_ids=token_type_ids,attention_mask=attention_mask)

            start_logits, end_logits = outputs[0], outputs[1]
            token_start_index, token_end_index = start_logits.argmax(dim=-1), end_logits.argmax(dim=-1)
            pred = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(encodings["input_ids"][0][token_start_index: token_end_index + 1]))
            pred = pred.replace("#","")
            result.append(pred)

    return result

In [28]:
def print_prds(ctx, qus, prds):
  wrong_prds = []
  for i, prd in enumerate(prds):
    if prd != '' and prd != '[CLS]':
      print(f'\nSAMPLE [{i}] ===============================>')
      display(ctx[i][:400])
      print(f'\nQUESTION ===============> ', qus[i])
      print(f'PREDICTED ANSWER =======> ', prd)
    else:
      wrong_prds.append(i)
  
  print(f'\WRONG [{len(wrong_prds)}] ===============================>')
  print(wrong_prds)

print_prds(ctx, qus, prds)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]


F1 Score, EM Score

In [3]:
def normalize_answer(sentence):
  ret = re.sub('[-=+,#/\:^$@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]', '', sentence) # 특수문자 제거
  ret = re.sub(r'[\n\r\t]+', ' ', sentence) # 개행문자 제거
  ret = ret.strip() # 양쪽공백 제거
  return ret

def get_tokens(s):
  if not s: return []
  return normalize_answer(s).split()

def compute_exact(a_gold, a_pred):
  return int(normalize_answer(a_gold) == normalize_answer(a_pred))

def compute_f1(a_gold, a_pred):
  gold_toks = get_tokens(a_gold)
  pred_toks = get_tokens(a_pred)
  common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
  num_same = sum(common.values())
  if len(gold_toks) == 0 or len(pred_toks) == 0:
    # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
    return int(gold_toks == pred_toks)
  if num_same == 0:
    return 0
  precision = 1.0 * num_same / len(pred_toks)
  recall = 1.0 * num_same / len(gold_toks)
  f1 = (2 * precision * recall) / (precision + recall)
  return f1


In [34]:
x_valid = pd.read_csv('/home/ubuntu/2021AiHub-ODQA/models/x_valid_light.csv')

In [35]:
ctx = x_valid['text'].tolist()
qus = x_valid['question'].tolist()
ans = x_valid['answer'].tolist()

In [36]:
stpos, endpos = 10, 12

In [40]:
prds = prediction(ctx[stpos:endpos], qus[stpos:endpos])

2it [00:02,  1.25s/it]


In [41]:
qus[:12], prds[stpos:endpos]

(['상류층의 의상과 대중들의 의상의 경계가 확실했던 대립관계의 경계가 해체되기 시작한 계기는 무엇인가?',
  '자기주도 학습이란?',
  '단색 배색은 무엇인가?',
  '사고선박의 저항은 어떻게 계산하는가?',
  '치수 및 형태가 적합한 장갑의 착용은 보호 장구로서의 역할을 하지만 부적절하게 설계된 장갑은 사용시 어떤 문제점을 가지는가?',
  '에너지진단이란 무엇인가?',
  '정부가 학교폭력 근절 종합대책을 발표하게 된 계기로 어떤 사건이 있었나요?',
  '단안 영상을 통해 3 차원의 입체 영상을 생성하기 위한 관련 연구에는 무엇이 있는가?',
  '암호기술은 어디에 사용되는가?',
  '에너지 관련 프로토콜에는 무엇이 있는가?',
  '하천식생은 어떤 역할을 하는가?',
  '열전소재에는 무엇이 있는가?'],
 [])

In [25]:
prds[:10]

['[CLS]',
 '[CLS]',
 '[CLS]',
 '[CLS]',
 '[CLS]',
 '[CLS]',
 '[CLS]',
 '[CLS]',
 '[CLS]',
 '[CLS]']

In [None]:
_len, f1scores, emscores = len(prds), [], []
for i in tqdm(range(_len)):
  f1scores.append(compute_f1(ans[i], prds[i]))
  emscores.append(compute_exact(ans[i], prds[i]))

In [None]:
scores = [0, 0]
for i in range(_len):
  scores[0] += f1scores[i]
  scores[1] += emscores[i]

print(f'[Calculated Scores for {_len} data]')
print(f'===> F1: {scores[0]/_len}, EM: {scores[1]/_len}')

# Backups

In [None]:
"""Official evaluation script for SQuAD version 2.0.

In addition to basic functionality, we also compute additional statistics and
plot precision-recall curves if an additional na_prob.json file is provided.
This file is expected to map question ID's to the model's predicted probability
that a question is unanswerable.
"""
import argparse
import collections
import json
import numpy as np
import os
import re
import string
import sys

OPTS = None

def make_qid_to_has_ans(dataset):
  qid_to_has_ans = {}
  for article in dataset:
    for p in article['paragraphs']:
      for qa in p['qas']:
        qid_to_has_ans[qa['id']] = bool(qa['answers'])
  return qid_to_has_ans

def normalize_answer(s):
  """Lower text and remove punctuation, articles and extra whitespace."""
  def remove_articles(text):
    regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
    return re.sub(regex, ' ', text)
  def white_space_fix(text):
    return ' '.join(text.split())
  def remove_punc(text):
    exclude = set(string.punctuation)
    return ''.join(ch for ch in text if ch not in exclude)
  def lower(text):
    return text.lower()
  return white_space_fix(remove_articles(remove_punc(lower(s))))

def get_tokens(s):
  if not s: return []
  return normalize_answer(s).split()

def compute_exact(a_gold, a_pred):
  return int(normalize_answer(a_gold) == normalize_answer(a_pred))

def compute_f1(a_gold, a_pred):
  gold_toks = get_tokens(a_gold)
  pred_toks = get_tokens(a_pred)
  common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
  num_same = sum(common.values())
  if len(gold_toks) == 0 or len(pred_toks) == 0:
    # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
    return int(gold_toks == pred_toks)
  if num_same == 0:
    return 0
  precision = 1.0 * num_same / len(pred_toks)
  recall = 1.0 * num_same / len(gold_toks)
  f1 = (2 * precision * recall) / (precision + recall)
  return f1

def get_raw_scores(dataset, preds):
  exact_scores = {}
  f1_scores = {}
  for article in dataset:
    for p in article['paragraphs']:
      for qa in p['qas']:
        qid = qa['id']
        gold_answers = [a['text'] for a in qa['answers']
                        if normalize_answer(a['text'])]
        if not gold_answers:
          # For unanswerable questions, only correct answer is empty string
          gold_answers = ['']
        if qid not in preds:
          print('Missing prediction for %s' % qid)
          continue
        a_pred = preds[qid]
        # Take max over all gold answers
        exact_scores[qid] = max(compute_exact(a, a_pred) for a in gold_answers)
        f1_scores[qid] = max(compute_f1(a, a_pred) for a in gold_answers)
  return exact_scores, f1_scores

def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
  new_scores = {}
  for qid, s in scores.items():
    pred_na = na_probs[qid] > na_prob_thresh
    if pred_na:
      new_scores[qid] = float(not qid_to_has_ans[qid])
    else:
      new_scores[qid] = s
  return new_scores

def make_eval_dict(exact_scores, f1_scores, qid_list=None):
  if not qid_list:
    total = len(exact_scores)
    return collections.OrderedDict([
        ('exact', 100.0 * sum(exact_scores.values()) / total),
        ('f1', 100.0 * sum(f1_scores.values()) / total),
        ('total', total),
    ])
  else:
    total = len(qid_list)
    return collections.OrderedDict([
        ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total),
        ('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total),
        ('total', total),
    ])

def merge_eval(main_eval, new_eval, prefix):
  for k in new_eval:
    main_eval['%s_%s' % (prefix, k)] = new_eval[k]

def plot_pr_curve(precisions, recalls, out_image, title):
  plt.step(recalls, precisions, color='b', alpha=0.2, where='post')
  plt.fill_between(recalls, precisions, step='post', alpha=0.2, color='b')
  plt.xlabel('Recall')
  plt.ylabel('Precision')
  plt.xlim([0.0, 1.05])
  plt.ylim([0.0, 1.05])
  plt.title(title)
  plt.savefig(out_image)
  plt.clf()

def make_precision_recall_eval(scores, na_probs, num_true_pos, qid_to_has_ans,
                               out_image=None, title=None):
  qid_list = sorted(na_probs, key=lambda k: na_probs[k])
  true_pos = 0.0
  cur_p = 1.0
  cur_r = 0.0
  precisions = [1.0]
  recalls = [0.0]
  avg_prec = 0.0
  for i, qid in enumerate(qid_list):
    if qid_to_has_ans[qid]:
      true_pos += scores[qid]
    cur_p = true_pos / float(i+1)
    cur_r = true_pos / float(num_true_pos)
    if i == len(qid_list) - 1 or na_probs[qid] != na_probs[qid_list[i+1]]:
      # i.e., if we can put a threshold after this point
      avg_prec += cur_p * (cur_r - recalls[-1])
      precisions.append(cur_p)
      recalls.append(cur_r)
  if out_image:
    plot_pr_curve(precisions, recalls, out_image, title)
  return {'ap': 100.0 * avg_prec}

def run_precision_recall_analysis(main_eval, exact_raw, f1_raw, na_probs, 
                                  qid_to_has_ans, out_image_dir):
  if out_image_dir and not os.path.exists(out_image_dir):
    os.makedirs(out_image_dir)
  num_true_pos = sum(1 for v in qid_to_has_ans.values() if v)
  if num_true_pos == 0:
    return
  pr_exact = make_precision_recall_eval(
      exact_raw, na_probs, num_true_pos, qid_to_has_ans,
      out_image=os.path.join(out_image_dir, 'pr_exact.png'),
      title='Precision-Recall curve for Exact Match score')
  pr_f1 = make_precision_recall_eval(
      f1_raw, na_probs, num_true_pos, qid_to_has_ans,
      out_image=os.path.join(out_image_dir, 'pr_f1.png'),
      title='Precision-Recall curve for F1 score')
  oracle_scores = {k: float(v) for k, v in qid_to_has_ans.items()}
  pr_oracle = make_precision_recall_eval(
      oracle_scores, na_probs, num_true_pos, qid_to_has_ans,
      out_image=os.path.join(out_image_dir, 'pr_oracle.png'),
      title='Oracle Precision-Recall curve (binary task of HasAns vs. NoAns)')
  merge_eval(main_eval, pr_exact, 'pr_exact')
  merge_eval(main_eval, pr_f1, 'pr_f1')
  merge_eval(main_eval, pr_oracle, 'pr_oracle')

def histogram_na_prob(na_probs, qid_list, image_dir, name):
  if not qid_list:
    return
  x = [na_probs[k] for k in qid_list]
  weights = np.ones_like(x) / float(len(x))
  plt.hist(x, weights=weights, bins=20, range=(0.0, 1.0))
  plt.xlabel('Model probability of no-answer')
  plt.ylabel('Proportion of dataset')
  plt.title('Histogram of no-answer probability: %s' % name)
  plt.savefig(os.path.join(image_dir, 'na_prob_hist_%s.png' % name))
  plt.clf()

def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
  num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
  cur_score = num_no_ans
  best_score = cur_score
  best_thresh = 0.0
  qid_list = sorted(na_probs, key=lambda k: na_probs[k])
  for i, qid in enumerate(qid_list):
    if qid not in scores: continue
    if qid_to_has_ans[qid]:
      diff = scores[qid]
    else:
      if preds[qid]:
        diff = -1
      else:
        diff = 0
    cur_score += diff
    if cur_score > best_score:
      best_score = cur_score
      best_thresh = na_probs[qid]
  return 100.0 * best_score / len(scores), best_thresh

def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
  best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans)
  best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans)
  main_eval['best_exact'] = best_exact
  main_eval['best_exact_thresh'] = exact_thresh
  main_eval['best_f1'] = best_f1
  main_eval['best_f1_thresh'] = f1_thresh

def main():
  with open(OPTS.data_file) as f:
    dataset_json = json.load(f)
    dataset = dataset_json['data']
  with open(OPTS.pred_file) as f:
    preds = json.load(f)
  if OPTS.na_prob_file:
    with open(OPTS.na_prob_file) as f:
      na_probs = json.load(f)
  else:
    na_probs = {k: 0.0 for k in preds}
  qid_to_has_ans = make_qid_to_has_ans(dataset)  # maps qid to True/False
  has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
  no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
  exact_raw, f1_raw = get_raw_scores(dataset, preds)
  exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans,
                                        OPTS.na_prob_thresh)
  f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans,
                                     OPTS.na_prob_thresh)
  out_eval = make_eval_dict(exact_thresh, f1_thresh)
  if has_ans_qids:
    has_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=has_ans_qids)
    merge_eval(out_eval, has_ans_eval, 'HasAns')
  if no_ans_qids:
    no_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids)
    merge_eval(out_eval, no_ans_eval, 'NoAns')
  if OPTS.na_prob_file:
    find_all_best_thresh(out_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans)
  if OPTS.na_prob_file and OPTS.out_image_dir:
    run_precision_recall_analysis(out_eval, exact_raw, f1_raw, na_probs, 
                                  qid_to_has_ans, OPTS.out_image_dir)
    histogram_na_prob(na_probs, has_ans_qids, OPTS.out_image_dir, 'hasAns')
    histogram_na_prob(na_probs, no_ans_qids, OPTS.out_image_dir, 'noAns')
  if OPTS.out_file:
    with open(OPTS.out_file, 'w') as f:
      json.dump(out_eval, f)
  else:
    print(json.dumps(out_eval, indent=2))


In [None]:
train_full_raw = pd.read_csv('train_full_raw.csv')

In [None]:
train_full_raw.keys()

In [None]:
# train_full_raw[train_full_raw['context'].apply(lambda x : '부부별산제' in x)]
train_full_raw[train_full_raw['context'].apply(lambda x : len(x) > 10000)].index

In [None]:
idx = 512
samplet, sampleq, samplea = train_full_raw.iloc[idx]['context'], train_full_raw.iloc[idx]['question'], train_full_raw.iloc[idx]['answer']
txtlen = len(samplet)

In [None]:
start, offset, padding = 0, 1500, 300
end = start + offset
print(sampleq)
while True:
  if start > txtlen:
    break
  # print(samplet[start:end])
  print(f'[PRED] {prediction([samplet[start:end]], [sampleq])}')
  start = start + offset - padding
  end = start + offset

In [None]:
prediction([samplet], [sampleq])

In [None]:
samplea