In [None]:
# Environment notes: Python 3.8, PyTorch 1.7, and Transformers 4.3/4.4.
# NOTE(review): the only active install below pins transformers==4.6.1, which does not
# match the 4.3/4.4 stated above — confirm which version is actually intended.
# Optional installs, currently disabled:
# !pip install --upgrade pip
# !pip install -qU torch==1.7.1
# !pip install tensorboard==2.5.0 tensorflow==2.5.0 tensorflow-datasets==4.0.1
!pip install -q transformers==4.6.1

In [None]:
# Fetch the project repository; later cells work inside the resulting ./cuad directory.
!git clone https://github.com/kzinmr/cuad.git

In [None]:
!cd cuad && mkdir data && cp data.zip data/ && cd data && unzip ./data.zip

In [None]:
# Create the directory for the roberta-base model outputs (-p: no error if it already exists).
!cd cuad && mkdir -p ./train_models/roberta-base

# CUAD Dataset
- Run this section in an environment with sufficient memory.

In [30]:
# filter by question type

import json

def filter_data(data, targets=None):
    """Keep only the questions whose text mentions one of the target names.

    The filtering happens in place: each paragraph's ``'qas'`` list is replaced
    by the sub-list of entries whose ``'question'`` string contains at least one
    element of ``targets`` as a substring. The resolved target list is printed
    and nothing is returned.

    Args:
        data: list of documents; each is a dict with a ``'paragraphs'`` list
            whose items are dicts holding a ``'qas'`` list of dicts with a
            ``'question'`` string.
        targets: substrings to look for in each question. Defaults to
            ``['Parties']``. A single string is treated as one target instead
            of being iterated character by character.
    """
    if targets is None:
        # Sentinel instead of a mutable list default shared between calls.
        targets = ['Parties']
    elif isinstance(targets, str):
        # A bare string would otherwise be matched per character and keep
        # almost every question.
        targets = [targets]
    print(targets)
    for d in data:
        for p in d['paragraphs']:
            p['qas'] = [qa for qa in p['qas'] if any(target in qa['question'] for target in targets)]

# Rewrite SquadV2Processor.get_train_examples so that it applies the filtering below
targets = [
    'Document Name',
    'Parties',
    'Agreement Date',
    'Effective Date',
    'Expiration Date',
    'Renewal Term',
    'Notice Period to Terminate Renewal',
]

# Read the full training set, then report the question count before and after filtering.
with open('./data/train_separate_questions.json') as reader:
    js = json.load(reader)
print(js.keys())
data, version = js["data"], js['version']
print(len([qa for d in data for p in d['paragraphs'] for qa in p['qas']]))
filter_data(data, targets)  # filters `data` in place
print(len([qa for d in data for p in d['paragraphs'] for qa in p['qas']]))

# (PARTIES)2012, (MASTER)4190
with open('./data/train_separate_questions_master.json', 'w') as writer:
    j = json.dumps({'data': data, 'version': version})
    writer.write(j)

dict_keys(['version', 'data'])
22450
['Document Name', 'Parties', 'Agreement Date', 'Effective Date', 'Expiration Date', 'Renewal Term', 'Notice Period to Terminate Renewal']
4190


In [31]:

# Same filtering as for the training set, applied to the test file.
with open('./data/test.json') as reader:
    js = json.load(reader)
print(js.keys())
data, version = js["data"], js['version']
print(len([qa for d in data for p in d['paragraphs'] for qa in p['qas']]))
filter_data(data, targets)  # filters `data` in place
print(len([qa for d in data for p in d['paragraphs'] for qa in p['qas']]))

# 102, 612
with open('./data/test_master.json', 'w') as writer:
    j = json.dumps({'data': data, 'version': version})
    writer.write(j)

dict_keys(['version', 'data'])
4182
['Document Name', 'Parties', 'Agreement Date', 'Effective Date', 'Expiration Date', 'Renewal Term', 'Notice Period to Terminate Renewal']
612


In [32]:
from transformers.data.processors.squad import SquadV2Processor

# Location of the filtered training file written by the filtering cell above.
data_dir = './data'
train_file = 'train_separate_questions_master.json'

# Parse the filtered JSON into training examples with the SQuAD v2 processor.
processor = SquadV2Processor()
examples = processor.get_train_examples(data_dir, filename=train_file)
# Displayed as the cell result; the recorded output (4190) equals the filtered question count.
len(examples)


100%|██████████| 408/408 [01:35<00:00,  4.27it/s]


4190

In [33]:
%%time
from transformers import (
    AutoTokenizer,
    squad_convert_examples_to_features
)
# Settings for the tokenizer. `output_dir` is defined here but not used in this cell.
output_dir = './train_models/roberta-base'
model_name_or_path = 'roberta-base'
tokenizer_name = model_name_or_path
do_lower_case=False
cache_dir=None
# use_fast=False requests the non-fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_name,
    do_lower_case=do_lower_case,
    cache_dir=cache_dir,
    use_fast=False,
)

# Feature-conversion settings. `evaluate` is False, so features are built for training.
evaluate=False
threads=4
max_seq_length=512
# NOTE(review): max_query_length is 256 here, while the training cell sets
# args.max_query_length=128 — confirm whether the two are meant to agree.
max_query_length=256
doc_stride=256

# Convert the examples into model inputs; returns both the feature list and a
# PyTorch dataset (return_dataset="pt"). The recorded run took about 30 minutes wall time.
features, dataset = squad_convert_examples_to_features(
    examples=examples,
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    doc_stride=doc_stride,
    max_query_length=max_query_length,
    is_training=not evaluate,
    return_dataset="pt",
    threads=threads,
)

convert squad examples to features: 100%|██████████| 4190/4190 [30:25<00:00,  2.30it/s] 
add example index and unique id: 100%|██████████| 4190/4190 [00:00<00:00, 22101.38it/s]


CPU times: user 1min 20s, sys: 1min 35s, total: 2min 56s
Wall time: 30min 50s


In [34]:
import os
import numpy as np
import torch
def get_dataset_pos_mask(dataset):
    """
    Returns a list, pos_mask, where pos_mask[i] is True if the ith example in the dataset is positive
    (i.e. it contains some text that should be highlighted) and False otherwise.

    Each example is indexed positionally: item 3 is read as the answer start position and
    item 4 as the answer end position. The entries of the returned list are plain Python
    bools, whether the positions are tensors or ordinary numbers.
    """
    pos_mask = []
    for i in range(len(dataset)):
        ex = dataset[i]
        start_pos = ex[3]
        end_pos = ex[4]
        # NOTE(review): an example whose start and end positions are equal is counted as
        # negative; if a one-token answer can produce start == end, it would be treated
        # as negative too — confirm that this is intended.
        pos_mask.append(bool(end_pos > start_pos))
    return pos_mask


def get_balanced_dataset(dataset, seed=None):
    """
    Returns a new dataset, where positive and negative examples are approximately balanced.

    All positive examples are kept; each negative example is kept with probability
    npos / nneg, so that in expectation as many negatives as positives remain.

    Args:
        dataset: indexable dataset accepted by get_dataset_pos_mask.
        seed: optional seed for the subsampling. When None (the default) the global
            numpy random state is used, exactly as before; when given, a private
            RandomState makes the selected subset reproducible.

    Returns:
        torch.utils.data.Subset over `dataset` restricted to the kept indices.
    """
    pos_mask = get_dataset_pos_mask(dataset)
    # `not` (rather than `~`) is a logical negation for both Python bools and bool tensors.
    neg_mask = [not is_pos for is_pos in pos_mask]
    npos, nneg = sum(pos_mask), sum(neg_mask)

    # So that in expectation there will be npos negative examples (--> balanced).
    # With no negatives there is nothing to subsample, and a division by zero is avoided.
    neg_keep_frac = npos / nneg if nneg else 0.0

    draw = np.random.random if seed is None else np.random.RandomState(seed).random_sample
    # One random draw per negative example only (short-circuit), as in the original.
    neg_keep_mask = [is_neg and draw() < neg_keep_frac for is_neg in neg_mask]

    # keep all positive examples and subset of negative examples
    keep_indices = [
        i for i, (is_pos, keep_neg) in enumerate(zip(pos_mask, neg_keep_mask)) if is_pos or keep_neg
    ]

    return torch.utils.data.Subset(dataset, keep_indices)

# Settings that determine the cache file name.
# NOTE(review): with cache_dir = '' the file is written relative to the current directory,
# while the training cell loads it from '/content/cuad/data' — presumably the file is
# moved between environments; confirm.
cache_dir = ''
model_name_or_path = 'roberta-base'
max_seq_length = 512

# balanced_subset_cached_<dev|train>_<last non-empty path component of the model name>_<max_seq_length>
subset_cached_features_file = os.path.join(
    cache_dir,
    "balanced_subset_cached_{}_{}_{}".format(
        "dev" if evaluate else "train",
        [part for part in model_name_or_path.split("/") if part][-1],
        str(max_seq_length),
    ),
)

# Subsample negatives, then persist the balanced subset under the "dataset" key.
b_dataset = get_balanced_dataset(dataset)
torch.save(dict(dataset=b_dataset), subset_cached_features_file)

# Training
- in GPU environment

In [None]:
import os
import torch
# Settings that determine the cache file name; they must yield the same name that was
# used when the balanced subset was saved.
cache_dir = '/content/cuad/data'
model_name_or_path = 'roberta-base'
max_seq_length = 512
evaluate = False

# balanced_subset_cached_<dev|train>_<last non-empty path component of the model name>_<max_seq_length>
subset_cached_features_file = os.path.join(
    cache_dir,
    "balanced_subset_cached_{}_{}_{}".format(
        "dev" if evaluate else "train",
        [part for part in model_name_or_path.split("/") if part][-1],
        str(max_seq_length),
    ),
)

# NOTE: torch.load unpickles the file; only load cache files you created yourself.
train_dataset = torch.load(subset_cached_features_file)["dataset"]
# Only the dataset is needed from here on.
features = examples = None
len(train_dataset)

In [None]:
from transformers import (
    AutoConfig,
    AutoModelForQuestionAnswering,
    AutoTokenizer,
)

# Everything (config, weights, tokenizer) is resolved from the same model identifier.
cache_dir = None
model_name_or_path = 'roberta-base'
config_name = model_name_or_path
tokenizer_name = model_name_or_path
do_lower_case = False

config = AutoConfig.from_pretrained(config_name, cache_dir=cache_dir)

# from_tf is enabled only when the identifier contains ".ckpt".
model = AutoModelForQuestionAnswering.from_pretrained(
    model_name_or_path,
    from_tf=".ckpt" in model_name_or_path,
    config=config,
    cache_dir=cache_dir,
)

# use_fast=False requests the non-fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_name, do_lower_case=do_lower_case, cache_dir=cache_dir, use_fast=False
)


In [None]:
# Change the working directory to the cloned repository; the following cells import
# `train` and `evaluate` and use paths relative to this directory.
%cd /content/cuad

In [None]:
from train import build_args, train
# Build the argument object from the repository's train module, then override
# selected fields for this run.
# NOTE(review): what build_args(notebook=True) sets by default is defined in train.py — confirm there.
args = build_args(notebook=True)
args.version_2_with_negative = True
args.do_train=True
# args.do_eval=True
args.max_seq_length=512
args.doc_stride=256
# NOTE(review): the features loaded as train_dataset were built with max_query_length=256
# in the conversion cell, but 128 is set here — confirm whether the two are meant to agree.
args.max_query_length=128
args.per_gpu_train_batch_size=8
args.logging_steps=1000
# args.per_gpu_eval_batch_size=32
# Move the model to the device chosen in args before training starts.
model.to(args.device)
args.num_train_epochs = 4
# Train on the pre-built balanced dataset; the call returns two values,
# bound here as the final step count and a loss value.
global_step, tr_loss = train(args, train_dataset, model, tokenizer)


In [None]:
# Persist the trained model, the tokenizer, and the training arguments side by side.
output_dir = '/content/cuad/train_models/roberta-base/'

# If the model is wrapped and exposes the underlying model as `.module`, save that instead.
model_to_save = getattr(model, "module", model)
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

torch.save(args, os.path.join(output_dir, "training_args.bin"))

# Evaluate

In [None]:
# Evaluate one saved checkpoint on the prediction file.
# NOTE(review): test_parties.json is not written by any cell visible in this notebook
# (the filtering cell writes test_master.json) — confirm which test file is intended.
args.predict_file='/content/cuad/data/test_parties.json'
args.version_2_with_negative=True
# NOTE: this import rebinds the notebook-level name `evaluate`, which earlier cells use
# as a boolean flag when building the cache file name.
from train import evaluate

checkpoint = os.path.join(args.output_dir, 'checkpoint-2000')
checkpoints = [checkpoint]
# Reload the model
# With a single checkpoint in the list, global_step is "" and so the prefix passed to
# evaluate() below is empty.
global_step =  checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)  # , force_download=True)
model.to(args.device)

# Evaluate
result = evaluate(args, model, tokenizer, prefix=global_step)

# Suffix every metric name with the step when one is set (a no-op for the empty step).
result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items())


In [None]:
# Run the repository's evaluate.py script with no arguments, from the current working directory.
!python evaluate.py

## predict

In [None]:
from evaluate import load_json, get_answers
# Compare the gold answers of the test file with the top predictions stored next to the model.
test_json_path = "./data/test_parties.json"
model_path = "./train_models/roberta-base"
save_dir = "./results"
# exist_ok=True replaces the check-then-create pattern: no error when the directory
# already exists, and missing parent directories are created too.
os.makedirs(save_dir, exist_ok=True)

gt_dict = load_json(test_json_path)
gt_dict = get_answers(gt_dict)

predictions_path = os.path.join(model_path, "nbest_predictions_.json")
name = model_path.split("/")[-1]
pred_dict = load_json(predictions_path)

# For every predicted key, print the gold answers followed by up to five predictions.
for k in pred_dict:
    print(k)
    print('GOLDs:', gt_dict[k])

    for i, p in enumerate(pred_dict[k][:5], 1):
        print(f'PRED@{i}:', p)
        print()

# Dataset

In [None]:
# !wget -O CUAD_v1.zip https://zenodo.org/record/4595826/files/CUAD_v1.zip

In [None]:
# CUAD_v1.json       full_contract_pdf  label_group_xlsx
# CUAD_v1_README.txt full_contract_txt  master_clauses.csv

# import zipfile
# with zipfile.ZipFile('CUAD_v1.zip') as zfp:
#     zfp.extractall('./')

In [27]:
# import json
# from collections import defaultdict
# def squadv2_formatter(jd):
#     data = jd['data']
#     print(len(data))
#     datasets = []
#     for d in data:
#         title = d['title']
#         paragraph = d['paragraphs'][0]
#         context = paragraph['context']
#         qas = paragraph['qas']
#         # 'is_impossible'
#         q2id = defaultdict(list)
#         q2text = defaultdict(list)
#         q2answer_start = defaultdict(list)
#         texts = []
#         answer_starts = []
#         for qa in qas:
#             qid = qa['id']
#             question = qa['question']
#             for ad in qa['answers']:
#                 text = ad['text']
#                 answer_start = ad['answer_start']
#                 texts.append(text)
#                 answer_starts.append(answer_start)
#             q2id[question].append(qid)
#             q2text[question].extend(texts)
#             q2answer_start[question].extend(answer_starts)
#         for question in q2id:
#             qid = q2id[question][0]
#             datasets.append({
#                 "answers": {
#                     "answer_start": q2answer_start[question],
#                     "text": q2text[question],
#                 },
#                 "context": context,
#                 "id": qid,
#                 "question": question,
#                 "title": title
#             })
#     print(len(datasets))
#     json_data = {"version": "v2.0", "data": datasets}
#     return json_data


# with open('./data/train_separate_questions.json') as reader:
#     jd = json.load(reader)
#     json_data = squadv2_formatter(jd)
# with open('./data/train.json', 'w') as writer:
#     writer.write(json.dumps(json_data))
#     writer.write('\n')
# with open('./data/test.json') as reader:
#     jd = json.load(reader)
#     json_data = squadv2_formatter(jd)
# with open('./data/test_new.json', 'w') as writer:
#     writer.write(json.dumps(json_data))
#     writer.write('\n')

408
16728
102
4182
