In [None]:
# Python 3.8, PyTorch 1.7, and Transformers 4.3/4.4.
# !pip install --upgrade pip
# !pip install -qU torch==1.7.1
!pip install -q transformers==4.6.1

In [None]:
!pip install tensorboard==2.5.0 tensorflow==2.5.0 tensorflow-datasets==4.0.1

In [None]:
# Clone the kzinmr/cuad repository into ./cuad (relative to the notebook's working directory).
!git clone https://github.com/kzinmr/cuad.git

In [None]:
!cd cuad && mkdir data && cp data.zip data/ && cd data && unzip ./data.zip

In [None]:
# Create cuad/train_models/roberta-base; -p makes this a no-op when it already exists.
# The same relative path is passed as --output_dir to train.py later in this notebook.
!cd cuad && mkdir -p ./train_models/roberta-base

# Training

In [None]:
import os
import torch
# Settings that determine the name of the cached, class-balanced feature file.
evaluate = False              # False -> "train" cache, True -> "dev" cache
max_seq_length = 512
model_name_or_path = 'roberta-base'
cache_dir = ''                # empty string -> path is relative to the working directory

# File name pattern: balanced_subset_cached_<split>_<last model path component>_<seq len>
subset_cached_features_file = os.path.join(
    cache_dir,
    "balanced_subset_cached_{}_{}_{}".format(
        "train" if not evaluate else "dev",
        [part for part in model_name_or_path.split("/") if part][-1],
        str(max_seq_length),
    ),
)

# The file is produced by the balancing cell further down in this notebook
# (torch.save of a dict with a "dataset" key), so that cell has to run first.
# torch.load unpickles its input: only load cache files you created yourself.
dataset = torch.load(subset_cached_features_file)["dataset"]
examples = None
features = None

In [None]:
from transformers import (
    AutoConfig,
    AutoModelForQuestionAnswering,
    AutoTokenizer,
)

# One identifier is used for the config, the weights and the tokenizer.
model_name_or_path = 'roberta-base'
config_name = model_name_or_path
tokenizer_name = model_name_or_path
cache_dir = None
do_lower_case = False

config = AutoConfig.from_pretrained(config_name, cache_dir=cache_dir)

# from_tf is switched on only when the identifier contains ".ckpt".
model = AutoModelForQuestionAnswering.from_pretrained(
    model_name_or_path,
    config=config,
    cache_dir=cache_dir,
    from_tf=(".ckpt" in model_name_or_path),
)
# model.to(device)

# Slow (pure-Python) tokenizer is requested explicitly via use_fast=False.
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_name,
    use_fast=False,
    cache_dir=cache_dir,
    do_lower_case=do_lower_case,
)

# Rebuild the cache file name and load the balanced training subset from it.
# `evaluate`, `os` and `torch` come from an earlier cell; model_name_or_path is the
# value assigned above in this cell ('roberta-base').
max_seq_length = 512
cache_dir = ''                # empty string -> path is relative to the working directory

subset_cached_features_file = os.path.join(
    cache_dir,
    "balanced_subset_cached_{}_{}_{}".format(
        "train" if not evaluate else "dev",
        [part for part in model_name_or_path.split("/") if part][-1],
        str(max_seq_length),
    ),
)

# torch.load unpickles its input: only load cache files you created yourself.
train_dataset = torch.load(subset_cached_features_file)["dataset"]

In [None]:
from train import build_args, train
import torch  # imported here so the cell does not depend on an earlier cell's import

args = build_args(notebook=True)
# 'gpu' is not a device type PyTorch recognises (CUDA devices are spelled 'cuda'), so
# any .to('gpu') call raises. Use 'cuda' when it is available and fall back to 'cpu'.
# NOTE(review): confirm train() does not special-case the literal string 'gpu'.
args.device = 'cuda' if torch.cuda.is_available() else 'cpu'
# The model-loading cell leaves `model.to(device)` commented out, so move the model
# here. NOTE(review): drop this line if train() already moves the model itself.
model.to(args.device)
global_step, tr_loss = train(args, train_dataset, model, tokenizer)

# CUAD Dataset

In [None]:
import json

def filter_data(data, targets=('Parties',)):
    """Keep only the questions that mention at least one target string.

    Modifies ``data`` in place: the ``'qas'`` list of every paragraph is replaced by
    the entries whose ``'question'`` text contains any of ``targets`` (case-sensitive
    substring match). Nothing is returned.

    Args:
        data: Iterable of dicts, each holding a ``'paragraphs'`` list whose items
            carry a ``'qas'`` list of dicts with a ``'question'`` string.
        targets: Substring(s) to look for. A single string is treated as one target
            instead of being iterated character by character.
    """
    if isinstance(targets, str):
        targets = (targets,)
    else:
        # Materialise once so one-shot iterables are not exhausted after the first question.
        targets = tuple(targets)

    for document in data:
        for paragraph in document['paragraphs']:
            paragraph['qas'] = [
                qa for qa in paragraph['qas']
                if any(target in qa['question'] for target in targets)
            ]

# Rewrite SquadV2Processor.get_train_examples so that it applies the filtering below.
# Both files are opened as UTF-8 explicitly instead of relying on the platform's
# default text encoding.
with open('./data/train_separate_questions.json', encoding='utf-8') as reader:
    js = json.load(reader)

print(js.keys())
data = js["data"]
version = js['version']
# Question count before and after keeping only the 'Parties' questions.
print(sum(len(p['qas']) for d in data for p in d['paragraphs']))
filter_data(data, targets=['Parties'])
print(sum(len(p['qas']) for d in data for p in d['paragraphs']))

with open('./data/train_separate_questions_parties.json', 'w', encoding='utf-8') as writer:
    j = json.dumps({'data': data, 'version': version})
    writer.write(j)

In [None]:
# Command-line run of train.py with --do_train and --do_eval: trains from the
# Parties-only file written above (--train_file), uses ./data/test.json as the
# prediction file, and writes to ./train_models/roberta-base (overwriting it).
# Paths are relative, so the working directory must contain train.py and ./data.
# !cd cuad && ./run.sh
!python train.py --output_dir ./train_models/roberta-base --model_type roberta --model_name_or_path roberta-base --train_file ./data/train_separate_questions_parties.json --predict_file ./data/test.json --do_train --do_eval --version_2_with_negative --learning_rate 1e-4 --num_train_epochs 4 --per_gpu_eval_batch_size=40 --per_gpu_train_batch_size=40 --max_seq_length 512 --max_answer_length 128 --doc_stride 128 --save_steps 1000 --n_best_size 20 --overwrite_output_dir --threads 4
# Alternative setting that was tried: --max_answer_length 512

In [None]:
from transformers.data.processors.squad import SquadV2Processor

# Read the Parties-only training file through the SQuAD v2 processor.
train_file = 'train_separate_questions_parties.json'
data_dir = './data'

processor = SquadV2Processor()
examples = processor.get_train_examples(data_dir, filename=train_file)

# Cell output: number of examples returned by the processor.
len(examples)


In [None]:
%%time
from transformers import (
    AutoTokenizer,
    squad_convert_examples_to_features
)
# --- settings ----------------------------------------------------------------
model_name_or_path = 'roberta-base'
tokenizer_name = model_name_or_path
output_dir = './train_models/roberta-base'
cache_dir = None
do_lower_case = False

evaluate = False              # False -> features are built with is_training=True
max_seq_length = 512
max_query_length = 256
# NOTE(review): the train.py command line in this notebook passes --doc_stride 128,
# while this cell uses 256 - confirm which value is intended.
doc_stride = 256
threads = 4

# --- tokenizer ---------------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_name,
    use_fast=False,
    cache_dir=cache_dir,
    do_lower_case=do_lower_case,
)

# --- examples -> (features, PyTorch dataset) ---------------------------------
features, dataset = squad_convert_examples_to_features(
    examples=examples,
    tokenizer=tokenizer,
    is_training=not evaluate,
    max_seq_length=max_seq_length,
    max_query_length=max_query_length,
    doc_stride=doc_stride,
    threads=threads,
    return_dataset="pt",
)

In [None]:
import os
import numpy as np
import torch
def get_dataset_pos_mask(dataset):
    """
    Build a per-example list of "is positive" flags for `dataset`.

    Entry i is the result of comparing field 4 (end position) with field 3 (start
    position) of dataset[i]: the example counts as positive - it contains text that
    should be highlighted - when the end position is greater than the start position.
    """
    def _is_positive(example):
        # Fields 3 and 4 hold the start and end positions respectively.
        return example[4] > example[3]

    return [_is_positive(dataset[idx]) for idx in range(len(dataset))]
def get_balanced_dataset(dataset, seed=None):
    """
    Return a subset of `dataset` in which positives and negatives are roughly balanced.

    Every example flagged positive by `get_dataset_pos_mask` is kept. Each negative
    example is kept independently with probability npos / nneg, so the expected number
    of retained negatives equals the number of positives.

    Args:
        dataset: Indexable dataset accepted by `get_dataset_pos_mask`.
        seed: Optional seed for a private `numpy.random.RandomState`, making the
            selection reproducible. With the default `None`, draws come from NumPy's
            global random state, exactly as before.

    Returns:
        A `torch.utils.data.Subset` of `dataset` holding the kept indices in
        ascending order.
    """
    # bool() normalises Python bools as well as 0-dim tensors / NumPy bools. The former
    # `~mask` is a logical NOT only for the latter; on a plain bool it yields -2 / -1,
    # which are both truthy and corrupt the negative count.
    is_positive = [bool(flag) for flag in get_dataset_pos_mask(dataset)]
    npos = sum(is_positive)
    nneg = len(is_positive) - npos

    # With no negatives there is nothing to subsample; avoid dividing by zero.
    neg_keep_frac = npos / nneg if nneg else 0.0

    if seed is None:
        draw = np.random.random
    else:
        draw = np.random.RandomState(seed).random_sample

    # Keep every positive; a random number is drawn for negatives only.
    keep_indices = [
        idx for idx, positive in enumerate(is_positive)
        if positive or draw() < neg_keep_frac
    ]

    return torch.utils.data.Subset(dataset, keep_indices)

# Build the cache file name; `evaluate` keeps the value set in the feature-conversion cell.
model_name_or_path = 'roberta-base'
max_seq_length = 512
cache_dir = ''                # empty string -> file is written to the working directory

subset_cached_features_file = os.path.join(
    cache_dir,
    "balanced_subset_cached_{}_{}_{}".format(
        "train" if not evaluate else "dev",
        [part for part in model_name_or_path.split("/") if part][-1],
        str(max_seq_length),
    ),
)

# Subsample the negatives, then store the result under the "dataset" key, which is
# the key the loading cells near the top of the notebook read back.
b_dataset = get_balanced_dataset(dataset)
torch.save({"dataset": b_dataset}, subset_cached_features_file)

# Dataset

In [None]:
# !wget -O CUAD_v1.zip https://zenodo.org/record/4595826/files/CUAD_v1.zip

In [None]:
# CUAD_v1.json       full_contract_pdf  label_group_xlsx
# CUAD_v1_README.txt full_contract_txt  master_clauses.csv

# import zipfile
# with zipfile.ZipFile('CUAD_v1.zip') as zfp:
#     zfp.extractall('./')