# Generate Examples
---


In [None]:
import os
import json

from tqdm import tqdm

from valerie.data import load_claims
from valerie.utils import get_logger
from valerie.modeling import SequenceClassificationModel
from valerie.modeling import SequenceClassificationExample

In [None]:
# Directory that holds the serialized examples for this experiment.
examples_dir = os.path.join("models/phase2", "single-claim-claimant-date")

# One JSON file per split, all located directly inside examples_dir.
train_examples_file, test_examples_file, trial_examples_file = (
    os.path.join(examples_dir, file_name)
    for file_name in (
        "train_examples_combined.json",
        "test_examples.json",
        "trial_examples.json",
    )
)

In [None]:
# Load the claims for each split, in train / test / trial order.
train_claims, test_claims, trial_claims = map(
    load_claims,
    (
        "data/combined/phase1-phase2/claims.json",
        "data/phase2/test-data/claims.json",
        "data/phase2/trial-data/claims.json",
    ),
)

In [None]:
def generate_examples(claims):
    """Build one SequenceClassificationExample per claim.

    Args:
        claims: mapping of claim id -> claim object. Each claim must expose
            the attributes ``claim``, ``claimant``, ``date`` and ``label``.

    Returns:
        A list of SequenceClassificationExample, one per entry of ``claims``
        and in the same iteration order. ``text_a`` is the claim text and
        ``text_b`` is ``"<claimant> <date>"``, where a missing claimant or
        date is replaced by the placeholder ``"no claimant"`` / ``"no date"``.
    """
    examples = []
    for k, claim in tqdm(claims.items(), desc="generating examples"):
        claimant = claim.claimant if claim.claimant else "no claimant"

        # Only the first whitespace-separated token of the date is kept.
        # A date that is non-empty but contains only whitespace yields no
        # tokens, so it is treated like a missing date instead of raising
        # IndexError on the [0] lookup.
        date_tokens = claim.date.split() if claim.date else []
        date = date_tokens[0] if date_tokens else "no date"

        examples.append(
            SequenceClassificationExample(
                guid=k,
                text_a=claim.claim,
                text_b=claimant + " " + date,
                label=claim.label,
            )
        )
    return examples

In [None]:
# Create the output directory (including parents). exist_ok=True allows this
# cell to be re-run after the directory has already been created; without it
# os.makedirs raises FileExistsError on the second run.
os.makedirs(examples_dir, exist_ok=True)

train_examples = generate_examples(train_claims)
test_examples = generate_examples(test_claims)
trial_examples = generate_examples(trial_claims)

# Serialize every split as a JSON list of the examples' attribute dicts.
for split_examples, split_file in (
    (train_examples, train_examples_file),
    (test_examples, test_examples_file),
    (trial_examples, trial_examples_file),
):
    with open(split_file, "w") as fo:
        json.dump([e.__dict__ for e in split_examples], fo, indent=2)

In [None]:
# Read the examples back from the JSON files written for each split.
train_examples, test_examples, trial_examples = (
    SequenceClassificationModel.load_examples(examples_file)
    for examples_file in (
        train_examples_file,
        test_examples_file,
        trial_examples_file,
    )
)

In [None]:
# Pretty-print the attributes of the first training example as a quick check.
print(json.dumps(vars(train_examples[0]), indent=2))

# Train
---

In [None]:
# _logger = get_logger()

In [None]:
# Run configuration.
pretrained_model_name_or_path = "roberta-large"
max_seq_length = 128
trail_num = 2  # appended to the run name below; presumably a run/trial counter
n_splits = 0
is_combined = True

# Components of the run name, in order: model name, optional "combined" tag,
# optional "<n>fold" tag (only when n_splits is non-zero), sequence length and
# trail_num.
props = [
    pretrained_model_name_or_path,
    *(["combined"] if is_combined else []),
    *([f"{n_splits}fold"] if n_splits else []),
    str(max_seq_length),
    str(trail_num),
]

# The run directory lives under examples_dir and is named after the joined
# run properties.
output_dir = os.path.join(examples_dir, "-".join(props))

# Refuse to reuse an existing run directory. An explicit exception is used
# instead of a bare assert so the guard still runs under `python -O` and
# reports which path is in the way.
if os.path.exists(output_dir):
    raise FileExistsError(f"output_dir already exists: {output_dir}")
print(output_dir)

In [None]:
# For every power-of-two value from 1 to 128, print the rounded quotient of
# the number of training examples by that value, and that quotient divided
# by 8 (labelled "8 gpus" in the output).
for i in (1 << exponent for exponent in range(8)):
    bn = round(len(train_examples) / i)
    print(f"{i}:\t{bn} / 8 gpus\t= {round(bn / 8)}")

In [None]:
# Paths of the example files, passed to training as `data_args`.
data_args = dict(
    train_examples_file=train_examples_file,
    test_examples_file=test_examples_file,
)
# Hyper-parameters passed to training as `training_args`.
training_args = dict(
    evaluate_during_training=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=1,
    learning_rate=5e-5,  # change this back to 2e-5 possibly
    weight_decay=0.00,
    adam_epsilon=1e-6,
    max_grad_norm=1.0,
    num_train_epochs=6,
    warmup_steps=100,
    logging_first_step=False,
    logging_steps=25,
    save_steps=1e9,
    save_total_limit=1,
    seed=42,
)
# Class names in id order; both label maps below are derived from this tuple
# so they cannot drift apart.
_LABEL_NAMES = ("false", "partly", "true")

# Model config overrides: id2label uses string ids as keys, label2id maps
# each name back to its integer id.
config_args = {
    "num_labels": len(_LABEL_NAMES),
    "id2label": {str(idx): name for idx, name in enumerate(_LABEL_NAMES)},
    "label2id": {name: idx for idx, name in enumerate(_LABEL_NAMES)},
}
# Tokenizer override: its maximum length is set to max_seq_length.
tokenizer_args = dict(model_max_length=max_seq_length)
# No extra model keyword arguments.
model_args = dict()

In [None]:
# Gather every keyword argument of the training call in one place.
train_kwargs = dict(
    output_dir=output_dir,
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    train_examples=train_examples,
    test_examples=test_examples,
    data_args=data_args,
    training_args=training_args,
    config_args=config_args,
    tokenizer_args=tokenizer_args,
    model_args=model_args,
    nproc=2,
)

# Train from the pretrained checkpoint; the call returns the model together
# with the train and test datasets.
model, train_dataset, test_dataset = SequenceClassificationModel.train_from_pretrained(
    **train_kwargs
)

# Eval
---

In [None]:
import collections

import numpy as np
from sklearn.metrics import classification_report

In [None]:
# NOTE(review): this checkpoint path points at a different experiment
# directory ("single-claim-claimant", "bert-base-cased") than the examples_dir
# and pretrained_model_name_or_path configured earlier in this notebook.
# Confirm this is the model that should be evaluated on these examples.
model = SequenceClassificationModel.from_pretrained(
    "models/phase2/single-claim-claimant/bert-base-cased-combined-128-2"
)

In [None]:
# test_dataset = model.create_dataset(test_examples)

### Test Data

In [None]:
# Run the model on the test dataset in batches of 8.
# NOTE(review): with the create_dataset line above commented out,
# `test_dataset` is the one returned by the training call; confirm it was
# built for the same model/tokenizer as the `model` currently loaded.
predict_output = model.predict(test_dataset, predict_batch_size=8)

In [None]:
# Pair each test example with its prediction row. zip stops at the shorter
# input, so the assert below compares the label count with the claim count.
_pairs = list(zip(test_examples, predict_output.predictions))
_labels = [example.label for example, _ in _pairs]
# The predicted class is the index of the highest value in each row.
_preds = [np.argmax(prob) for _, prob in _pairs]

assert len(_labels) == len(test_claims)
print(classification_report(_labels, _preds))

### Trial Data

In [None]:
# Build a dataset from the trial examples with the currently loaded model,
# then run prediction on it with a batch size of 1.
trial_dataset = model.create_dataset(trial_examples)
predict_output = model.predict(trial_dataset, predict_batch_size=1)

In [None]:
# Pair each trial example with its prediction row. zip stops at the shorter
# input, so the assert below compares the label count with the claim count.
_pairs = list(zip(trial_examples, predict_output.predictions))
_labels = [example.label for example, _ in _pairs]
# The predicted class is the index of the highest value in each row.
_preds = [np.argmax(prob) for _, prob in _pairs]

assert len(_labels) == len(trial_claims)
print(classification_report(_labels, _preds))