In [11]:
import sys
sys.path.append('..')

import time
from tqdm import tqdm

from torch.utils.data import DataLoader

from utils.utils import parse_args, get_reader, load_model, get_out_filename, get_tagset
from pytorch_lightning import seed_everything

In [3]:
def build_parser():
    import argparse
    p = argparse.ArgumentParser(description='Model configuration.', add_help=False)
    p.add_argument('--train', type=str, help='Path to the train data.', default=None)
    p.add_argument('--test', type=str, help='Path to the test data.', default=None)
    p.add_argument('--dev', type=str, help='Path to the dev data.', default=None)

    p.add_argument('--out_dir', type=str, help='Output directory.', default='.')
    p.add_argument('--iob_tagging', type=str, help='IOB tagging scheme', default='conll')

    p.add_argument('--max_instances', type=int, help='Maximum number of instances', default=1500)
    p.add_argument('--max_length', type=int, help='Maximum number of tokens per instance.', default=100)

    p.add_argument('--encoder_model', type=str, help='Pretrained encoder model to use', default='xlm-roberta-large')
    p.add_argument('--model', type=str, help='Model path.', default=None)
    p.add_argument('--model_name', type=str, help='Model name.', default=None)
    p.add_argument('--stage', type=str, help='Training stage', default='fit')
    p.add_argument('--prefix', type=str, help='Prefix for storing evaluation files.', default='test')

    p.add_argument('--batch_size', type=int, help='Batch size.', default=128)
    p.add_argument('--accum_grad_batches', type=int, help='Number of batches for accumulating gradients.', default=1)
    p.add_argument('--gpus', type=int, help='Number of GPUs.', default=1)
    p.add_argument('--cuda', type=str, help='Cuda Device', default='cuda:0')
    p.add_argument('--epochs', type=int, help='Number of epochs for training.', default=5)
    p.add_argument('--lr', type=float, help='Learning rate', default=1e-5)
    p.add_argument('--dropout', type=float, help='Dropout rate', default=0.1)

    return p

In [4]:
parser = build_parser()
sg = parser.parse_args([
    "--test", "../../data/dev/dev.txt", 
    "--out_dir", "../trained_model", 
    "--model_name", "roberta_squad2_final", 
    "--gpus", "1", 
    "--encoder_model", "deepset/roberta-base-squad2", 
    "--batch_size", "32", 
    "--model", "../trained_model/roberta_squad2_lr_2e-5/lightning_logs/version_0", 
    "--max_length", "200", 
])

In [6]:
timestamp = time.time()
sg = parse_args()
out_dir_path = sg.out_dir + '/' + sg.model_name

# load the dataset first
test_data = get_reader(
    file_path=sg.test, 
    target_vocab=get_tagset(sg.iob_tagging), 
    max_instances=sg.max_instances, 
    max_length=sg.max_length, 
    encoder_model=sg.encoder_model, 
)

model, model_file = load_model(
    sg.model, 
    tag_to_id=get_tagset(sg.iob_tagging), 
)
model = model.to(sg.cuda)
# use pytorch lightnings saver here.
eval_file = get_out_filename(sg.out_dir, model_file, prefix=sg.prefix, output_tags=True)

out_str = ''
test_dataloaders = DataLoader(
    test_data, batch_size=sg.batch_size, 
    collate_fn=model.collate_batch, 
    shuffle=False, 
    drop_last=False, 
)
index = 0
for batch in tqdm(test_dataloaders, total=len(test_dataloaders)):
    pred_tags = model.predict_tags(batch, device=sg.cuda)

    for pred_tag_inst in pred_tags:
        out_str += '\n'.join(pred_tag_inst)
        out_str += '\n\n\n'
    index += 1

usage: ipykernel_launcher.py [--train TRAIN] [--test TEST] [--dev DEV]
                             [--out_dir OUT_DIR] [--iob_tagging IOB_TAGGING]
                             [--max_instances MAX_INSTANCES]
                             [--max_length MAX_LENGTH]
                             [--encoder_model ENCODER_MODEL] [--model MODEL]
                             [--model_name MODEL_NAME] [--stage STAGE]
                             [--prefix PREFIX] [--batch_size BATCH_SIZE]
                             [--accum_grad_batches ACCUM_GRAD_BATCHES]
                             [--gpus GPUS] [--cuda CUDA] [--epochs EPOCHS]
                             [--lr LR] [--dropout DROPOUT]
ipykernel_launcher.py: error: unrecognized arguments: --ip=127.0.0.1 --stdin=9003 --control=9001 --hb=9000 --Session.signature_scheme="hmac-sha256" --Session.key=b"31c8f7f2-b83b-49f6-974f-dec5b7130a85" --shell=9002 --transport="tcp" --iopub=9004 --f=c:\Users\holaj\AppData\Roaming\jupyter\runtime\kernel-v2-360

SystemExit: 2

In [8]:
import pandas as pd
import numpy as np

In [12]:
dev_reader = get_reader(
    file_path='../../data/dev/dev.txt', 
    max_length=200, 
    target_vocab=get_tagset('conll'), 
    encoder_model=sg.encoder_model, 
)

2022-09-23 19:14:36 - INFO - reader - Reading file ../../data/dev/dev.txt
2022-09-23 19:14:37 - INFO - reader - Finished reading 99 instances from file ../../data/dev/dev.txt


In [10]:
pd.read_csv('../trained_model/test_base_roberta_squad2_lr_2e-5_timestamp_1663584495.1579309_final_full_output.tsv')

Unnamed: 0,O
0,O
1,O
2,O
3,O
4,O
...,...
11759,B-VAR
11760,O
11761,B-VAR
11762,O
