In [1]:

import os
# Select the default GPU before torch is imported, but keep a CUDA_VISIBLE_DEVICES
# value that is already exported (e.g. by a job scheduler) instead of overriding it.
os.environ.setdefault('CUDA_VISIBLE_DEVICES', '2')
import json
import re
from datetime import datetime
import sys
import argparse

import torch
from torch.utils.data import DataLoader
from dotenv import load_dotenv

# Read machine-specific locations from the local .env file / process environment.
# Each value is None when the corresponding variable is not defined.
load_dotenv()
DATA_DIR = os.getenv('DATA_DIR')
CACHE_DIR = os.getenv('CACHE_DIR')
OUT_DIR = os.getenv('OUT_DIR')
FINECITE_PATH = os.getenv('FINECITE_PATH')
# Make the finecite package importable from a source checkout. Skip the append when
# the variable is unset so that None never ends up as an entry in sys.path.
if FINECITE_PATH and FINECITE_PATH not in sys.path:
    sys.path.append(FINECITE_PATH)

from finecite.utils import set_seed, get_class_weights
from finecite.data_processing import load_processor
from finecite.model import CustomTrainer, ExtractionModel, ClassificationModel, load_classifier, load_tokenizer_embedding_model, MODEL_DESCRIPTION

In [None]:
# Inside the notebook no command-line flags are parsed: start from an empty
# namespace and populate it with the tunable run settings below.
parser = argparse.ArgumentParser(description='Seq_tagger parser')
args = parser.parse_args([])

run_config = {
    # encoder and tagging head
    'model_name': 'scibert',      # scibert llm2vec_mistral llm2vec_llama3
    'ext_type': 'bilstm_crf',     # linear, bilstm, crf, bilstm_crf
    'iob_labels': False,

    # optimisation
    'batch_size': 4,
    'learning_rate': 3e-05,
    'crf_learning_rate': 0.005,
    'dropout': 0.1,

    # run control
    'save_model': True,
    'debug': True,
    'debug_size': 100,
    'seed': 4455,
}
# Insertion order is preserved, so vars(args) lists the options in the order above.
for option, value in run_config.items():
    setattr(args, option, value)

In [None]:
# Fixed task settings and values derived from the base configuration.
args.dataset = 'finecite'
args.task = 'ext'

args.max_epochs = 20
# NOTE(review): presumably the early-stopping patience; the attribute name is kept
# as-is because code outside this notebook may read args.patients — confirm.
args.patients = 5
args.adam_epsilon = 1e-08
args.weight_decay = 0.0
args.use_prompt = 'llm2vec' in args.model_name
args.dtype = torch.float32

# model description
args.model_desc = MODEL_DESCRIPTION[args.model_name]
args.max_len = args.model_desc['max_len']

# input directory
args.input_dir = f'{DATA_DIR}/{args.dataset}/'

# output directory
# Only the run-specific sub-path is sanitized. OUT_DIR is used verbatim so that a base
# directory containing '.', '-', ':' or whitespace (e.g. './out' or 'C:/runs') is not
# rewritten into a different location. Debug runs go below '_debug/' and get a timestamp
# suffix; non-debug runs always map to the same directory for a given configuration.
run_subdir = re.sub(r'[.:*?"<>|\s-]', '_', (
    f"{'_debug/' if args.debug else ''}"
    f"{args.dataset}/{args.model_name}/"
    f"{args.ext_type}"
    f"{'__' + datetime.now().strftime('%m_%d_%H_%M_%S') if args.debug else ''}/"
))
args.output_dir = f"{OUT_DIR}/{run_subdir}"
# output_dir ends with '/', so plain concatenation yields a file inside it
args.model_output_file = args.output_dir + 'safetensors.pt'

os.makedirs(args.output_dir, exist_ok=True)

# model cache dir (only set for the llm2vec models)
if 'llm2vec' in args.model_name:
    args.base_model_dir =  f'{CACHE_DIR}/{args.model_name}/'
    os.makedirs(args.base_model_dir, exist_ok=True)

set_seed(args.seed)
args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
#load model and tokenizer
# The helper (imported from finecite.model) receives the full run configuration in
# `args` and returns the tokenizer together with the embedding model.
print('loading model embedding model...')
tokenizer, embedding_model = load_tokenizer_embedding_model(args)

loading model embedding model...


loading scibert model...


In [5]:
#load data processor
processor = load_processor(args, tokenizer)

# load data
train_data = processor.read_data('train')
test_data = processor.read_data('test')

# create dataset
# create_features returns (dataset, label weights, number of labels); only the
# statistics computed on the training split are stored on args.
train_ds, weights, num_labels = processor.create_features(train_data)
test_ds, _ , _ = processor.create_features(test_data)
args.label_weights = weights
args.num_labels = num_labels

print(len(train_ds), len(test_ds))

#Dataloader
train_dataloader = DataLoader(train_ds, shuffle=True, batch_size=args.batch_size, num_workers=0)
# Evaluation data is iterated in a fixed order; shuffling it has no benefit and
# makes per-batch evaluation output vary between runs.
val_dataloader = DataLoader(test_ds, shuffle=False, batch_size=args.batch_size, num_workers=0)

# Total number of training batches over all epochs. len(train_dataloader) includes the
# final partial batch, which int(len(train_data) / batch_size) dropped.
# NOTE(review): assumes one optimizer/scheduler step per batch — confirm in CustomTrainer.
args.num_training_steps = len(train_dataloader) * args.max_epochs

100 97


In [6]:
# Instantiate the extraction model (class imported from finecite.model) from the run
# configuration in `args` and the embedding model loaded earlier.
print('loading extraction model...')
ext_model = ExtractionModel(args, embedding_model)

loading extraction model...


In [7]:
# log model setup
print('Logging run_setup')
def is_json_serializable(value):
    """Return True if ``json.dumps(value)`` succeeds, False otherwise.

    Args:
        value: Any Python object.

    Returns:
        bool: False when serialization fails because of an unsupported type
        (TypeError), an out-of-range number (OverflowError), a circular
        reference (ValueError) or excessively deep nesting (RecursionError).
    """
    try:
        json.dumps(value)
    except (TypeError, OverflowError, ValueError, RecursionError):
        return False
    return True

# Persist every JSON-serializable option of the run configuration; entries that
# cannot be serialized are skipped.
args_dict = vars(args)
filtered_args = {k: v for k, v in args_dict.items() if is_json_serializable(v)}
with open(os.path.join(args.output_dir, 'run_setup.json'), 'w') as f_out:
    json.dump(filtered_args, f_out, indent=4)

# log input sample
print('Logging input sample')
# Log up to three examples; the bound avoids an IndexError on very small datasets.
num_logged_samples = min(3, len(train_ds))
input_sample = [tokenizer.convert_ids_to_tokens(ids=train_ds[i]['input_ids']) for i in range(num_logged_samples)]

with open(os.path.join(args.output_dir, 'input_sample.json'), 'w') as f_out:
    json.dump(input_sample, f_out, indent=4)

# print sample text
first_example = train_ds[0]
sample_text = tokenizer.convert_ids_to_tokens(ids=first_example['input_ids'])
print(f'First example input text: {sample_text}')
#print number of predicting targets
# Kept as lists (not sets): membership is decided with ==, which also works if the
# ids are tensor elements rather than plain ints.
structural_token_ids = [tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id]
num_pred_targets = len([token for token in first_example['input_ids'] if token not in structural_token_ids])
print(f'Num pred targets (cls: {tokenizer.cls_token}, sep: {tokenizer.sep_token}, pad: {tokenizer.pad_token}): {num_pred_targets}')
#print special token in example
# Read the id list once instead of re-evaluating the tokenizer attribute per token.
additional_special_ids = tokenizer.additional_special_tokens_ids
special_token_ids = [token for token in first_example['input_ids'] if token in additional_special_ids]
print(f'Special tokens in input: {tokenizer.convert_ids_to_tokens(ids=special_token_ids)}')
#print labels in example
labels = first_example['token_labels'].tolist()
print(f'Labels {labels}')

Logging run_setup
Logging input sample
First example input text: ['[CLS]', 'neural', 'machine', 'translation', '(', 'nm', '##t', ')', 'has', 'opened', 'several', 'research', 'directions', 'to', 'exploit', 'as', 'many', 'and', 'diverse', 'data', 'as', 'possible', '.', 'massive', 'multil', '##ingual', 'nm', '##t', 'models', ',', 'for', 'instance', ',', 'take', 'advantage', 'of', 'several', 'language', '-', 'pair', 'datasets', 'in', 'a', 'single', 'system', '<', 'target', '_', 'citation', '/', '>', 'this', 'offers', 'several', 'advantages', ',', 'such', 'as', 'a', 'simple', 'training', 'process', 'and', 'enhanced', 'performance', 'of', 'the', 'language', '-', 'pairs', 'with', 'little', 'data', '(', 'although', 'sometimes', 'detrimental', 'to', 'the', 'high', '-', 'resource', 'language', '-', 'pairs', ')', '.', 'however', ',', 'massive', 'models', 'of', 'do', '##zens', 'of', 'languages', 'are', 'not', 'necessarily', 'the', 'best', 'outcome', ',', 'as', 'it', 'is', 'demonstrated', 'that', '

In [8]:
# start training
# CustomTrainer (imported from finecite.model) receives the run configuration, the
# extraction model, the tokenizer and both dataloaders. The loader built from the
# test split (val_dataloader) is the one passed as test_dataloader.
trainer = CustomTrainer(
    args=args,
    model=ext_model,
    tokenizer=tokenizer,
    train_dataloader=train_dataloader,
    test_dataloader=val_dataloader,
)
trainer.train()


configuring optimizer...

starting training...


{'epoch': 1, 'loss': 2.073, 'acc': [0.579, 0.584], 'macro_f1': 0.0, 'total_f1': 0.0, 'inf_f1': 0.0, 'perc_f1': 0.0, 'back_f1': 0.0}
{'epoch': 2, 'loss': 2.05, 'acc': [0.579, 0.584], 'macro_f1': 0.0, 'total_f1': 0.0, 'inf_f1': 0.0, 'perc_f1': 0.0, 'back_f1': 0.0}
{'epoch': 3, 'loss': 1.838, 'acc': [0.564, 0.563], 'macro_f1': 0.192, 'total_f1': 0.57, 'inf_f1': 0.238, 'perc_f1': 0.157, 'back_f1': 0.18}
{'current_epoch': 4, 'current_step': 100, 'avg_loss': 1.55439, 'max_loss': 2.45031, 'min_loss': 0.77334}
{'epoch': 4, 'loss': 1.569, 'acc': [0.66, 0.662], 'macro_f1': 0.249, 'total_f1': 0.606, 'inf_f1': 0.581, 'perc_f1': 0.09, 'back_f1': 0.076}
{'epoch': 5, 'loss': 1.536, 'acc': [0.664, 0.66], 'macro_f1': 0.268, 'total_f1': 0.662, 'inf_f1': 0.59, 'perc_f1': 0.121, 'back_f1': 0.094}
{'epoch': 6, 'loss': 1.508, 'acc': [0.655, 0.645], 'macro_f1': 0.352, 'total_f1': 0.731, 'inf_f1': 0.622, 'perc_f1': 0.245, 'back_f1': 0.19}
{'epoch': 7, 'loss': 1.494, 'acc': [0.677, 0.669], 'macro_f1': 0.337, '