In [None]:

import os
os.environ['CUDA_VISIBLE_DEVICES']='2'
import json
from datetime import datetime
import sys
import argparse

import torch
from torch.utils.data import DataLoader
from dotenv import load_dotenv

load_dotenv()
FINECITE_PATH = os.getenv('FINECITE_PATH')
if FINECITE_PATH not in sys.path:
    sys.path.append('/home/dataconv/deallab/lasse/CCE')

from finecite import set_seed, DATA_DIR, get_class_weights, CustomDataset, load_processor
from finecite.model import MODEL_DESCRIPTION
from finecite.model import CCAModule
#fix sample extraction (only possible to extract one)

ModuleNotFoundError: No module named 'finecite'

In [2]:
def main():
    
    parser = argparse.ArgumentParser(description='Seq_tagger parser')

    #input arguments
    parser.add_argument('--model_name', required=True, help='scibert llm2vec_mistral llm2vec_llama3')
    parser.add_argument('--dataset', required=True, help='acl-arc, act2, scicite, multicite, finecite')
    parser.add_argument('--task', required=True, help='ext cls')
    parser.add_argument('--ext_type', required=True, help='linear, bilstm, crf, bilstm_crf')
    parser.add_argument('--cls_type', default= 'balanced', help='weighted, balanced, linear, auto_wighted')
    parser.add_argument('--cls_weights', type=list[int], default=None)
    parser.add_argument('--heal_token', default=None)
    

    parser.add_argument('--batch_size', type=int, default=4, help='')
    parser.add_argument('--learning_rate', type=float, default=2e-05, help='')
    parser.add_argument('--dropout', type=float, default=0.0, help='')

    parser.add_argument('--save_model', type=bool, defualt=True, help='')
    parser.add_argument('--debug', action='store_true', help='')
    parser.add_argument('--debug_size', type=int, default=5, help='')
    parser.add_argument("--seed", type=int, default=4455, help='')
    args = parser.parse_args()

In [3]:
parser = argparse.ArgumentParser(description='Seq_tagger parser')
args = parser.parse_args([])
args.model_name = 'scibert' # scibert llm2vec_mistral llm2vec_llama3 modern_bert# scibert 4, 2e-05, 0.1
args.dataset = 'd-act2' # 'acl-arc', 'act2', 'scicite', 'multicite', finecite
args.task = 'cls' # 'ext', 'cls'
args.ext_type = 'linear' # linear, bilstm, crf, bilstm_crf
args.cls_type = 'linear' # weighted, balanced, linear, 'auto_wighted total #best is total and linear, auto weight shows importance of scopes
args.cls_weights = [1, 2, 0]
args.heal_token = 'word' # phrase, word

args.batch_size = 4
args.learning_rate = 2e-05
args.crf_learning_rate = 0.005
args.dropout = 0.1

args.save_model = False
args.cached_data = True
args.debug = True
args.debug_size = None
args.seed = 4455

In [4]:
#static arguments
args.max_epochs = 20
args.patients = 3
args.adam_epsilon = 1e-08
args.weight_decay = 0.0
args.use_prompt = None#args.model_name != 'scibert'
args.max_len = 512 if args.model_name == 'scibert' else 740
args.dtype = torch.float32

# model description
args.model_desc = MODEL_DESCRIPTION[args.model_name]

# input directory
args.input_dir = f'{DATA_DIR}/model_training/{args.dataset}/'

# output directory
if args.debug:
    args.output_dir = f"./output/_debug/{args.dataset}/{args.model_name}/{args.ext_type}_{args.batch_size}_{args.learning_rate}_{args.dropout}_{datetime.now().strftime('%m_%d_%H_%M_%S')}/"
else: 
    args.output_dir = f"./output/{args.dataset}/{args.model_name}/{args.ext_type}_{args.batch_size}_{args.learning_rate}_{args.dropout}_{datetime.now().strftime('%m_%d_%H_%M_%S')}/"
os.makedirs(args.output_dir, exist_ok=True)
if args.save_model:
    args.model_output_dir = f"{DATA_DIR}/model_training/output/{args.dataset}/{args.model_name}/{args.batch_size}_{args.learning_rate}_{args.dropout}_{datetime.now().strftime('%m_%d_%H_%M_%S')}/"
    os.makedirs(args.model_output_dir, exist_ok=True)

# model cache dir
if args.model_name != 'scibert':
    args.base_model_dir =  f'{DATA_DIR}/model_training/llm2vec_models/{args.model_name}/'
    os.makedirs(args.base_model_dir, exist_ok=True)
    
#data cache dir
if args.task == 'cls':
    args.chache_dir =  f'{DATA_DIR}/.cache/{args.dataset}/'
    os.makedirs(args.chache_dir, exist_ok=True)

# model input dir
args.trained_model_dir = f'{DATA_DIR}/model_training/output/finecite/{args.model_name}/{args.ext_type}'  #if args.dataset != 'finecite' else None

set_seed(args.seed)
args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
#load model and tokenizer
print('loading model...')
model = CCAModel(args)
model.load_pretrained()
tokenizer = model.tokenizer
print('done')

loading model...
loading scibert model from /raid/deallab/CCE_Data/model_training/output/finecite/scibert/linear
done


In [6]:
#load data processor
processor = load_processor(args)
dataset = CustomDataset(args, tokenizer)

# load data
train_data = processor.load_train_data()
test_data = processor.load_test_data()

# create dataset
train_ds, weights1, num_labels = dataset.load_data(train_data)
test_ds, weights2, num_labels = dataset.load_data(test_data)
weights = [(w1 * len(train_ds) + w2 * len(test_ds)) / (len(train_ds) + len(test_ds)) for w1, w2 in zip(weights1, weights2)]
num_training_steps = int(len(train_data) / args.batch_size) * args.max_epochs
model.configurate(weights, num_labels, num_training_steps)

# add context labels if cls
if args.task == 'cls':
    train_file = f'{args.model_name}_{args.ext_type}_{args.heal_token}_train.pt'
    test_file = f'{args.model_name}_{args.ext_type}_{args.heal_token}_test.pt'
    # check cached examples
    if args.cached_data and train_file in os.listdir(args.chache_dir) and test_file in os.listdir(args.chache_dir):
        train_ds = torch.load(os.path.join(args.chache_dir, train_file), weights_only=False)
        test_ds = torch.load(os.path.join(args.chache_dir, test_file), weights_only=False)
    else:
    
        # dataloader
        train_dl = DataLoader(train_ds, batch_size = args.batch_size, shuffle=False)
        test_dl = DataLoader(test_ds, batch_size = args.batch_size, shuffle=False)
        
        #extract context
        train_lbls = model.extract(train_dl)
        test_lbls = model.extract(test_dl)
        
        #add context to dataset
        train_ds = dataset.add_context_lbls(train_ds, train_lbls)
        test_ds = dataset.add_context_lbls(test_ds, test_lbls)
        
        #cache data
        torch.save(train_ds, os.path.join(args.chache_dir, train_file))
        torch.save(test_ds, os.path.join(args.chache_dir, test_file))
        
    #reload model from_pretrained
    model.reload()

print(len(train_ds), len(test_ds))

#Dataloader
train_dataloader = DataLoader(train_ds, shuffle=True, batch_size=args.batch_size, num_workers=0) 
val_dataloader =  DataLoader(test_ds, shuffle=True, batch_size=args.batch_size, num_workers=0)

[0.33646112600536193, 1.5987261146496816, 2.9186046511627906, 11.952380952380953, 1.5886075949367089, 0.742603550295858]
[0.29733163913595934, 1.3371428571428572, 2.3168316831683167, 7.8, 1.786259541984733, 1.3]
adding classifier...
configuring optimizer...
loading scibert model...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


adding classifier...
configuring optimizer...
1506 1404


In [7]:
# log model setup
print(f'Logging run_setup')
print_args = {k: str(v) for k,v in vars(args).items()}
with open(os.path.join(args.output_dir, f'run_setup.json'), 'w') as f_out:
    json.dump(print_args, f_out, indent=4)

# log imput sample
print(f'Logging input sample')
input_sample = [tokenizer.convert_ids_to_tokens(ids=train_ds[i]['input_ids']) for i in range(3)]

with open(os.path.join(args.output_dir, f'input_sample.json'), 'w') as f_out:
    json.dump(input_sample, f_out, indent=4)
    
# print sample text
first_example = train_ds[0]
sample_text = tokenizer.convert_ids_to_tokens(ids=first_example['input_ids'])
print(f'First example input text: {sample_text}')
#print number of predicting targets
num_pred_targets = len([token for token in first_example['input_ids'] if token not in [tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id]])
print(f'Num pred targets (cls: {tokenizer.cls_token}, sep: {tokenizer.sep_token}, pad: {tokenizer.pad_token}): {num_pred_targets}')
#print special token in example
special_token_ids = [token for token in first_example['input_ids'] if token in tokenizer.additional_special_tokens_ids]
print(f'Special tokens in input: {tokenizer.convert_ids_to_tokens(ids=special_token_ids)}')
#print labels in example
labels = first_example['tok_lbl'].tolist()
print(f'Labels {labels}')
if 'cls' in args.task:
    labels = first_example['int_lbl'].tolist()
    print(f'Labels {labels}')

Logging run_setup
Logging input sample
First example input text: ['[CLS]', 'an', 'important', 'issue', 'is', 'how', 'the', 'vortex', 'shedding', 'is', 'modified', 'when', 'a', 'magnetic', 'field', 'is', 'present', '.', 'full', '##scale', 'numerical', 'simulations', 'of', 'emerging', 'magnetic', 'flux', 'tubes', 'have', 'demonstrated', 'the', 'importance', 'of', 'this', 'effect', 'and', 'the', 'oscillating', '(', '"', 'zig', '##za', '##ging', '"', ')', 'trajectory', 'of', 'the', 'tube', '<', 'citation', '/', '>', '.', '<', 'citation', '/', '>', 'simulated', 'the', 'interaction', 'of', 'a', 'cylinder', 'with', 'a', 'magnet', '##ised', 'flow', '.', 'it', 'was', 'found', 'that', 'for', 'the', 'field', 'parallel', 'to', 'the', 'axis', 'of', 'the', 'cylinder', ',', 'the', 'stro', '##uh', '##al', 'number', 'was', 'about', '0', '.', '2', '.', 'thus', ',', 'it', 'can', 'be', 'taken', 'that', ',', 'at', 'least', 'in', 'the', 'case', 'when', 'the', 'external', 'magnetic', 'field', '(', 'b', '0', 

In [8]:
# start training
print('\nstarting training...')
model.to(args.device)
args.best_value = 0
args.best_value_epoch = 0
best_val_res = {}
best_output_samples = ''

for epoch in range(args.max_epochs):
    model.epoch()
    model.train_epoch(train_dataloader)
    val_metric, val_res, output_samples = model.evaluate(val_dataloader)
    if val_metric > args.best_value:
        args.best_value = val_metric
        args.best_value_epoch = epoch
        best_val_res = val_res
        best_output_samples = output_samples
        if args.save_model:
            model.save_pretrained(args.model_output_dir)
    else:
        if epoch >= args.best_value_epoch + args.patients:
            break
        
print(f'Logging validation scores for best epoch')
with open(os.path.join(args.output_dir, f'best_scores.json'), 'w') as f_out:
        json.dump(best_val_res, f_out, indent=4)
        
print(f'Logging best output samples')
with open(os.path.join(args.output_dir, f'output_samples.txt'), 'w') as f_out:
        f_out.write(best_output_samples)




starting training...
{'current_epoch': 1, 'current_step': 100, 'avg_loss': 1.81595, 'max_loss': 8.33996, 'min_loss': 0.55976}
{'current_epoch': 1, 'current_step': 200, 'avg_loss': 1.62481, 'max_loss': 5.70148, 'min_loss': 0.57726}
{'current_epoch': 1, 'current_step': 300, 'avg_loss': 1.91373, 'max_loss': 7.42978, 'min_loss': 0.58009}
{'epoch': 1, 'loss': 1.895, 'macro_f1': 0.087, 'micro_f1': 0.109, 'precision': [0.0, 0.167, 0.082, 0.0, 0.089, 0.23], 'recall': [0.0, 0.017, 0.109, 0.0, 0.71, 0.256], 'f1': [0.0, 0.031, 0.094, 0.0, 0.158, 0.242]}
{'current_epoch': 2, 'current_step': 400, 'avg_loss': 1.68692, 'max_loss': 4.64877, 'min_loss': 0.67063}
{'current_epoch': 2, 'current_step': 500, 'avg_loss': 1.68206, 'max_loss': 6.68931, 'min_loss': 0.57771}
{'current_epoch': 2, 'current_step': 600, 'avg_loss': 1.59016, 'max_loss': 6.74469, 'min_loss': 0.47892}
{'current_epoch': 2, 'current_step': 700, 'avg_loss': 1.57176, 'max_loss': 6.42027, 'min_loss': 0.57718}
{'epoch': 2, 'loss': 1.89, 'ma

In [9]:
# starting training...
# {'epoch': 1, 'loss1': [3.3438, 2.8906, 0.4453], 'loss2': [3.5312, 3.1094, 0.4277], 'acc': [0.316, 0.308], 'macro_f1': [0.181, 0.169], 'total_f1': [0.568, 0.545], 'inf_f1': [0.285, 0.239], 'perc_f1': [0.149, 0.158], 'backg_f1': [0.11, 0.111], 'inf_emb': [0.23828125, 0.23828125], 'perc_emb': [0.33984375, 0.33984375], 'back_emb': [0.265625, 0.36328125]}
# {'epoch': 2, 'loss1': [2.9688, 2.5781, 0.3809], 'loss2': [3.0312, 2.6719, 0.3633], 'acc': [0.406, 0.416], 'macro_f1': [0.216, 0.218], 'total_f1': [0.577, 0.565], 'inf_f1': [0.327, 0.321], 'perc_f1': [0.148, 0.172], 'backg_f1': [0.173, 0.162], 'inf_emb': [0.23046875, 0.23828125], 'perc_emb': [0.328125, 0.23046875], 'back_emb': [0.1875, 0.22265625]}
# {'epoch': 3, 'loss1': [2.9062, 2.5625, 0.3535], 'loss2': [2.875, 2.5469, 0.334], 'acc': [0.456, 0.476], 'macro_f1': [0.231, 0.248], 'total_f1': [0.578, 0.572], 'inf_f1': [0.358, 0.386], 'perc_f1': [0.192, 0.206], 'backg_f1': [0.144, 0.153], 'inf_emb': [0.1796875, 0.1953125], 'perc_emb': [1.0, 1.0], 'back_emb': [0.203125, 0.1875]}


#ACT2
# scibert - 'macro_f1': 0.2815, 'micro_f1': 0.4059,
# finecite scibert w pre_cls - 'macro_f1': 0.3, 'micro_f1': 0.421

# scibert + citance {'epoch': 8, 'loss': 1.876, 'macro_f1': 0.228, 'micro_f1': 0.342, 'precision': [0.558, 0.292, 0.03, 0.079, 0.191, 0.319], 'recall': [0.421, 0.067, 0.026, 0.538, 0.22, 0.576], 'f1': [0.48, 0.109, 0.028, 0.137, 0.205, 0.411]}
# pretrained scibert + citance {'epoch': 11, 'loss': 2.344, 'macro_f1': 0.257, 'micro_f1': 0.312, 'precision': [0.56, 0.344, 0.137, 0.2, 0.082, 0.282], 'recall': [0.311, 0.212, 0.333, 0.231, 0.119, 0.554], 'f1': [0.4, 0.262, 0.194, 0.214, 0.097, 0.374]}
# finecite scibert balanced (bs 8, ls 3e-05) {'epoch': 8, 'loss': 2.715, 'macro_f1': 0.299, 'micro_f1': 0.371, 'precision': [0.586, 0.344, 0.189, 0.444, 0.036, 0.3], 'recall': [0.388, 0.308, 0.179, 0.308, 0.034, 0.696], 'f1': [0.467, 0.325, 0.184, 0.364, 0.035, 0.42]}
