In [1]:
import os
import importlib
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, Trainer, TrainingArguments
import torch
from peft import LoraConfig, get_peft_model
from datasets import Dataset
from sklearn.metrics import f1_score
from statistics import mean
from tqdm import tqdm
import json
import re

import prompts
import evaluate


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Experiment selection -------------------------------------------------
MODEL = 'mistral'    # key into `model_mapping` below
DATA = 'multicite'   # one of: acl_arc, finecite, multicite
SCHEMA = 'XML'       # one of: XML, JSON1, JSON2

# Upper bound on sequence length used when batching samples.
max_seq_length = 1024

# Short model name -> checkpoint identifier passed to `from_pretrained`.
model_mapping = {
    "mistral": 'mistralai/Mistral-7B-Instruct-v0.3',
}
model_id = model_mapping[MODEL]

# --- Paths derived from the selection above -------------------------------
INPUT = f'./data/{DATA}/{MODEL}/{SCHEMA}/'
OUTPUT = f'./output/{DATA}/{MODEL}/{SCHEMA}/'

# --- Compute device: GPU when one is visible, CPU otherwise ---------------
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

In [3]:
# --- Tokenizer ------------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the end-of-sequence token id as the padding token id.
tokenizer.pad_token_id = tokenizer.eos_token_id

# --- Base model, loaded in 4-bit via bitsandbytes -------------------------
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_storage=torch.bfloat16,
)

LMmodel = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map='auto',
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
)
# Make the embedding matrix size agree with the tokenizer length.
LMmodel.resize_token_embeddings(len(tokenizer))

# --- LoRA adapter on the attention and MLP projection layers --------------
lora_target_modules = [
    "v_proj", "q_proj", "up_proj", "o_proj", "k_proj", "down_proj", "gate_proj",
]
peft_config = LoraConfig(
    r=4,
    lora_alpha=32,
    lora_dropout=0.1,
    inference_mode=False,
    target_modules=lora_target_modules,
)

LMmodel = get_peft_model(LMmodel, peft_config)

# Report how many parameters the adapter leaves trainable.
LMmodel.print_trainable_parameters()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
def _load_json(path):
    """Parse one JSON file, always decoding it as UTF-8.

    Passing the encoding explicitly keeps the result independent of the
    platform's default locale encoding.
    """
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)


def _to_training_sample(row):
    """Build one tokenized training/validation sample from a raw record."""
    return prompt.create_sample(row['input'], row['output'])


# load data
train_data = _load_json(INPUT + 'train.json')
test_data = _load_json(INPUT + 'test.json')

# Convert the lists of records to Datasets.
# NOTE(review): only the first 20 test records are kept, so `dev_ds` and
# `eval_ds` below are both built from this 20-record slice -- confirm this
# is intended and not a leftover debugging limit.
train_ds = Dataset.from_list(train_data)
test_ds = Dataset.from_list(test_data[:20])

# initialize prompt class
prompt = prompts.PromptForAutoCCA(tokenizer, DATA, SCHEMA)

# Apply the sample-building function to the datasets.
# The raw columns are dropped for the two datasets handed to the Trainer.
train_ds = train_ds.map(
    _to_training_sample,
    batched=False,
    remove_columns=train_ds.column_names
)

dev_ds = test_ds.map(
    _to_training_sample,
    batched=False,
    remove_columns=test_ds.column_names
)

# Generation-time samples; the original columns are kept because the
# evaluation cell reads eval_ds['output'].
eval_ds = test_ds.map(
    lambda row: prompt.create_sample(row['input'], row['output'], for_generation=True),
    batched=False,
)


Map:   0%|          | 0/844 [00:00<?, ? examples/s]

Map: 100%|██████████| 844/844 [00:00<00:00, 1026.92 examples/s]
Map: 100%|██████████| 20/20 [00:00<00:00, 894.23 examples/s]
Map: 100%|██████████| 20/20 [00:00<00:00, 1363.89 examples/s]


In [None]:
# Define Data Collator
class CustomDataCollator:
    """Pad tokenized samples into a batch and build answer-only loss labels.

    ``labels`` is a copy of ``input_ids`` in which every position that should
    not contribute to the loss is set to ``IGNORE_INDEX`` (-100, the default
    ``ignore_index`` of PyTorch's cross-entropy loss):

    * padding positions (``attention_mask == 0``), on either side, and
    * the prompt, i.e. every real token up to and including the last
      ``"[/INST]"`` marker.

    Args:
        tokenizer: object providing ``pad``, ``decode`` and ``__call__``.
        padding: padding strategy forwarded to ``tokenizer.pad``.
        max_length: maximum length forwarded to ``tokenizer.pad``.
    """

    IGNORE_INDEX = -100
    PROMPT_END = "[/INST]"

    def __init__(self, tokenizer, padding, max_length):
        self.tokenizer = tokenizer
        self.padding = padding
        self.max_length = max_length

    def __call__(self, features):
        """Collate ``features`` into a padded batch with a ``labels`` entry.

        Args:
            features: list of tokenized samples accepted by ``tokenizer.pad``
                (presumably dicts with ``input_ids`` / ``attention_mask`` --
                confirm against ``prompt.create_sample``).

        Returns:
            The padded batch, with ``batch["labels"]`` added.
        """
        import warnings  # local import: keeps this cell self-contained

        batch = self.tokenizer.pad(
            features,
            padding=self.padding,
            max_length=self.max_length,
            return_tensors='pt',
        )
        input_ids = batch["input_ids"]
        labels = input_ids.clone()
        attention_mask = batch["attention_mask"] if "attention_mask" in batch else None
        seq_len = input_ids.shape[1]

        # Compute loss mask so that only output tokens are trained on
        for i in range(input_ids.shape[0]):
            # [start, stop) is the span of real (non-padding) tokens in row i.
            start, stop = 0, seq_len
            if attention_mask is not None:
                real_positions = attention_mask[i].nonzero(as_tuple=True)[0]
                if real_positions.numel() == 0:
                    labels[i, :] = self.IGNORE_INDEX
                    continue
                start = int(real_positions[0])
                stop = int(real_positions[-1]) + 1
                # The pad id equals the eos id in this notebook, so padding
                # must be identified through the attention mask, not the id.
                labels[i, :start] = self.IGNORE_INDEX
                labels[i, stop:] = self.IGNORE_INDEX

            # Decode the real tokens, skipping the first one (assumed to be
            # the BOS token, which the re-tokenization below adds back).
            text_content = self.tokenizer.decode(input_ids[i, start + 1:stop])

            # Locate the end of the prompt
            marker_pos = text_content.rfind(self.PROMPT_END)
            if marker_pos < 0:
                # Without the marker there is no way to tell prompt from
                # answer; leave the real tokens unmasked and say so.
                warnings.warn(
                    f"Sample {i} contains no '{self.PROMPT_END}' marker; "
                    "its prompt tokens are not masked from the loss."
                )
                continue
            prompt_text = text_content[:marker_pos + len(self.PROMPT_END)]

            # Re-tokenize the prompt to measure its length in tokens
            prompt_text_tokenized = self.tokenizer(
                prompt_text,
                return_overflowing_tokens=False,
                return_length=False,
            )
            prompt_text_len = len(prompt_text_tokenized['input_ids'])

            # Mask the prompt, offset by any left padding and clamped to the
            # real-token span so an over-long count cannot raise.
            labels[i, start:min(start + prompt_text_len, stop)] = self.IGNORE_INDEX

        batch["labels"] = labels
        return batch

# Build the collator used by the Trainer: each batch is padded to its
# longest sample, with max_seq_length forwarded as the maximum length.
data_collator = CustomDataCollator(tokenizer, "longest", max_seq_length)

In [None]:
# Configure the training run and build the Trainer.
training_arguments = TrainingArguments(
    # output locations and logging
    output_dir=OUTPUT,
    logging_dir="./logs",
    logging_steps=50,
    # optimisation schedule
    num_train_epochs=3,
    learning_rate=1e-4,
    warmup_ratio=0.1,
    weight_decay=0.01,
    # one sample per device per step, for both training and evaluation
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    # evaluate and checkpoint once per epoch, keep at most two checkpoints,
    # and restore the best one when training finishes
    eval_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=2,
    load_best_model_at_end=True,
    # name of the batch entry that holds the labels
    label_names=['labels'],
)

trainer = Trainer(
    model=LMmodel,
    args=training_arguments,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_ds,
    eval_dataset=dev_ds,
)

In [None]:
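# Training is currently disabled, so the evaluation below runs on the model
# without fine-tuning. Uncomment the next line to train first.
#trainer.train()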

In [None]:
# Reload the `evaluate` module so that edits made to its source are picked up
# without restarting the kernel (`importlib` is imported in the first cell).
importlib.reload(evaluate)

# Run generation-based evaluation with the Trainer's model on the
# generation-formatted samples; the 'output' column is passed as `test_data`.
evaluator = evaluate.Evaluator(trainer.model, tokenizer, eval_ds, DATA, SCHEMA, DEVICE)
evaluator.evaluate(test_data=eval_ds['output'])

['There is a person sitting on a horse. he is holding a horse thread and he is wearing a cap. there are flags, board on the left side. we can see in the background sky, trees. Contrastive Learning Recently, contrastive learning has been widely studied in unsupervised representation learning for vision, #REF , language #REF , or multi-modal #REF . The goal is to learn semantic representation between two views by allowing the positive sample to be similar (in semantic space) and negatives to be dissimilar semantically simultaneously. CLIP #REF and MIL-NCE #TARGET_REF has demonstrated the effectiveness for learning the semantic mapping between vision and language. Previous attempts mainly exploit the InfoNCE #REF objective to maximize a lower bound of the mutual information. This paper extends the multimodal contrastive learning between the trace in the image and captioning sentence. In the same image, they correspond to each other semantically. This motivates us to design a contrastive l

In [None]:
# Scratch check: the pattern captures the name of an opening tag, i.e. the
# shortest run of non-'/' characters between '<' and '>'.
re.match(r'<([^\/]+?)>', '<TAG>')[1]

'TAG'