In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, Trainer, TrainingArguments
import torch
from peft import LoraConfig, get_peft_model
from datasets import Dataset
from sklearn.metrics import f1_score
from statistics import mean
from tqdm import tqdm
import json
import re

from prompts import PromptForAutoCCA
import evaluate


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Experiment selection ---------------------------------------------------
MODEL = 'mistral'
DATA = 'acl_arc'   # options: acl_arc, finecite, multicite
SCHEMA = 'JSON1'   # options: XML, JSON1, JSON2

# Length limit handed to the data collator as its `max_length`.
max_seq_length = 1024

# Short model key -> checkpoint identifier passed to `from_pretrained`.
model_mapping = {
    "mistral":'mistralai/Mistral-7B-Instruct-v0.3',
}
model_id = model_mapping[MODEL]

# --- Paths derived from the selection above ---------------------------------
INPUT = f'./data/{DATA}/{MODEL}/{SCHEMA}/'
OUTPUT = f'./output/{DATA}/{MODEL}/{SCHEMA}/'

# --- Compute device: CUDA when available, CPU otherwise ---------------------
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

In [3]:
# --- Tokenizer ---------------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token_id = tokenizer.eos_token_id

# --- Base model: 4-bit NF4 quantization with bfloat16 compute ----------------
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_storage=torch.bfloat16,
)

LMmodel = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map='auto',
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
)
# Keep the embedding matrix size in sync with the tokenizer's vocabulary size.
LMmodel.resize_token_embeddings(len(tokenizer))

# --- LoRA adapters on the listed projection modules --------------------------
peft_config = LoraConfig(
    r=4,
    lora_alpha=32,
    lora_dropout=0.1,
    inference_mode=False,
    target_modules=[
        "v_proj",
        "q_proj",
        "up_proj",
        "o_proj",
        "k_proj",
        "down_proj",
        "gate_proj",
    ],
)

LMmodel = get_peft_model(LMmodel, peft_config)

# Report how many parameters the adapters make trainable.
LMmodel.print_trainable_parameters()

Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00,  1.96s/it]


trainable params: 10,485,760 || all params: 7,258,509,312 || trainable%: 0.1445


In [4]:
# Load data
# Cap on the number of examples taken from each split. 5 keeps the notebook a
# quick smoke test; set to None to use the full splits (`lst[:None]` is the
# whole list).
MAX_SAMPLES = 5

# JSON is read explicitly as UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open(INPUT + 'train.json', 'r', encoding='utf-8') as file:
    train_data = json.load(file)

with open(INPUT + 'test.json', 'r', encoding='utf-8') as file:
    test_data = json.load(file)

# Convert the loaded lists of records to Datasets. The raw train split gets its
# own name so the mapped `train_ds` below never overwrites its own input.
train_raw_ds = Dataset.from_list(train_data[:MAX_SAMPLES])
test_ds = Dataset.from_list(test_data[:MAX_SAMPLES])

# Initialize prompt class
prompt = PromptForAutoCCA(tokenizer, DATA, SCHEMA)


def _to_training_sample(row):
    """Build one training/dev sample from a record's 'input' and 'output' fields."""
    return prompt.create_sample(row['input'], row['output'])


def _to_generation_sample(row):
    """Build one generation-time sample (same fields, `for_generation=True`)."""
    return prompt.create_sample(row['input'], row['output'], for_generation=True)


# Training set: only the columns produced by `create_sample` are kept.
train_ds = train_raw_ds.map(
    _to_training_sample,
    batched=False,
    remove_columns=train_raw_ds.column_names
)

# Dev set used by the Trainer: built like the training set, from the test split.
dev_ds = test_ds.map(
    _to_training_sample,
    batched=False,
    remove_columns=test_ds.column_names
)

# Generation set handed to the evaluator: the original columns are kept
# alongside whatever `create_sample` adds.
eval_ds = test_ds.map(
    _to_generation_sample,
    batched=False,
)


Map: 100%|██████████| 5/5 [00:00<00:00, 151.39 examples/s]
Map: 100%|██████████| 5/5 [00:00<00:00, 1026.76 examples/s]
Map: 100%|██████████| 5/5 [00:00<00:00, 830.98 examples/s]


In [5]:
# Define Data Collator
class CustomDataCollator:
    """Pad tokenized samples into a batch and build labels for the output part only.

    Labels are a copy of ``input_ids`` in which every position that should not
    contribute to the loss is set to -100:

    * all tokens up to and including the last ``[/INST]`` marker (the prompt),
    * all padding positions (where ``attention_mask`` is 0).

    Args:
        tokenizer: Tokenizer providing ``pad``, ``decode`` and ``__call__``.
        padding: Padding strategy forwarded to ``tokenizer.pad``.
        max_length: Maximum length forwarded to ``tokenizer.pad``.
    """

    def __init__(self, tokenizer, padding, max_length):
        self.tokenizer = tokenizer
        self.padding = padding
        self.max_length = max_length

    def __call__(self, features):
        """Collate ``features`` into a padded batch with a ``labels`` entry.

        Args:
            features: List of tokenized samples accepted by ``tokenizer.pad``.

        Returns:
            The padded batch with an added ``"labels"`` tensor.

        Raises:
            ValueError: If a sample contains no ``[/INST]`` marker, because the
                start of the output cannot be located and the sample would
                otherwise be trained on its prompt.
        """
        marker = "[/INST]"
        ignore_index = -100

        batch = self.tokenizer.pad(
            features,
            padding=self.padding,
            max_length=self.max_length,
            return_tensors='pt',
        )
        input_ids = batch["input_ids"]
        labels = input_ids.clone()

        # Compute loss mask so that only output tokens are supervised
        for i in range(input_ids.shape[0]):

            # Decode the sample without its first token; the tokenizer call
            # below is expected to prepend that token again, keeping the
            # lengths aligned.
            # NOTE(review): assumes each row starts with one BOS token - confirm.
            text_content = self.tokenizer.decode(input_ids[i][1:])

            # Locate the end of the prompt (last instruction-closing marker)
            marker_pos = text_content.rfind(marker)
            if marker_pos == -1:
                raise ValueError(
                    f"Sample {i} contains no '{marker}' marker; "
                    "cannot locate the start of the output."
                )
            prompt_text = text_content[:marker_pos + len(marker)]

            # Re-tokenize the prompt text to get its length in tokens
            prompt_text_tokenized = self.tokenizer(
                prompt_text,
                return_overflowing_tokens=False,
                return_length=False,
            )
            prompt_text_len = len(prompt_text_tokenized['input_ids'])

            # Mask the prompt; a slice is safe even if the length exceeds the row
            labels[i, :prompt_text_len] = ignore_index

        # Mask padding as well. The pad token may share its id with EOS, so use
        # the attention mask rather than the token id to keep the real EOS.
        if "attention_mask" in batch:
            labels[batch["attention_mask"] == 0] = ignore_index

        batch["labels"] = labels
        return batch

# Instantiate the collator: "longest" padding strategy, with `max_seq_length`
# passed through as the collator's max_length.
data_collator = CustomDataCollator(tokenizer, "longest", max_seq_length)

In [6]:
# Training configuration
training_arguments = TrainingArguments(
    # Output and logging locations
    output_dir=OUTPUT,
    logging_dir="./logs",
    logging_steps=5,
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=1e-4,
    warmup_ratio=0.1,
    weight_decay=0.01,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    # Evaluate and checkpoint once per epoch, keeping at most two checkpoints,
    # then reload the best one (per the Trainer's default selection metric).
    eval_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=2,
    load_best_model_at_end=True,
    # Batch key that holds the targets built by the data collator
    label_names=['labels'],
)

# Trainer wiring: LoRA-wrapped model, mapped datasets and the custom collator
trainer = Trainer(
    model=LMmodel,
    args=training_arguments,
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [7]:
#trainer.train()

In [8]:
import importlib

# Re-import the local `evaluate` module so edits made to it on disk are picked
# up without restarting the kernel.
importlib.reload(evaluate)

# Third positional argument of the evaluator.
# NOTE(review): presumably the generation budget (max new tokens) - confirm
# against the `evaluate` module.
generation_budget = 20

evaluator = evaluate.Evaluator(
    trainer.model,
    tokenizer,
    generation_budget,
    eval_ds,
    DATA,
    SCHEMA,
    DEVICE,
)
evaluator.evaluate()

  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 5/5 [00:05<00:00,  1.20s/it]

[' {"label": ["COMPARE_CONTRAST"]}', ' {"label": ["BACKGROUND"]}', ' {"label": ["COMPARE_CONTRAST"]}', ' {"label": ["USE"]}', ' {"label": ["COMPARE_CONTRAST"]}']
COMPARE_CONTRAST
BACKGROUND
COMPARE_CONTRAST
USE
COMPARE_CONTRAST





In [9]:
# from transformers import logging
# logging.set_verbosity_error()

# class Evaluator:
#     def __init__(
#         self,
#         model,
#         tokenizer,
#         max_new_tokens,
#         eval_dataset,
#         task,
#         schema,
#         cuda_device,
#     ):
#         self.model = model
#         self.tokenizer = tokenizer
#         self.max_new_tokens = max_new_tokens
#         self.eval_dataset = eval_dataset
#         self.task = task
#         self.schema = schema
#         self.device = cuda_device
        
#         model.to(self.device)

#     def evaluate(self):
#         predictions = []
#         for sample in self.eval_dataset:
#             gold_label = sample['gold']['label'][0]
#             input_ids = sample['input_ids']     
                   
#             #generate task completion
#             output = self.generate(input_ids)
#             print(output)
            
#             #evaluate structure
#             struc_eval = self.evaluate_structure(output)   
#             print(struc_eval)
            
#             # add to predictions if ok
#             if struc_eval: 
#                 json_dict = json.loads(output)
#                 predictions.append({
#                     'gold':gold_label,
#                     'pred': json_dict['label']
#                 })
            
#         #qualitative eval
#         qual_eval = self.evaluate_quality(predictions)
#         print(qual_eval)
    
#     def generate(self, input_ids):
#         input_ids = torch.tensor([input_ids]).to(self.device) 
#         res = self.model.generate(input_ids, max_new_tokens = self.max_new_tokens)
#         output = tokenizer.decode(res[0]).split('[/INST]')[-1]
#         output = re.sub('</s>', '', output)
#         return output
    
#     def evaluate_structure(self, output):
#         #check if json
#         is_valid_json = self.is_json(output)
#         if not is_valid_json: return False
        
#         json_dict = json.loads(output)
#         #check if has correct keys
#         has_expected_keys = self.has_keys(json_dict, ['label'])
#         if not has_expected_keys: return False
        
#         #check if class assigned labels exist
#         has_exisiting_classes = self.class_exists([json_dict['label']], ['BACKGROUND','MOTIVATION','COMPARE_CONTRAST','USE'])
#         return has_exisiting_classes
        
    
#     def is_json(self, input_json):
#         try:
#             json.loads(input_json)
#         except ValueError as e:
#             return False
#         return True
    
#     def has_keys(self, input_json, keys):
#         for key in keys:
#             if key not in input_json.keys():
#                 return False
#         return True
    
#     def class_exists(self, class_list, expected_classes):
#         for cls in class_list:
#             print(cls)
#             if cls not in expected_classes:
#                 return False
#         return True
    
#     def evaluate_quality(self, preds):
#         accuracy = mean([1 if sample['gold'] == sample['pred'] else 0 for sample in preds])
#         return accuracy
    


# def calculate_metrics(eval_df):
#     valid_json = sum(eval_df['is_valid_json']) / len(eval_df)
#     eval_df.dropna(inplace=True)
#     macro_f1 = f1_score([int(no) for no in eval_df['label']], [int(no) for no in eval_df['label_pred']], average='macro')
#     micro_f1 = f1_score([int(no) for no in eval_df['label']], [int(no) for no in eval_df['label_pred']], average='micro')
#     return valid_json, micro_f1, macro_f1