In [1]:
import pandas as pd
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import re
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, Trainer, TrainingArguments
from trl import SFTTrainer, SFTConfig
import torch
from peft import LoraConfig, get_peft_model
from datasets import Dataset
from sklearn.model_selection import train_test_split

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_id = 'mistralai/Mistral-7B-Instruct-v0.3'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# load data
cce_df = pd.read_csv("./data/finecite/full_data.csv")

In [3]:
def label_mapping(label):
    if label == 1: return 'INFORMATION'
    if label == 2: return 'PERCEPTION'
    if label == 3: return 'BACKBROUND'

def replace_ref(word):
    if re.match(r'single', word):
        return '[REF]'
    else:
        return '[GREF]'


# Process the DataFrame
results = []
for index, row in cce_df.iterrows():
    
    #clean all ';' from references
    clean_paragraph = re.sub(r'<ref.*?>.*?</ref>',lambda x: x.group().replace(';',','), row['paragraph'])
    
    #create word list of par
    words_par = clean_paragraph.split(';')
    
    #replace (G)TREF
    words_par[row['target_reference_location']] = '[TREF]' if re.search(r'single', words_par[row['target_reference_location']]) else '[GTREF]'
    
    # Clean the paragraph by replacing <ref> tags with '[TREF]'
    words_par = [replace_ref(word) if re.search(r'<ref.*?>.*?</ref>', word) else word for word in words_par]
    

    # Process the context_location1 list
    context_location1 = eval(row["context_location1"])

    # check whether context label and word list are same length
    assert len(context_location1) == len(words_par), f'The labels are of length {len(context_location1)}, while the word list is of length {len(words_par)}'
    
    # Check if the lengths match, and map the context_location1 to the words
    if len(context_location1) == len(words_par):
        # Aggregate the mapped results for the current row
        mapped_result = list(zip(context_location1, words_par))
        
        # Separate the numbers and words into separate lists
        numbers = [item[0] for item in mapped_result]  # Convert numbers to strings
        mapped_words = [item[1].strip() for item in mapped_result]  # Strip extra spaces from words

        sem_structured_context = ''
        staged_context = []
        prev_label = 0
        for label, context in mapped_result:
            if label != prev_label and prev_label != 0:
                sem_structured_context += f"<{label_mapping(prev_label)}> {' '.join(staged_context)} </{label_mapping(prev_label)}>"
                staged_context = []
            if label != 0: staged_context.append(context)
            prev_label = label
                
            
        
        
        results.append({
            "Paragraph": ' '.join(mapped_words),
            "Scope": numbers,
            "Sem_struc_context": str(sem_structured_context)
        })
    else:
        results.append({
            "Paragraph": "Length of context_location1 and words don't match",
            "Scope": "Mismatch",
            "Sem_struc_context": 'Mismatch'
        })

# Convert results to DataFrame
df = pd.DataFrame(results)
df

Unnamed: 0,Paragraph,Scope,Sem_struc_context
0,Neural Machine Translation (NMT) has opened se...,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...",<BACKBROUND> Neural Machine Translation (NMT) ...
1,"As shown in Table 1, the size of the 'in-domai...","[0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...",<BACKBROUND> the size of the 'in-domain' TED t...
2,Automatic extraction of events has gained siza...,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...",<PERCEPTION> Automatic extraction of events ha...
3,The subject NP 'Bill' is coindexed with the tr...,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...",<BACKBROUND> The subject NP 'Bill' is coindexe...
4,Self-training [GTREF] ) uses a source-to-targe...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",<INFORMATION> Self-training [GTREF] ) uses a s...
...,...,...,...
1050,"For the final-stage neural reranker, we experi...","[0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 0, 0, 0, 0, ...",<BACKBROUND> BERT-large and T5-base </BACKBROU...
1051,Trained on 20GB texts of both Vietnamese news ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",<BACKBROUND> ViBERT was trained on 60GB texts ...
1052,Pretraining Corpus: Following the E2E pretrain...,"[0, 0, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, ...",<PERCEPTION> Following </PERCEPTION><INFORMATI...
1053,The nouns are organized as an inheritance syst...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 3, 3, ...",<INFORMATION> The nouns are organized as an in...


In [25]:
def get_fine_tune_prompt_xml( 
    input_str: str,
    label_str: str,
    tokenizer,
) -> torch.Tensor:

    """
    Args:
    rule_set (List[str]): List of strings representing entity labels and its
                          corresponding description
    input_str (str): Actual input string on which detections need to be
                     performed
    label_str (str): Expected output string corresponding to input_str
    tokenizer (PreTrainedTokenizerBase): A tokenizer corresponding to the model
                                         being fine-tuned

    Returns: 
    torch.Tensor: Tensor of tokenized input ids
    """

    usr_msg1 = "You are given a excerpt from a scientific text including citation marker, marked as [REF], [GREF], [TREF], and [GTREF]." \
        "[REF] is a placehoder for a single reference, [GREF] for multiple references, and [TREF] or [GTREF] are the references targeted for annotation" \
        "You are further given a list of context types, which discribe different information entities in relation to the reference marker. " \
        "Your task is to detect and identify all instances of the supplied context types of the as target marked reference in the provided text. " \
        "The output should only contain the the citation context, marked with the three context types" \
        "Each span of text that belongs to a certain context type should be enclosed with the XML-tag" \
        "For example, a span of thext belonging to INFORMATION should be enclosed within <INFORMATION></INFORMATION> tags." \
        "Ensure that all context spans are identified. Do not perform false identifications." \
        f"""\n\nList Of Entities\n
        INFORMATION: content from the cited paper (what is cited?)
        PERCEPTION: perception or use of the cited content (how is cited/used?)
        BACKGROUND: backgound of the citation (why is it cited?)
        """\
        "\n\n" \
        "Are the instructions clear to you?"
    
    asst_msg1 = "Yes, the instructions are clear to me."\
                # "To clarify, as you have provide me with an excerpt from a scientific text, my task is as follows:"\
                # "Identify all instances of the provided reference markers [REF], [GREF], [TREF], and [GTREF] in the text."\
                # "Detect and identify all instances of the specified context types INFORMATION, PERCEPTION, and BACKGROUND related to the targeted reference mark"\
    
    usr_msg2 = "Neural Machine Translation (NMT) has opened several research directions to exploit as many and diverse data as possible. Massive multilingual NMT models, for instance, take advantage of several language-pair datasets in a single system [TREF] . This offers several advantages, such as a ..."

    asst_msg2 = "<BACKBROUND> Neural Machine Translation (NMT) has opened several research directions to exploit as many and diverse data as possible. </BACKBROUND><INFORMATION> Massive multilingual NMT models, for instance, take advantage of several language-pair datasets in a single system [TREF] . </INFORMATION>"

    usr_msg3 = "Give a brief explanation of why your answer is correct."

    asst_msg3 = "The provided text discusses the advantages and limitations of massive multilingual NMT models." \
                "The context type 'BACKGROUND' is used to explain the purpose of these models, which is to exploit as many and diverse data as possible."\
                "The context type 'INFORMATION' is used to provide specific details about the massive multilingual NMT models, such as the fact that they take advantage of several language-pair datasets in a single system."\
                "The reference marker [TREF] is used to cite the source of this information. The context type 'PERCEPTION' is not used in this text, as it refers to the perception or use of the cited content, and in this case, there is no explicit mention of how the information is being used or perceived"
    
    usr_msg4 = "Great! I am now going to give you another user utterance. Please detect the context types in it " \
                "according to the previous instructions. Do not include an explanation in your answer."
    
    asst_msg4 = "Sure! Please give me the user utterance."

    messages = [
        {"role": "user", "content": usr_msg1},
        {"role": "assistant", "content": asst_msg1},
        # {"role": "user", "content": usr_msg2},
        # {"role": "assistant", "content": asst_msg2},
        # {"role": "user", "content": usr_msg3},
        # {"role": "assistant", "content": asst_msg3},
        # {"role": "user", "content": usr_msg4},
        # {"role": "assistant", "content": asst_msg4},
        {"role": "user", "content": input_str},
        #{"role": "assistant", "content": label_str},
    ]
    encoded_input_ids = tokenizer.apply_chat_template(messages)

    return {'input_ids': encoded_input_ids}

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token_id = tokenizer.eos_token_id

In [6]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_storage=torch.bfloat16,
)

LMmodel = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config = bnb_config,
    torch_dtype = torch.bfloat16,
    device_map = 'auto'
)

peft_config = LoraConfig(target_modules=[ "v_proj", "q_proj", "up_proj", "o_proj", "k_proj", "down_proj", "gate_proj" ], inference_mode=False, r=4, lora_alpha=32, lora_dropout=0.1)

LMmodel = get_peft_model(LMmodel, peft_config)

LMmodel.print_trainable_parameters()

Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00,  1.99s/it]


trainable params: 10,485,760 || all params: 7,258,509,312 || trainable%: 0.1445


In [7]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from transformers.utils import PaddingStrategy
from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union

@dataclass
class CustomDataCollatorWithPadding:
    """
    Data collator that will dynamically pad the inputs received.

    Args:
        tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
            The tokenizer used for encoding the data.
        padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:

            - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single
              sequence is provided).
            - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
              acceptable input length for the model if that argument is not provided.
            - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different lengths).
        max_length (`int`, *optional*):
            Maximum length of the returned list and optionally padding length (see above).
        pad_to_multiple_of (`int`, *optional*):
            If set will pad the sequence to a multiple of the provided value.

            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
        return_tensors (`str`, *optional*, defaults to `"pt"`):
            The type of Tensor to return. Allowable values are "np", "pt" and "tf".
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    return_tensors: str = "pt"

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        batch = self.tokenizer.pad(
            features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors=self.return_tensors,
        )
        labels = batch["input_ids"].clone()
        
        # Set loss mask for all pad tokens
        labels[labels == self.tokenizer.pad_token_id] = -100
        
        # Compute loss mask for appropriate tokens only
        for i in range(batch['input_ids'].shape[0]):
            
            # Decode the training input
            text_content = self.tokenizer.decode(batch['input_ids'][i][1:])  # slicing from [1:] is important because tokenizer adds bos token
            
            # Extract substrings for prompt text in the training input
            # The training input ends at the last user msg ending in [/INST]
            prompt_gen_boundary = text_content.rfind("[/INST]") + len("[/INST]")
            prompt_text = text_content[:prompt_gen_boundary]
            
            # print(f"""PROMPT TEXT:\n{prompt_text}""")
            
            # retokenize the prompt text only
            prompt_text_tokenized = self.tokenizer(
                prompt_text,
                return_overflowing_tokens=False,
                return_length=False,
            )
            # compute index where prompt text ends in the training input
            prompt_tok_idx = len(prompt_text_tokenized['input_ids'])
            
            # Set loss mask for all tokens in prompt text
            labels[i][range(prompt_tok_idx)] = -100
            
            # print("================DEBUGGING INFORMATION===============")
            # for idx, tok in enumerate(labels[i]):
            #     token_id = batch['input_ids'][i][idx]
            #     decoded_token_id = self.tokenizer.decode(batch['input_ids'][i][idx])
            #     print(f"""TOKID: {token_id} | LABEL: {tok} || DECODED: {decoded_token_id}""")
                    
        batch["labels"] = labels
        return batch

In [26]:
# split data in train / test
train_df, test_df = train_test_split(df, test_size=0.2, random_state=96, shuffle=True)

# Convert the DataFrame to a Dataset
train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)

#Apply the tokenization function to the dataset
# train_ds = train_ds.map(
#     lambda row: get_fine_tune_prompt_xml(row['Paragraph'], row['Sem_struc_context'], tokenizer), 
#     batched=False, 
#     remove_columns=train_ds.column_names  # Remove all original columns
# )

test_ds = test_ds.map(
    lambda row: get_fine_tune_prompt_xml(row['Paragraph'], row['Sem_struc_context'], tokenizer), 
    batched=False, 
    remove_columns=test_ds.column_names  # Remove all original columns
)

Map:   0%|          | 0/211 [00:00<?, ? examples/s]

Map: 100%|██████████| 211/211 [00:00<00:00, 1961.75 examples/s]


In [9]:
max_seq_length = 800
training_arguments = SFTConfig(
    output_dir="./tmp",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=2e-5,
    weight_decay=0.01,
)
trainer = SFTTrainer(
    model=LMmodel,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    dataset_text_field="input_ids",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    # Using custom data collator inside SFTTrainer
    data_collator=CustomDataCollatorWithPadding(
        tokenizer=tokenizer, 
        padding="longest", 
        max_length=max_seq_length, 
        return_tensors="pt"
    )
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


In [10]:
data_collator=CustomDataCollatorWithPadding(
    tokenizer=tokenizer, 
    padding="longest", 
    max_length=max_seq_length, 
    return_tensors="pt"
)
res = trainer.model.generate(data_collator([test_ds[0]])['input_ids'], max_new_tokens=512)
tokenizer.decode(res[0]).split('[/INST]')[-1]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


' <BACKBROUND> FastText embeddings are also used for Hindi, French, Spanish </BACKBROUND><INFORMATION> Italian word representations </INFORMATION></s>\n\nThe bi-LSTM model is trained on a fixed 300 hidden dimensions for all the bi-LSTMs in the architecture [GREF]. The model is trained on a dataset of 100,000 sentences [TREF]. The dataset is a combination of scientific papers and news articles [GREF]. The dataset is split into training, validation, and test sets with a ratio of 80:10:10 respectively [TREF].\n\nThe model is evaluated on a test set of 10,000 sentences [TREF]. The evaluation metric used is F1-score [TREF]. The F1-score is calculated for each label in the multi-label classification problem [TREF]. The average F1-score across all labels is reported as the final performance of the model [TREF].\n\nThe model is also evaluated on a separate validation set of 1,000 sentences [TREF]. The validation set is used to tune the hyperparameters of the model [TREF]. The hyperparameters t

In [29]:
res = trainer.model.generate(data_collator([test_ds[3]])['input_ids'], max_new_tokens=512)
tokenizer.decode(res[0]).split('[/INST]')[-1]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


' <BACKBROUND> Our system outperforms the R-Net baseline (Rouge-L: 40.22) used by [GREF] . </BACKBROUND><PERCEPTION> If there is a trade-off sought between computing time and accuracy, our system performs similar to or better than the baseline used by </PERCEPTION><INFORMATION> [TREF] ROUGE score is not the best metric for tasks such as opinion question answering. </INFORMATION> We believe the cosine similarity is a better metric to measure how close the retrieved answer is to the gold standard. Overall the sim method is able to provide an answer more than 70% similar to the gold standard answer 91.5% of the time. From the sentences returned by our system as candidate answers, 72% of the time at least half the candidate sentences are good answers. </PERCEPTION> This shows that our system is consistent and accurate at providing good answers.</s>'

In [31]:
tokenizer.decode([no for no in data_collator([test_ds[4]])['labels'][0] if no != -100])

''

In [12]:
trainer.train()

Step,Training Loss
500,0.2478
1000,0.1827
1500,0.1352
2000,0.1151
2500,0.0879


TrainOutput(global_step=2532, training_loss=0.15242057120630526, metrics={'train_runtime': 1477.8347, 'train_samples_per_second': 1.713, 'train_steps_per_second': 1.713, 'total_flos': 5.957338321974067e+16, 'train_loss': 0.15242057120630526, 'epoch': 3.0})

In [13]:
#evaluate the fine tuned model
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results['eval_loss'])
eval_results['perplexity'] = perplexity
print(f"Fine-tuned {model_id} Results: Perplexity: {perplexity:.2f}")

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


NameError: name 'math' is not defined

In [16]:
trainer.evaluate()

{'eval_runtime': 48.6469,
 'eval_samples_per_second': 4.337,
 'eval_steps_per_second': 4.337,
 'epoch': 3.0}