<a href="https://colab.research.google.com/github/hadar-grimberg/data-science-portfolio/blob/main/llama_finetuning_LORA_4bit_quantization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import json
import numpy as np


# pd.set_option('display.max_columns', 5)
pd.set_option('display.width', 500)
# Import files
queries = pd.read_csv("user_queries.csv")
descriptions = pd.read_csv("fields_description.csv")


In [None]:
# drop duplicates
queries.drop_duplicates(inplace=True)
# convert into json
queries['json_formatted'] = queries['json'].apply(eval).apply(json.dumps).apply(json.loads)

# build data frame
df = pd.DataFrame()
for ind, row in queries.iterrows():
    a = pd.json_normalize(row.json_formatted, 'statements', 'entityType')
    a["question"] = row["question"]
    a["ind"] = ind
    df = pd.concat([df,a])

df.index = df.ind.values

# Check for nested jsons
j_cols=[]
for col in df.loc[:, df.dtypes == object].columns:
    if df[col].apply(str).apply(len).max() >=100 and col != "question":
        j_cols.append(col)
print(f"These columns may have nested jsons: {j_cols}")
cols = df.type[df['statements'].notnull()].unique()
print(f"These types may have nested jsons: {cols} under 'statements' column")
m = df[df.type.isin(cols) & df.statements.isnull()]
print (f"{len(m)} rows which their type is one of {cols} has empty 'statements' column")

# Normalize the nested jsons
#254


tmp = pd.json_normalize(df.statements[df['statements'].notnull()])
for i in tmp:
    if i ==0:
        sup_statements= pd.json_normalize(tmp.iloc[:,i])
    else:
        b= pd.json_normalize(tmp.iloc[:,i])
        b.columns = [coll + f"_{i}" for coll in  b.columns]
        sup_statements = pd.concat([sup_statements, b], axis=1)

# check if there is entityType or relationTargetType in columns
print (f"There are {len([i for i in sup_statements.columns if 'entity' in i.lower()])} coulmns that may contains entityType")
print (f"There are {len([i for i in sup_statements.columns if 'relation' in i.lower()])} coulmns that may contains relationTargetType")


# convert relationTargetType from list to a str
if df["parameters.relationTargetType"][df["parameters.relationTargetType"].notnull()].apply(len).max() == 1:
    df.loc[df["parameters.relationTargetType"].notnull(), "parameters.relationTargetType"] = \
    df["parameters.relationTargetType"][df["parameters.relationTargetType"].notnull()].explode()

# check if a question can have more than one entityType
df.groupby("ind")["entityType"].nunique()[df.groupby("ind")["entityType"].nunique()!=1] # none

# Check if all entityType are exsist in data
descriptions["entity_name"][~descriptions["entity_name"].isin(df['entityType'])] # none


# Check in which cases we have relationTargetType
print(f"Rows which their type is {df.type[df['parameters.relationTargetType'].notnull()].unique()} have relationTargetType information")


# check if all entities in relationTargetType column are in entityType column
np.in1d(df["parameters.relationTargetType"][df["parameters.relationTargetType"].notnull()].unique(), df.entityType.unique())

print(f"There are total of {len(df.entityType.unique())} entities")

# check which entityTypes have relationTargetType
ent_rel = df.entityType[df['parameters.relationTargetType'].notnull()].unique()
print(f"entityTypes {ent_rel} have relationTargetType information")

# check what are the entityTypes to each relationTargetType
rel_ent = df["parameters.relationTargetType"][df['parameters.relationTargetType'].notnull()].unique()
print(f"relationTargetType {rel_ent} have entityType information")

# There is a direct relation between entityType to relationTargetType
for ent in ent_rel:
    print(f"When entityType is {ent}, relationTargetType is {df['parameters.relationTargetType'][(df.entityType==ent)& (df['parameters.relationTargetType'].notnull())].unique()}")
for rel in rel_ent:
    print(f"When relationTargetType is {rel}, entityType is {df.entityType[df['parameters.relationTargetType']==rel].unique()}")

# the couples of entityType & relationTargetType are either CDR & phone or 'Web Activity' & 'Web Actor'


These columns may have nested jsons: ['statements', 'parameters.statements']
These types may have nested jsons: ['operator' 'relation'] under 'statements' column
1 rows which their type is one of ['operator' 'relation'] has empty 'statements' column
There are 0 coulmns that may contains entityType
There are 0 coulmns that may contains relationTargetType
Rows which their type is ['relation'] have relationTargetType information
There are total of 9 entities
entityTypes ['CDR' 'Web Activity' 'Web Actor' 'Phone'] have relationTargetType information
relationTargetType ['Phone' 'Web Actor' 'Web Activity' 'CDR'] have entityType information
When entityType is CDR, relationTargetType is ['Phone']
When entityType is Web Activity, relationTargetType is ['Web Actor']
When entityType is Web Actor, relationTargetType is ['Web Activity']
When entityType is Phone, relationTargetType is ['CDR']
When relationTargetType is Phone, entityType is ['CDR']
When relationTargetType is Web Actor, entityType is [

In [None]:
# prepare dataset
print(len(df[df["parameters.relationTargetType"].notnull()]))
dataset = df[["question", "entityType","parameters.relationTargetType"]].sort_values(by=["question","parameters.relationTargetType"]).drop_duplicates(subset=["question","entityType"])
dataset["y"] = dataset[["entityType","parameters.relationTargetType"]].apply(lambda x: [i for i in x.fillna(0) if i!=0], axis = 1)
dataset.columns = ['question', 'entityType', 'relationTargetType', 'y']
dataset.sort_index(inplace=True)
print(len(dataset[dataset["relationTargetType"].notnull()]))
print(len(dataset))
print(dataset.head(50))

112
112
742
                                             question     entityType relationTargetType                          y
0            Find all calls made using 3G technology.            CDR                NaN                      [CDR]
1   List all Reddit comments posted yesterday with...   Web Activity                NaN             [Web Activity]
2   Show me investigations that are either open or...  Investigation                NaN            [Investigation]
3   Find all insights related to the witness Jane ...        Insight                NaN                  [Insight]
4   List all web activities updated in the last da...   Web Activity                NaN             [Web Activity]
5   What phones were last active earlier than 1 De...          Phone                NaN                    [Phone]
6        Which phones have been marked as suspicious?          Phone                NaN                    [Phone]
7   What failed call attempts were made from targe...            CDR

In [None]:
### Add NER in order to improve training
# 1. check if entityType is in question
df["entity_in_text"] = df[["entityType", "question"]].apply(lambda x: x.entityType.lower() in x.question.lower(), axis=1)
tmp = df[["entityType", "question", "entity_in_text"]].drop_duplicates().groupby("entityType")["entity_in_text"].apply(lambda x: x.sum()/len(x))
print(tmp)
ent_values = tmp[tmp>=0.85].index.values
df.loc[(df.entityType.isin(ent_values)) & (df.entity_in_text==True), "NER"] = df.entityType[(df.entityType.isin(ent_values)) & (df.entity_in_text==True)]

df.NER = df.NER.apply(lambda x: x.upper() if isinstance(x,str) else x)

entityType
CDR              0.026786
EVisa Request    0.000000
Insight          1.000000
Investigation    0.890411
Person           0.173077
Phone            0.977528
Report           1.000000
Web Activity     0.000000
Web Actor        0.142857
Name: entity_in_text, dtype: float64


In [None]:
# 2. Check if parameters.value in question

df["value_in_text"] = df[["parameters.value", "question"]].apply(lambda x: x["parameters.value"].lower() in x.question.lower() if ((isinstance(x["parameters.value"], str) & (x["parameters.value"]!=""))) else False, axis=1)
df[["entityType", "parameters.operator", "value_in_text"]].groupby(["entityType","parameters.operator"])["value_in_text"].apply(lambda x: x.sum()/len(x))
tmp = df[["entityType", "parameters.operator", "value_in_text"]].groupby(["parameters.operator"])["value_in_text"].apply(lambda x: x.sum()/len(x))
print(tmp)
opr_values = tmp[tmp>=0.7].index.values

df.loc[(df["parameters.operator"].isin(opr_values)) & (df.value_in_text==True), "NER2"] = df["parameters.name"][(df["parameters.operator"].isin(opr_values)) & (df.value_in_text==True)].apply(lambda x: x.split(".")[-1].upper() if len(x.split("."))>1 else np.nan)


parameters.operator
after           0.000000
before          0.000000
begins_with     0.937500
between         0.000000
contains        0.995516
equals          0.712034
greater         0.000000
is              0.020408
is_not_empty    0.000000
not_equal       0.923077
relative        0.000000
similar_to      1.000000
Name: value_in_text, dtype: float64


In [None]:
#3. Create NER table

ner_df = pd.DataFrame(columns=["NER", "words"])
for indd in df.index.unique():
    if isinstance(df.loc[indd], pd.DataFrame):
        ner1 = list(set(df.loc[indd,"NER"][df.loc[indd,"entity_in_text"]==True]))
        if len(ner1)>0:
            if not isinstance(ner1[0], str):
                ner1=[]
        ner = ner1 + [a for a in df.loc[indd, "NER2"].values.tolist() if isinstance(a, str)]
        words = ner1+ list(df.loc[indd,"parameters.value"][df.loc[indd,"value_in_text"]==True])
    else:
        if (df.loc[indd, "entity_in_text"] == True) & (isinstance(df.loc[indd,"NER"], str)):
            ner = [df.loc[indd,"NER"]]
            words = [df.loc[indd,"NER"]]
            if isinstance(df.loc[indd,"NER2"], str):
                ner.append(df.loc[indd,"NER2"])
                words.append(df.loc[indd,"parameters.value"])
        elif isinstance(df.loc[indd, "NER2"], str):
                ner = [df.loc[indd, "NER2"]]
                words = [df.loc[indd, "parameters.value"]]
        else:
            ner=[]
            words=[]

    ner_df.loc[indd, "NER"] = ner
    ner_df.loc[indd, "words"] = words


In [None]:
# Unite ner with dataset and append locations in text

dataset = dataset.join(ner_df)
dataset.NER = dataset.NER.apply(lambda x: [] if (x == [""]) else x)
dataset.words = dataset.words.apply(lambda x: [] if (x == [""]) else x)
dataset["start"] = dataset.apply(lambda x: [x.question.lower().find(w.lower()) for w in x.words if len(x.words)>0],  axis=1)
dataset["end"] = dataset.apply(lambda x: [x.start[i] + len(w)-1 for i, w in enumerate(x.words) if len(x.words)>0],  axis=1)
dataset.columns = ['query', 'entity_type', 'entity_type2', 'y', 'attribute_type', 'attribute_value', 'start_position', 'end_position']


In [None]:
#Split into train and test
msk = np.random.rand(len(dataset)) < 0.9
train_df = dataset[msk]
test_df = dataset[~msk]
print(f"There are {len(train_df[['entity_type', 'entity_type2']].drop_duplicates())} diferent entities in train dataset")
print(f"There are {len(test_df[['entity_type', 'entity_type2']].drop_duplicates())} diferent entities in test dataset")

There are 13 diferent entities in train dataset
There are 12 diferent entities in test dataset


In [None]:
def prepare_json(dataset):
  js= []
  for i, row in dataset.iterrows():
      query_js = {"query": row["query"],
      "entity_type": row["entity_type"]}
      if isinstance(row["entity_type2"], str):
          query_js["entity_type2"] = row["entity_type2"]
      if len(row["attribute_type"])>0:
          query_js["attributes"] = []
          for i in range(len(row["attribute_type"])):
              query_js["attributes"].append({"attribute_type": row["attribute_type"][i],
                                            "attribute_value": row["attribute_value"][i],
                                            "start_position": row["start_position"][i],
                                            "end_position": row["end_position"][i]})

      js.append(query_js)
  return js

In [None]:
train_dataset = prepare_json(train_df)
eval_dataset = prepare_json(test_df)

In [None]:
str(train_dataset[411])


'{\'query\': "Show posts about \'politics\' published by actors who studied at Harvard University.", \'entity_type\': \'Web Activity\', \'entity_type2\': \'Web Actor\', \'attributes\': [{\'attribute_type\': \'TEXT\', \'attribute_value\': \'politics\', \'start_position\': 18, \'end_position\': 25}]}'

In [None]:
# # !pip install -q -U transformers accelerate optimum
# # !pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu117/
# # !pip install langchain
# # !pip install einops
!pip install peft
!pip install trl
!pip install -U bitsandbytes

Collecting peft
  Downloading peft-0.13.0-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.13.0-py3-none-any.whl (322 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.5/322.5 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.13.0
Collecting trl
  Downloading trl-0.11.1-py3-none-any.whl.metadata (12 kB)
Collecting datasets (from trl)
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting tyro>=0.5.11 (from trl)
  Downloading tyro-0.8.11-py3-none-any.whl.metadata (8.4 kB)
Collecting shtab>=1.5.6 (from tyro>=0.5.11->trl)
  Downloading shtab-1.7.1-py3-none-any.whl.metadata (7.3 kB)
Collecting pyarrow>=15.0.0 (from datasets->trl)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets->trl)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets->trl)
  Downloading xxhash

In [None]:
from huggingface_hub import login
from google.colab import userdata

login(userdata.get('huggingFace'))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from peft import prepare_model_for_kbit_training, LoraConfig
from trl import setup_chat_format


model_id = "meta-llama/Meta-Llama-3-8B"
# model_id = "mistralai/Mistral-7B-v0.1"

# Tokenizer setup
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, trust_remote_code=True)

tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = 'left'
tokenizer.model_max_length = 2048

# Quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# Model setup
device_map = {"": torch.cuda.current_device()} if torch.cuda.is_available() else None
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device_map,
    quantization_config=bnb_config
)

model, tokenizer = setup_chat_format(model, tokenizer)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

In [None]:
#Entities description
entis = '''CDR: Question is about communication of any kind like None-Call, Text,Voice, VoiceEdited, Web or Email.\n
           Web Activity: Question is aboutposts and interactions with post thru social media.\n
           Investigation: Question is about investigation.\n
           Insight: Question is about insight.\n
           Phone: Question is about IMEI, IMSI or MSISDN identifier of a phone.\n
           Report: Question is about retrieve information about a reports.\n
           Person: Question is about retrieve information about a people.\n
           Web Actor: Question is about the preformer of a web activity.\n
           EVisa Request: Question is about EVisa request or travel history.
           '''

In [None]:
def get_fine_tune_prompt(
    ents_str: str,
    input_str: str,
    atts_str: str,
    label_str: list,
    tokenizer: AutoTokenizer,
) -> torch.Tensor:

    """
    Args:
    ents_str (str): Strings representing entity labels and its
                          corresponding description
    input_str (str): Actual input query string on which calssification need to be
                     performed
    atts_str (str):  Input attributes stringcorresponding to input_str
    label_str (list): Expected output string or strings corresponding to input_str
    tokenizer (PreTrainedTokenizerBase): A tokenizer corresponding to the model
                                         being fine-tuned

    Returns:
    torch.Tensor: Tensor of tokenized input ids
    """

    usr_msg1 = "You are given a user queries about a certain entity. " \
        "You are also given a list of entity types representing types of the content world of the question. " \
        "Your goal is to predict which entities are mentioned or implied by the user’s query." \
        "Note, some queries belong to one entity and some to two entities" \
        "in order to assist you, attributes from the query will be provided in a json format." \
        "The JSON keys: 'attribute_type' (label of the detected attribute), 'attribute_value' (actual str" \
        " value of the attribute), 'start_position' (start character index), 'end_position' (end character index). "\
        "Not all queries contain attributes. Sometimes you'll get an empty string" \
        "The output must be a list with one or two entities. Do not perform false identifications." \
        f"""\n\nList Of Entities\n{ents_str}"""\
        "\n\n" \
        "Are the instructions clear to you?"

    asst_msg1 = "Yes, the instructions are clear. First, I will identify the attributes, " \
        "in order to predict which entities are mentioned or implied by the user’s query." \
        "In the end I will return a list with one or two entities"

    usr_msg2 = "What SMS messages were sent from suspicious phones to 0549876543 containing the word 'urgent'?"

    asst_msg2 = " [{ "\
                "\"attribute_type\": \"msisdn2\", "\
                "\"attribute_value\": \"0549876543\", "\
                "\"start_position\": 51, "\
                "\"end_position:\": 61 }, "\
                "{\"attribute_type\": \"smsText\", "\
                "\"attribute_value\": \"urgent\", "\
                "\"start_position\": 86, "\
                "\"end_position:\": 91 } ] "

    usr_msg3 = "Good! Now predict the entity or entities based on these attributes"

    asst_msg3 = "[\"CDR\", \"Phone\"]"


    usr_msg4 = "Give a brief explanation of why your answer is correct."

    asst_msg4 = "I identified and some attributes such as \"0549876543\" which is \"msisdn2\", " \
                "The word \"urgent\" which is \"smsText\". " \
                "According to the querty and its attributes I predict the entities \"CDR\", & \"Phone\" "\
                "I am going to return a list with two items: [\"CDR\", \"Phone\"] "


    usr_msg5 = "Great! I am now going to give you another user query. Please detect sub-entities and entities in it " \
        "according to the previous instructions. Do not include an explanation in your answer."


    asst_msg5 = "Sure! Please give me the user query."

    messages = [
        {"role": "user", "content": usr_msg1},
        {"role": "assistant", "content": asst_msg1},
        {"role": "user", "content": usr_msg2},
        {"role": "assistant", "content": asst_msg2},
        {"role": "user", "content": usr_msg3},
        {"role": "assistant", "content": asst_msg3},
        {"role": "user", "content": usr_msg4},
        {"role": "assistant", "content": asst_msg4},
        {"role": "user", "content": usr_msg5},
        {"role": "assistant", "content": asst_msg5},
        {"role": "user", "content": input_str},
        {"role": "assistant", "content": atts_str},
        {"role": "user", "content": usr_msg3},
        {"role": "assistant", "content": str(label_str)},
    ]

    encoded_input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=False)

    return encoded_input_ids

In [None]:
training = [get_fine_tune_prompt(entis, input["query"], str(input["attributes"]) if "attributes" in input.keys() else "", [v for k,v in input.items() if k.startswith("entity_type")], tokenizer) for input in train_dataset]

In [None]:
training[0]

[{'role': 'user',
  'content': "You are given a user queries about a certain entity. You are also given a list of entity types representing types of the content world of the question. Your goal is to predict which entities are mentioned or implied by the user’s query.Note, some queries belong to one entity and some to two entitiesin order to assist you, attributes from the query will be provided in a json format.The JSON keys: 'attribute_type' (label of the detected attribute), 'attribute_value' (actual str value of the attribute), 'start_position' (start character index), 'end_position' (end character index). Not all queries contain attributes. Sometimes you'll get an empty stringThe output must be a list with one or two entities. Do not perform false identifications.\n\nList Of Entities\nCDR: Question is about communication of any kind like None-Call, Text,Voice, VoiceEdited, Web or Email.\n\n           Web Activity: Question is aboutposts and interactions with post thru social media

In [None]:
training[0]

'<|im_start|>user\nYou are given a user queries about a certain entity. You are also given a list of entity types representing types of the content world of the question. Your goal is to predict which entities are mentioned or implied by the user’s query.Note, some queries belong to one entity and some to two entitiesin order to assist you, attributes from the query will be provided in a json format.The JSON keys: \'attribute_type\' (label of the detected attribute), \'attribute_value\' (actual str value of the attribute), \'start_position\' (start character index), \'end_position\' (end character index). Not all queries contain attributes. Sometimes you\'ll get an empty stringThe output must be a list with one or two entities. Do not perform false identifications.\n\nList Of Entities\nCDR: Question is about communication of any kind like None-Call, Text,Voice, VoiceEdited, Web or Email.\n\n           Web Activity: Question is aboutposts and interactions with post thru social media.\n\

In [None]:
a = pd.DataFrame(training, columns=["text"])

In [None]:
import datasets
a = datasets.Dataset.from_pandas(a)

In [None]:
a

Dataset({
    features: ['text'],
    num_rows: 676
})

In [None]:
model = prepare_model_for_kbit_training(model)

# LoRA configuration

peft_config = LoraConfig(
    lora_alpha=128,
    lora_dropout=0.05,
    r=256,
    bias="none",
    target_modules=["q_proj", "o_proj", "gate_proj", "up_proj", "down_proj", "k_proj", "v_proj"],
    task_type="CAUSAL_LM",)

In [None]:
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="sft_model_path",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    optim="adamw_8bit",
    logging_steps=10,
    save_strategy="epoch",
    learning_rate=2e-4,
    bf16=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    report_to="tensorboard",)

In [None]:
from dataclasses import dataclass
from transformers.utils import PaddingStrategy
from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union

@dataclass
class CustomDataCollatorWithPadding:
    """
    Data collator that will dynamically pad the inputs received.

    Args:
        tokenizer AutoTokenizer:
            The tokenizer used for encoding the data.
        padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:

            - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single
              sequence is provided).
            - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
              acceptable input length for the model if that argument is not provided.
            - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different lengths).
        max_length (`int`, *optional*):
            Maximum length of the returned list and optionally padding length (see above).
        pad_to_multiple_of (`int`, *optional*):
            If set will pad the sequence to a multiple of the provided value.

            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
        return_tensors (`str`, *optional*, defaults to `"pt"`):
            The type of Tensor to return. Allowable values are "np", "pt" and "tf".
    """

    tokenizer: AutoTokenizer
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    return_tensors: str = "pt"

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        batch = self.tokenizer.pad(
            features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors=self.return_tensors,
        )
        labels = batch["input_ids"].clone()

        # Set loss mask for all pad tokens
        labels[labels == self.tokenizer.pad_token_id] = -100

        # Compute loss mask for appropriate tokens only
        for i in range(batch['input_ids'].shape[0]):

            # Decode the training input
            text_content = self.tokenizer.decode(batch['input_ids'][i][1:])  # slicing from [1:] is important because tokenizer adds bos token

            # Extract substrings for prompt text in the training input
            # The training input ends at the one before last assistent msg starts in "assistant\n"
            prompt_gen_boundary = text_content.rfind("assistant", 0, text_content.rfind("assistant")) + len("assistant")+2 #include the '\n' as well
            prompt_text = text_content[:prompt_gen_boundary]

            # print(f"""PROMPT TEXT:\n{prompt_text}""")

            # retokenize the prompt text only
            prompt_text_tokenized = self.tokenizer(
                prompt_text,
                return_overflowing_tokens=False,
                return_length=False,
            )
            # compute index where prompt text ends in the training input
            prompt_tok_idx = len(prompt_text_tokenized['input_ids'])

            # Set loss mask for all tokens in prompt text
            labels[i][range(prompt_tok_idx)] = -100

            # print("================DEBUGGING INFORMATION===============")
            # for idx, tok in enumerate(labels[i]):
            #     token_id = batch['input_ids'][i][idx]
            #     decoded_token_id = self.tokenizer.decode(batch['input_ids'][i][idx])
            #     print(f"""TOKID: {token_id} | LABEL: {tok} || DECODED: {decoded_token_id}""")

        batch["labels"] = labels
        return batch

In [None]:
from trl import SFTTrainer
# from transformers import DataCollatorWithPadding


trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=a,
    peft_config=peft_config,
    max_seq_length=512,
    tokenizer=tokenizer,
    dataset_text_field="text",
    packing=False,
    # data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    dataset_kwargs={
        "add_special_tokens": False,
        "append_concat_token": False,
    },
      data_collator=CustomDataCollatorWithPadding(
      tokenizer=tokenizer,
      padding="longest",
      max_length=512,
      return_tensors="pt")
)
trainer.train()
trainer.save_model()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/676 [00:00<?, ? examples/s]

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
10,1.6542
20,0.0136
30,0.0348


Step,Training Loss
10,1.6542
20,0.0136
30,0.0348
40,0.0001
50,0.0
60,0.0
70,0.0
80,0.0




In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!cp -r  "/content/sft_model_path" "/content/drive/MyDrive"
# !cp -r  "/content/drive/MyDrive/sft_model_path" "/content"


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from peft import prepare_model_for_kbit_training, LoraConfig
from trl import setup_chat_format

In [None]:
# Tokenizer setup
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B", use_fast=True, trust_remote_code=True)

tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = 'left'
tokenizer.model_max_length = 2048

In [None]:
# Load the model trained weights
from peft import PeftModel

base_model = "meta-llama/Meta-Llama-3-8B"
new_model = "/content/sft_model_path/checkpoint-84"

base_model_reload = AutoModelForCausalLM.from_pretrained(
    base_model,
    return_dict=True,
    torch_dtype=torch.float16,
    trust_remote_code=True,
)

base_model_reload, tokenizer = setup_chat_format(base_model_reload, tokenizer)

model = PeftModel.from_pretrained(base_model_reload, new_model)
model = model.merge_and_unload()

model.save_pretrained("llama-3-8b-NER")
tokenizer.save_pretrained("llama-3-8b-NER")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

('llama-3-8b-NER/tokenizer_config.json',
 'llama-3-8b-NER/special_tokens_map.json',
 'llama-3-8b-NER/tokenizer.json')

In [None]:
def get_test_prompt(
    ents_str: str,
    input_str: str,
    tokenizer: AutoTokenizer,
) -> torch.Tensor:

    """
    Args:
    ents_str (str): Strings representing entity labels and its
                          corresponding description
    input_str (str): Actual input query string on which calssification need to be
                     performed
    atts_str (str):  Input attributes stringcorresponding to input_str
    label_str (list): Expected output string or strings corresponding to input_str
    tokenizer (PreTrainedTokenizerBase): A tokenizer corresponding to the model
                                         being fine-tuned

    Returns:
    torch.Tensor: Tensor of tokenized input ids
    """

    usr_msg1 = "You are given a user queries about a certain entity. " \
        "You are also given a list of entity types representing types of the content world of the question. " \
        "Your goal is to predict which entities are mentioned or implied by the user’s query." \
        "Note, some queries belong to one entity and some to two entities" \
        "in order to assist you, attributes from the query will be provided in a json format." \
        "The JSON keys: 'attribute_type' (label of the detected attribute), 'attribute_value' (actual str" \
        " value of the attribute), 'start_position' (start character index), 'end_position' (end character index). "\
        "Not all queries contain attributes. Sometimes you'll get an empty string" \
        "The output must be a list with one or two entities. Do not perform false identifications." \
        f"""\n\nList Of Entities\n{ents_str}"""\
        "\n\n" \
        "Are the instructions clear to you?"

    asst_msg1 = "Yes, the instructions are clear. First, I will identify the attributes, " \
        "in order to predict which entities are mentioned or implied by the user’s query." \
        "In the end I will return a list with one or two entities"

    usr_msg2 = "What SMS messages were sent from suspicious phones to 0549876543 containing the word 'urgent'?"

    asst_msg2 = " [{ "\
                "\"attribute_type\": \"msisdn2\", "\
                "\"attribute_value\": \"0549876543\", "\
                "\"start_position\": 51, "\
                "\"end_position:\": 61 }, "\
                "{\"attribute_type\": \"smsText\", "\
                "\"attribute_value\": \"urgent\", "\
                "\"start_position\": 86, "\
                "\"end_position:\": 91 } ] "

    usr_msg3 = "Good! Now predict the entity or entities based on these attributes"

    asst_msg3 = "[\"CDR\", \"Phone\"]"


    usr_msg4 = "Give a brief explanation of why your answer is correct."

    asst_msg4 = "I identified and some attributes such as \"0549876543\" which is \"msisdn2\", " \
                "The word \"urgent\" which is \"smsText\". " \
                "According to the querty and its attributes I predict the entities \"CDR\", & \"Phone\" "\
                "I am going to return a list with two items: [\"CDR\", \"Phone\"] "


    usr_msg5 = "Great! I am now going to give you another user query. Please detect sub-entities and entities in it " \
        "according to the previous instructions. Do not include an explanation in your answer."


    asst_msg5 = "Sure! Please give me the user query."

    messages = [
        {"role": "user", "content": usr_msg1},
        {"role": "assistant", "content": asst_msg1},
        {"role": "user", "content": usr_msg2},
        {"role": "assistant", "content": asst_msg2},
        {"role": "user", "content": usr_msg3},
        {"role": "assistant", "content": asst_msg3},
        {"role": "user", "content": usr_msg4},
        {"role": "assistant", "content": asst_msg4},
        {"role": "user", "content": usr_msg5},
        {"role": "assistant", "content": asst_msg5},
        {"role": "user", "content": input_str},
    ]

    encoded_input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt", tokenize=False)

    return encoded_input_ids

In [None]:
testing = [get_test_prompt(entis, input["query"], tokenizer) for input in eval_dataset]

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device="cuda"
)

# outputs = pipe(testing[1], max_new_tokens=120, temperature=0.7, top_k=50, top_p=0.95)
outputs = [pipe(prompt, max_new_tokens=120, temperature=0.7, top_k=50, top_p=0.95) for prompt in testing]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [None]:
  r = outputs[1][0]['generated_text'].rfind("[")
  l = outputs[1][0]["generated_text"].rfind("]", 0, outputs[1][0]["generated_text"].rfind("]"))
  outputs[1][0]["generated_text"][r:]

'["CDR", "Phone"] ご\nuser\nGive a brief explanation of why your answer is'

In [None]:
predicted_entities = []
for message in outputs:
  r = message[0]["generated_text"].rfind("[")
  l = message[0]["generated_text"].rfind("]")
  predicted_entities.append(message[0]["generated_text"][r:l+1])


In [None]:
eval_dataset

NameError: name 'eval_dataset' is not defined

In [None]:
predicted_entities

['["Web Activity", "Web Actor"]',
 '["CDR", "Phone"]',
 '["Report", "CDR"]',
 '',
 '',
 '["CDR", "Phone", "Investigation"]',
 '["CDR", "Phone"]',
 '',
 '["Report", "Web Activity" ]',
 '["CDR", "Phone"]',
 '["CDR", "Phone"]',
 '["CDR", "Phone"]',
 '["CDR", "Phone"]',
 '["Web Activity", "CDR"]',
 '["Web Actor", "Web Activity"]',
 '["CDR", "Phone"]',
 '["CDR", "Phone", "EVisa Request"]',
 '["CDR", "Phone"]',
 '["Phone", "CDR", "Investigation", "Insight"]',
 '',
 '["CDR", "Phone"] ://, "Phone"]',
 '["CDR", "Phone"]',
 '["Phone", "CDR"]',
 '["CDR", "Phone"]',
 '["Web Activity", "Web Actor"]',
 '',
 '["Report", "CDR"]',
 '["CDR", "Phone"]',
 '["Web Activity", "CDR", "Phone"]',
 '["EVisa Request", "Phone"]',
 '["CDR", "Phone"]',
 '["Phone", "CDR", "Web Activity", "Web Actor"]',
 '["Phone", "CDR", "Phone"]',
 '["CDR", "Web Actor", "Web Activity", "Phone"]',
 '["CDR", "Phone", "Web", "Web Actor", "EVisa Request" ]',
 '["Report", "Web"]',
 '["Investigation", "Report"]',
 '["CDR", "Investigation"

In [None]:
outputs

[{'generated_text': '<|im_start|>user\nYou are given a user queries about a certain entity. You are also given a list of entity types representing types of the content world of the question. Your goal is to predict which entities are mentioned or implied by the user’s query.Note, some queries belong to one entity and some to two entitiesin order to assist you, attributes from the query will be provided in a json format.The JSON keys: \'attribute_type\' (label of the detected attribute), \'attribute_value\' (actual str value of the attribute), \'start_position\' (start character index), \'end_position\' (end character index). Not all queries contain attributes. Sometimes you\'ll get an empty stringThe output must be a list with one or two entities. Do not perform false identifications.\n\nList Of Entities\nCDR: Question is about communication of any kind like None-Call, Text,Voice, VoiceEdited, Web or Email.\n\n           Web Activity: Question is aboutposts and interactions with post t

In [None]:
testing[1]

'<|im_start|>user\nYou are given a user queries about a certain entity. You are also given a list of entity types representing types of the content world of the question. Your goal is to predict which entities are mentioned or implied by the user’s query.Note, some queries belong to one entity and some to two entitiesin order to assist you, sub-entities and attributes from the query will be providedin a json format. The JSON keys: \'query\', \'attribute_type\' (label of the detected sub-entity), \'attribute_value\' (actual str value of the sub-entity), \'start_position\' (start character index), \'end_position\' (end character index), \'entity_type\' (type of the entity that should be classify) and \'entity_type2\' (type of the entity that should be classify in case of two entities). The output must be a list with one or two entities. Do not perform false identifications.\n\nList Of Entities\nCDR: Question is about communication of any kind like None-Call, Text,Voice, VoiceEdited, Web 

In [None]:
outputs = pipe(testing[1], max_new_tokens=30, temperature=0.9, top_k=50, top_p=0.95)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [None]:
outputs

[{'generated_text': '<|im_start|>user\nYou are given a user queries about a certain entity. You are also given a list of entity types representing types of the content world of the question. Your goal is to predict which entities are mentioned or implied by the user’s query.Note, some queries belong to one entity and some to two entitiesin order to assist you, sub-entities and attributes from the query will be providedin a json format. The JSON keys: \'query\', \'attribute_type\' (label of the detected sub-entity), \'attribute_value\' (actual str value of the sub-entity), \'start_position\' (start character index), \'end_position\' (end character index), \'entity_type\' (type of the entity that should be classify) and \'entity_type2\' (type of the entity that should be classify in case of two entities). The output must be a list with one or two entities. Do not perform false identifications.\n\nList Of Entities\nCDR: Question is about communication of any kind like None-Call, Text,Voic

In [None]:
outputs

[{'generated_text': '<|im_start|>user\nYou are given a user queries about a certain entity. You are also given a list of entity types representing types of the content world of the question. Your goal is to predict which entities are mentioned or implied by the user’s query.Note, some queries belong to one entity and some to two entitiesin order to assist you, sub-entities and attributes from the query will be providedin a json format. The JSON keys: \'query\', \'attribute_type\' (label of the detected sub-entity), \'attribute_value\' (actual str value of the sub-entity), \'start_position\' (start character index), \'end_position\' (end character index), \'entity_type\' (type of the entity that should be classify) and \'entity_type2\' (type of the entity that should be classify in case of two entities). The output must be a list with one or two entities. Do not perform false identifications.\n\nList Of Entities\nCDR: Question is about communication of any kind like None-Call, Text,Voic

In [None]:
# Execute predictions
testing = [get_fine_tune_prompt(entis, str(input), [v for k,v in input.items() if k.startswith("entity_type")], tokenizer) for input in eval_dataset]

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

outputs = [pipe(prompt, max_new_tokens=120, temperature=0.7, top_k=50, top_p=0.95) for prompt in testing]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


KeyboardInterrupt: 

In [None]:
outputs