## Downloading the required libraries

In [1]:
!pip install transformers==4.57.1 bitsandbytes==0.48.1 peft==0.17.1 trl==0.24.0 datasets==4.0.0 huggingface_hub==0.35.3

Collecting bitsandbytes==0.48.1
  Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting trl==0.24.0
  Downloading trl-0.24.0-py3-none-any.whl.metadata (11 kB)
Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl (60.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.24.0-py3-none-any.whl (423 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.1/423.1 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes, trl
Successfully installed bitsandbytes-0.48.1 trl-0.24.0


In [2]:
import re,math, pandas as pd
from datasets import Dataset,DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig,get_peft_model
from huggingface_hub import snapshot_download,login
import torch
import json
import numpy as np
import random

Importing our data and converting it in the dataframe ,so we can preprocess effectively using apply method in pandas

In [3]:
from transformers import set_seed
seed = 52
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)

train_path = "/content/train.jsonl"
val_path = "/content/dev.jsonl"
test_path = "/content/test.jsonl"

train = pd.read_json(train_path, lines=True)
val = pd.read_json(val_path, lines=True)
test = pd.read_json(test_path, lines=True)
train.sample(5)

Unnamed: 0,input,output
94,"Stone pelting reported in Jamshedpur, lanes ja...","{'event_type': 'riot', 'when': 'tomorrow', 'wh..."
205,"Night curfew कल दोपहर in Gurugram, police brie...","{'event_type': 'curfew', 'when': 'tomorrow', '..."
170,Roadshow / jalsa by Party A at Guwahati tomoro...,"{'event_type': 'rally', 'when': 'tomorrow', 'w..."
89,Section 144 imposed in Jamshedpur Saturday 10a...,"{'event_type': 'curfew', 'when': 'D+4', 'where..."
119,Communal tension flared in Mangaluru आज दोपहर ...,"{'event_type': 'riot', 'when': 'today', 'where..."


Using regex for text proprocessing

In [4]:
import re

def preprocess(text):
  text = re.sub(r'\s+',' ',text)
  text = re.sub(r'[\U0001F600-\U0001F64F|\U0001F300-\U0001F5FF|\U0001F680-\U0001F6FF|\U0001F700-\U0001F77F|\U0001F780-\U0001F7FF]',' ',text)
  text = text.replace(r'\n',' ')
  text = text.replace(r'\t',' ')
  text = re.sub(r'#+','',text)
  text = re.sub(r'http\S+|@\w+','',text)
  text = re.sub(r'\s+', ' ', text)
  text = text.strip()

  return text


Applying the preprocessing

In [5]:
train['input'] = train['input'].apply(preprocess)
val['input'] = val['input'].apply(preprocess)
test['input'] = test['input'].apply(preprocess)

In [6]:
login(token="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")

In [7]:
model_name =  'meta-llama/Llama-3.2-1B'

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map='auto'
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

General Autotokenizer was causing problem trying to import "additional chat template" so used the snapshot_download to only download the required files

In [8]:
local_dir = snapshot_download(
  repo_id=model_name,
  revision='main',
  allow_patterns=[
      'tokenizer*',
      'vocab',
      '*.model',
      'special_tokens_map.json',
      'tokenizer_config.json',
      'tokenizer.json'
  ]

)

tokenizer = AutoTokenizer.from_pretrained(
    local_dir,
    device_map='auto',
    local_files_only=True,
    padding_side="right",
    use_fast=True,

)

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

original/tokenizer.model:   0%|          | 0.00/2.18M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

Converting it into the Dict form so we easily make our chat template

In [9]:
trainds = Dataset.from_pandas(train, preserve_index=False)
valds = Dataset.from_pandas(val, preserve_index=False)
testds = Dataset.from_pandas(test, preserve_index=False)

dataset = DatasetDict({
    "train": trainds,
    "val": valds,
    "test": testds
})
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 210
    })
    val: Dataset({
        features: ['input', 'output'],
        num_rows: 45
    })
    test: Dataset({
        features: ['input', 'output'],
        num_rows: 45
    })
})


Since we are using the base model so it does not have the chattemplate pre bulit so we need to make it for your usecase

In [10]:
tokenizer.chat_template = """{% for message in messages %}{% if message['role'] == 'system' %}<|begin_of_text|><|start_header_id|>system<|end_header_id|>
{{ message['content'] }}<|eot_id|>{% elif message['role'] == 'user' %}<|start_header_id|>user<|end_header_id|>
{{ message['content'] }}<|eot_id|>{% elif message['role'] == 'assistant' %}<|start_header_id|>assistant<|end_header_id|>
{{ message['content'] }}<|eot_id|>{% endif %}{% endfor %}{% if add_generation_prompt %}<|start_header_id|>assistant<|end_header_id|>
{% endif %}"""

special_tokens = {
    'pad_token' : "<|pad|>",
    'additional_special_tokens': [
        '<|begin_of_text|>',
        '<|start_header_id|>',
        '<|end_header_id|>',
        '<|eot_id|>',
    ]
}

num_tokenizers_added = tokenizer.add_special_tokens(special_tokens)
eot_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
model.config.pad_token_id = tokenizer.pad_token_id
model.resize_token_embeddings(len(tokenizer)) # this one for the model so it will not throw error cause we have increased the size of our vocabulary
response_template = "<|start_header_id|>assistant<|end_header_id|>\n"
def chat_template(row):

    completion = json.dumps(row["output"], ensure_ascii=False)
    messages = [
        {"role": "system", "content": "You are an JSON analyst. Always respond with valid JSON only."},
        {"role": "user", "content": row["input"]},
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    return {"prompt": prompt, "completion": completion}

dataset = dataset.map(chat_template)
dataset

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Map:   0%|          | 0/210 [00:00<?, ? examples/s]

Map:   0%|          | 0/45 [00:00<?, ? examples/s]

Map:   0%|          | 0/45 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'prompt', 'completion'],
        num_rows: 210
    })
    val: Dataset({
        features: ['input', 'output', 'prompt', 'completion'],
        num_rows: 45
    })
    test: Dataset({
        features: ['input', 'output', 'prompt', 'completion'],
        num_rows: 45
    })
})

In [11]:
lora_config = LoraConfig(
    r=128,
    lora_alpha=64,
    target_modules = ["q_proj","k_proj","v_proj","o_proj","gate_proj", "up_proj"],
    lora_dropout=0.07,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model,lora_config)
model.print_trainable_parameters()

trainable params: 69,206,016 || all params: 1,305,022,464 || trainable%: 5.3031


In [12]:
sft_config = SFTConfig(
    output_dir="./finetuned",
    num_train_epochs=3,
    max_steps=80,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    eval_steps=10,
    eval_strategy="steps",
    save_steps=20,
    save_strategy='steps',
    learning_rate=3e-5,
    weight_decay=0.01,
    warmup_steps=2,
    fp16=True,
    gradient_accumulation_steps=4,
    logging_steps=10,
    report_to="none",
    optim="adamw_8bit",
    packing=False,
    max_length=512,
    completion_only_loss=True, # this will calculate loss on the completion only that is our output (that is in json)
)

In [13]:
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    processing_class=tokenizer,
    train_dataset=dataset['train'],
    eval_dataset=dataset['val'],
)

Adding EOS to train dataset:   0%|          | 0/210 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/210 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/210 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/45 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/45 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/45 [00:00<?, ? examples/s]

In [14]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 128256}.


Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
10,2.6638,2.012906,3.867433,5597.0,0.641875
20,1.5082,1.030393,3.062369,11182.0,0.811275
30,0.7692,0.541625,2.455241,16678.0,0.91188
40,0.4873,0.387205,2.293491,22318.0,0.928034
50,0.3165,0.324275,2.163117,27862.0,0.938779
60,0.3306,0.280536,2.141678,33236.0,0.946298
70,0.236,0.254719,2.082864,38812.0,0.948046
80,0.2343,0.236215,2.032813,44370.0,0.94832
90,0.199,0.216935,2.03042,49966.0,0.94971
100,0.1742,0.206061,2.000165,55482.0,0.953778




TrainOutput(global_step=120, training_loss=0.6048709452152252, metrics={'train_runtime': 444.8351, 'train_samples_per_second': 1.079, 'train_steps_per_second': 0.27, 'total_flos': 415173021990912.0, 'train_loss': 0.6048709452152252, 'epoch': 2.2666666666666666})

#Mode Evaluation
Creating Prediction.json for the model evaluation

In [15]:
def extract_json(text):
    text = text.strip()
    idx = text.find('{')
    if idx == -1:
        return None

    count = 0
    in_string = False
    esc = False

    for i in range(idx, len(text)):
        char = text[i]
        if char == '"' and not esc:
            in_string = not in_string
        elif char == '\\' and in_string:
            esc = not esc
            continue
        else :
          esc = False

        if not in_string:
            if char == '{':
                count += 1
            elif char == '}':
                count -= 1
                if count == 0:
                    json_str = text[idx:i+1]
                    return json.loads(json_str)

    return None

output_data = []

for idx,row in enumerate(dataset['test']):
    if idx == 12:
      break

    text = preprocess(row["input"])

    messages = [
        {"role": "system", "content": "You are an JSON analyst. Always respond with valid JSON only."},
        {"role": "user", "content": text}
    ]

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    eos_token_id = tokenizer.eos_token_id
    eot_token_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=300,
            temperature=0.1,
            top_p=0.95,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=[eos_token_id, eot_token_id],
            repetition_penalty=1.2,
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=False)
    try :
      json_output = extract_json(response)
    except json.JSONDecodeError as e:
      pass

    output_data.append({
      "input": row["input"],
      "output": json_output
    })
    print(json_output)

with open('predictions.json', 'w') as f:
    json.dump(output_data, f, indent=2)

print("Predictions saved")


{'actors': ['Local Police'], 'confidence': 0.87, 'event_type': 'threat', 'priority': 3, 'rationale': "Post mentions blast threat by Local Police in Srinagar. Time phrase maps to 'D+2', so label is threat", 'when': 'D+2', 'where': 'Srinagar'}
{'actors': ['Student Union'], 'confidence': 0.87, 'event_type': 'protest', 'priority': 3, 'rationale': "Post mentions sit-in dharna by Student Union in neighbourhood of mahalla bye pass road in Sector-5. Time phrase maps to 'today', so label is protest", 'when': 'today', 'where': 'Sector-5'}
{'actors': ['District Admin'], 'confidence': 0.87, 'event_type': 'curfew', 'priority': 2, 'rationale': "Post mentions Section 144 by District Admin in Guwahati. Time phrase maps to 'D+4', so label is curfew", 'when': 'D+4', 'where': 'Guwahati'}
{'actors': ['Local Police'], 'confidence': 0.87, 'event_type': 'threat', 'priority': 3, 'rationale': "Post mentions blast threat by Local Police in Moradabad. Time phrase maps to 'today', so label is threat", 'when': 'to

conding the evaluate function to see the workin of our model for each fields

In [16]:
def evaluate(true_data, predicted_data):
    fields = ['event_type', 'when', 'where']
    mae_fields = ['confidence', 'priority']
    em_count = 0
    total_em = 0
    exact_match = True
    results = {}


    for field in fields:
        correct = 0
        total = 0
        for true, predicted in zip(true_data, predicted_data):
            true_output = true["output"]
            predicted_output = predicted["output"]

            if true_output and predicted_output:
                true_value = str(true_output.get(field)).strip().lower()
                predicted_value = str(predicted_output.get(field)).strip().lower()

                if true_value == predicted_value:
                    correct += 1
                total += 1

        f1 = correct / total if total else 0

        results[field] = {
            "f1": round(f1,2),
            "correct": correct,
            "total": total,
        }
        # Since the f1 is working same as above so we have comment it out , and its an nlp generation prediction so here fp and fn will be same
        # false positive means model predicted positive but actually its negative , so if model predicted today but in reality its d+2 , what it wil be false postive or false negative ???
          #         if true_value == predicted_value:
        #             tp += 1
        #         else :
        #             fp += 1
        #             fn += 1

        # precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        # recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        # f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        # results[field] = {
        #     "f1": f1,
        # }

#   MAE
    for field in mae_fields:
        true_vals = []
        pred_vals = []

        for true, predicted in zip(true_data, predicted_data):
            t_out = true["output"]
            p_out = predicted.get("output")

            if t_out and p_out:
                t_val = t_out.get(field)
                p_val = p_out.get(field)
                true_vals.append(float(t_val) if float(t_val) else 0)
                pred_vals.append(float(p_val) if float(p_val) else 0)


        mae = np.mean(np.abs(np.array(true_vals) - np.array(pred_vals)))
        results[field] = {"mae": float(mae), "count": len(true_vals)}

  #  Exact Match
    for true, predicted in zip(true_data, predicted_data):
        true_output = true["output"]
        predicted_output = predicted["output"]

        if true_output and predicted_output:

          for k in true_output.keys():
            t_val = str(true_output.get(k)).strip()
            p_val = str(predicted_output.get(k)).strip()

            if isinstance(t_val, list):
                if set(t_val) != set(p_val):
                    exact_match = False
                    break
            else:
                if str(t_val).strip() != str(p_val).strip():
                    exact_match = False
                    break
          if exact_match:
            results['exact_match'] = True

    results['exact_match'] = exact_match


    return results


# Comparing Baseline with our Finetuned Model

In [17]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map='auto'
)

def zero_prompting_response(text, tokenizer, model):
    text = preprocess(text)
    messages = [
        {"role": "system", "content": "You are an JSON analyst. Always respond with valid JSON only."},
        {"role": "user", "content": text}
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    eos_token_id = tokenizer.eos_token_id
    eot_token_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=300,
            temperature=0.1,
            top_p=0.95,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=[eos_token_id, eot_token_id],
            repetition_penalty=1.2,
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=False)
    return response

text = "Drivers' Assoc., Housing Rights Group ne rural-6 me sabha announce kiyya, tmrw evening. traffic hoga! #local âš ï¸"

# Baseline
response_baseline = zero_prompting_response(text, tokenizer, base_model)
# Fine tuned model
response_finetuned = zero_prompting_response(text, tokenizer, model)

### Comparing the results

In [18]:
print("Input:", text)
print("Baseline response:", response_baseline)
print('*'*20)
print(' ')
print("Fine-tuned response Without Extracting Json:", response_finetuned)
print(" ")
print("Fine-tuned response With Extracting Json:")
json.dumps(extract_json(response_finetuned))

Input: Drivers' Assoc., Housing Rights Group ne rural-6 me sabha announce kiyya, tmrw evening. traffic hoga! #local âš ï¸
Baseline response: <|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are an JSON analyst. Always respond with valid JSON only.<|eot_id|><|start_header_id|>user<|end_header_id|>
Drivers' Assoc., Housing Rights Group ne rural-6 me sabha announce kiyya, tmrw evening. traffic hoga! local âš ï¸<|eot_id|><|start_header_id|>assistant<|end_header_id|>
The following is a list of the most common errors that we see in our daily work and how to fix them.
1) The user has not provided any data for one or more fields (e.g. no name given). This error will be displayed when you try to save your form without entering some information into certain required input elements such as Name, Address etc.. You can use this field validation rule to prevent users from submitting incomplete forms.
2) A value was entered but it does not match what's expected by th

'{"actors": ["Drivers"], "confidence": 0.87, "event_type": "protest", "priority": 3, "rationale": "Post mentions sit-in dharna by Drivers in rural-6. Time phrase maps to \'tomorrow\', so label is protest", "when": "tomorrow", "where": "rural-6"}'

In [19]:
with open("/content/predictions.json",'r',encoding='utf-8') as f:
  data = json.load(f)
# true_test_slice = dataset['test'][:14]
true_completions = dataset['test']['completion']
true_data = [{'output': json.loads(comp)} for comp in true_completions]
result = evaluate(true_data,data)
result

{'event_type': {'f1': 1.0, 'correct': 12, 'total': 12},
 'when': {'f1': 0.58, 'correct': 7, 'total': 12},
 'where': {'f1': 1.0, 'correct': 12, 'total': 12},
 'confidence': {'mae': 0.04166666666666665, 'count': 12},
 'priority': {'mae': 0.16666666666666666, 'count': 12},
 'exact_match': False}

Saving the weights of our model

In [None]:
lora = "lora_adapters"
model.save_pretrained(lora)
tokenizer.save_pretrained(lora)



('lora_adapters/tokenizer_config.json',
 'lora_adapters/special_tokens_map.json',
 'lora_adapters/chat_template.jinja',
 'lora_adapters/tokenizer.json')

# Model Merging
Importing base model for merging

In [None]:
base_model = AutoModelForCausalLM.from_pretrained(
                                                  model_name,
                                                  device_map= "auto",
                                                  trust_remote_code =True
                                                  )

In [None]:
from peft import PeftModel

base_model =PeftModel.from_pretrained(base_model,lora)
base_model = base_model.merge_and_unload()
merged_model = base_model.eval()

merge_dir = "./merged_model"

merged_model.save_pretrained(merge_dir, safe_serialization=True)
tokenizer = AutoTokenizer.from_pretrained(merge_dir)
tokenizer.save_pretrained(merge_dir)


In [None]:
text = " Nagpur me curfew? kisi ne bola aaj sham. Source?? unverified forwarded msg. details: jagah fix hai par permission nahi mili सूचना मिली par confirm nahi. PS : Rumor"

res = generate_response(text,tokenizer,merged_model)
print(res)