## Installing the required libraries

In [1]:
!pip install transformers bitsandbytes peft trl datasets huggingface_hub

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting trl
  Downloading trl-0.24.0-py3-none-any.whl.metadata (11 kB)
Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl (60.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.24.0-py3-none-any.whl (423 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.1/423.1 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes, trl
Successfully installed bitsandbytes-0.48.1 trl-0.24.0


In [2]:
import json
import math
import os
import re

import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from huggingface_hub import login, snapshot_download
from peft import LoraConfig, PeftModel, get_peft_model
from torch.utils.data import DataLoader
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
)
from trl import SFTConfig, SFTTrainer

Loading our data into pandas DataFrames so that we can preprocess it conveniently with the `apply` method.

In [3]:
# Colab locations of the three JSON-Lines splits.
train_path = "/content/train.jsonl"
val_path = "/content/dev.jsonl"
test_path = "/content/test.jsonl"

# Read every split the same way: one JSON record per line becomes one DataFrame row.
train, val, test = (
    pd.read_json(split_path, lines=True)
    for split_path in (train_path, val_path, test_path)
)

# Peek at five random training rows.
train.sample(5)

Unnamed: 0,input,output
164,Kisi ne bola Jabalpur me kisi ne bola कल शाम. ...,"{'event_type': 'rumor', 'when': 'tomorrow', 'w..."
163,Nagpur me curfew? kisi ne bola aaj sham. Sourc...,"{'event_type': 'rumor', 'when': 'today', 'wher..."
20,Blast threat call at Mangaluru station कल दोपह...,"{'event_type': 'threat', 'when': 'tomorrow', '..."
47,District Admin: curfeu from day-after evening ...,"{'event_type': 'curfew', 'when': 'D+2', 'where..."
190,Curfew extended in Agra starting day after 10 ...,"{'event_type': 'curfew', 'when': 'D+2', 'where..."


Using regular expressions for text preprocessing

In [4]:
import re

def preprocess(text):
    """Normalise one raw post before it is placed in the prompt.

    Steps, in order: emoji/pictographs become a space, literal ``\\n`` / ``\\t``
    escape sequences become a space, ``#`` characters are dropped (the hashtag
    word is kept), URLs and ``@mentions`` are removed, and finally every run of
    whitespace is collapsed to a single space and the ends are trimmed.

    Args:
        text: The raw post as a ``str``.

    Returns:
        The cleaned, single-line string.
    """
    # The ranges are listed back to back on purpose: inside a character class
    # `|` is a literal pipe, not alternation, so separating the ranges with it
    # would also erase every "|" that appears in the text.
    text = re.sub(
        r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF'
        r'\U0001F700-\U0001F77F\U0001F780-\U0001F7FF]',
        ' ',
        text,
    )
    # Raw strings: these target the two-character sequences backslash+n and
    # backslash+t; real newlines and tabs are handled by the \s+ collapse below.
    text = text.replace(r'\n', ' ')
    text = text.replace(r'\t', ' ')
    text = re.sub(r'#+', '', text)
    text = re.sub(r'http\S+|@\w+', '', text)
    # One pass at the end is enough: none of the substitutions above depends on
    # how many whitespace characters separate two words.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


Applying the preprocessing

In [5]:
# Clean the `input` column of every split with the same routine.
for split_frame in (train, val, test):
    split_frame['input'] = split_frame['input'].apply(preprocess)

In [6]:
# Access tokens must never be committed with the notebook: read the token from the
# HF_TOKEN environment variable instead. When the variable is unset, token=None lets
# `login` fall back to its interactive prompt.
# NOTE(review): the token that was hard-coded here is exposed in the notebook history
# and should be revoked in the Hugging Face account settings.
login(token=os.environ.get("HF_TOKEN"))

In [7]:
# Hub id of the checkpoint that is fine-tuned in this notebook.
model_name = 'meta-llama/Llama-3.2-1B'

# Quantisation settings handed to from_pretrained: load the weights in 8-bit.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='auto',
    quantization_config=bnb_config,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

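The plain `AutoTokenizer.from_pretrained(model_name)` call was failing while trying to fetch the "additional chat template" files, so we use `snapshot_download` to download only the tokenizer files we need and load the tokenizer from that local folder.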

In [8]:
# Download only the tokenizer files of the checkpoint. The three patterns below select
# the same four files the original, partly redundant list did (see the fetch log).
local_dir = snapshot_download(
    repo_id=model_name,
    revision='main',
    allow_patterns=[
        'tokenizer*',               # tokenizer.json, tokenizer_config.json
        '*.model',                  # original/tokenizer.model
        'special_tokens_map.json',
    ],
)

# Load strictly from the downloaded folder (no further Hub lookups).
# `device_map` is intentionally not passed: it is a model-loading option and a
# tokenizer has no weights to place on a device.
tokenizer = AutoTokenizer.from_pretrained(
    local_dir,
    local_files_only=True,
    padding_side="left",
    use_fast=True,
)

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

original/tokenizer.model:   0%|          | 0.00/2.18M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

Converting the DataFrames into a `DatasetDict` so that we can easily apply our chat template to every split

In [9]:
# Wrap each DataFrame as a Dataset; preserve_index=False keeps the pandas index
# from being carried over.
trainds, valds, testds = (
    Dataset.from_pandas(frame, preserve_index=False)
    for frame in (train, val, test)
)

# Group the three splits under the names used throughout the rest of the notebook.
dataset = DatasetDict({"train": trainds, "val": valds, "test": testds})
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 210
    })
    val: Dataset({
        features: ['input', 'output'],
        num_rows: 45
    })
    test: Dataset({
        features: ['input', 'output'],
        num_rows: 45
    })
})


Since we are using the base model, it does not ship with a pre-built chat template, so we need to define one for our use case

In [10]:
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Jinja chat template. Each message is rendered as
#   <|start_header_id|>ROLE<|end_header_id|> newline CONTENT <|eot_id|>
# `<|begin_of_text|>` is emitted only in front of a system message, and when
# `add_generation_prompt` is set an open assistant header is appended so the
# model continues with the assistant turn.
tokenizer.chat_template = """{% for message in messages %}{% if message['role'] == 'system' %}<|begin_of_text|><|start_header_id|>system<|end_header_id|>
{{ message['content'] }}<|eot_id|>{% elif message['role'] == 'user' %}<|start_header_id|>user<|end_header_id|>
{{ message['content'] }}<|eot_id|>{% elif message['role'] == 'assistant' %}<|start_header_id|>assistant<|end_header_id|>
{{ message['content'] }}<|eot_id|>{% endif %}{% endfor %}{% if add_generation_prompt %}<|start_header_id|>assistant<|end_header_id|>
{% endif %}"""

# The marker strings used by the template, registered as special tokens.
special_tokens = {
    'additional_special_tokens': [
        '<|begin_of_text|>',
        '<|start_header_id|>',
        '<|end_header_id|>',
        '<|eot_id|>',
    ]
}

# Return value of add_special_tokens (presumably the number of tokens that were
# newly added); it is not referenced again in the visible cells.
num_tokenizers_added = tokenizer.add_special_tokens(special_tokens)
# Id of the end-of-turn marker.
eot_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
model.resize_token_embeddings(len(tokenizer)) # keep the embedding matrix the same size as the tokenizer vocabulary so that any added token id stays in range

def chat_template(text):
    """Render one dataset row as a complete system/user/assistant chat string.

    Args:
        text: A row with an ``input`` entry (the post) and an ``output`` entry
            (the gold label, serialised to JSON here).

    Returns:
        ``{"text": rendered_chat}``, ready to be merged into the row by ``map``.
    """
    # Serialise the gold label first; ensure_ascii=False keeps non-ASCII
    # characters readable instead of \u-escaping them.
    gold_json = json.dumps(text["output"], ensure_ascii=False)

    system_turn = {"role": "system", "content": "You are an JSON analyst. Always respond with valid JSON only."}
    user_turn = {"role": "user", "content": text["input"]}
    assistant_turn = {"role": "assistant", "content": gold_json}

    # The answer is already part of the conversation, so no generation prompt
    # is appended; the template returns a plain string (tokenize=False).
    rendered = tokenizer.apply_chat_template(
        [system_turn, user_turn, assistant_turn],
        tokenize=False,
        add_generation_prompt=False,
    )
    return {"text": rendered}

# Add the rendered `text` column to every split, then display the resulting DatasetDict.
dataset = dataset.map(function=chat_template)
dataset

Map:   0%|          | 0/210 [00:00<?, ? examples/s]

Map:   0%|          | 0/45 [00:00<?, ? examples/s]

Map:   0%|          | 0/45 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'text'],
        num_rows: 210
    })
    val: Dataset({
        features: ['input', 'output', 'text'],
        num_rows: 45
    })
    test: Dataset({
        features: ['input', 'output', 'text'],
        num_rows: 45
    })
})

Now we tokenize our dataset.

We remove the original columns here because training only needs the new `input_ids` and `attention_mask` columns produced by the tokenizer.


In [11]:
def tokenize_func(batch, max_length=512):
    """Tokenize the rendered chat strings of one batch.

    Args:
        batch: A batch from ``Dataset.map(..., batched=True)`` with a ``text``
            column holding the rendered chats.
        max_length: Maximum number of tokens kept per example.

    Returns:
        The tokenizer output for the batch (unpadded; padding is left to the
        data collator).
    """
    # truncation must be enabled for max_length to take effect; with
    # truncation=False the limit is silently ignored.
    # NOTE(review): the rendered text already starts with <|begin_of_text|>; if the
    # tokenizer also prepends its own BOS (the default add_special_tokens=True
    # presumably does), every example carries two. Left unchanged here so that
    # training stays consistent with the generation cells, which tokenize the
    # prompt the same way -- confirm and fix both together.
    return tokenizer(
        batch['text'],
        truncation=True,
        max_length=max_length,
        padding=False,
    )

# Every pre-tokenization column is dropped so that only the tokenizer outputs remain.
remove_cols = [*dataset["train"].column_names]
tokenize_dataset = dataset.map(
    tokenize_func,
    batched=True,
    remove_columns=remove_cols,
)


Map:   0%|          | 0/210 [00:00<?, ? examples/s]

Map:   0%|          | 0/45 [00:00<?, ? examples/s]

Map:   0%|          | 0/45 [00:00<?, ? examples/s]

In [12]:
# Collator for causal-LM training (masked-language-modelling is switched off).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# NOTE(review): no later visible cell references train_loader; the trainer below
# is given the dataset and the collator directly.
train_loader = DataLoader(
    tokenize_dataset['train'],
    batch_size=2,
    shuffle=True,
    collate_fn=data_collator,
)

In [13]:
# LoRA adapter settings: rank 128, alpha 64, applied to the attention projections
# and to two of the MLP projections (gate and up).
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=128,
    lora_alpha=64,
    lora_dropout=0.07,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj"],
)

# NOTE(review): `model` is rebound to its own wrapped version, so re-running this
# cell without reloading the model would wrap an already-wrapped model.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 69,206,016 || all params: 1,305,020,416 || trainable%: 5.3031


In [14]:
sft_config = SFTConfig(
    output_dir="./finetuned",
    # Training length is expressed in optimizer steps only. A positive max_steps takes
    # precedence over num_train_epochs, so an epoch count next to it would be ignored
    # (the run below stops at step 120, about 2.27 epochs).
    max_steps=120,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    # Effective batch size per optimizer step: 1 x 4 accumulated micro-batches.
    gradient_accumulation_steps=4,
    # Evaluate every 10 steps, checkpoint every 20.
    eval_strategy="steps",
    eval_steps=10,
    save_strategy='steps',
    save_steps=20,
    learning_rate=3e-5,
    weight_decay=0.01,
    warmup_steps=2,
    fp16=True,
    logging_steps=10,
    report_to="none",
    optim="adamw_8bit",
)

In [15]:
# The splits are already tokenized, so the trainer receives token ids plus the
# collator that batches them.
trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    args=sft_config,
    data_collator=data_collator,
    train_dataset=tokenize_dataset['train'],
    eval_dataset=tokenize_dataset['val'],
)

trainer.train()

Truncating train dataset:   0%|          | 0/210 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/45 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 128001}.


Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
10,4.8503,4.265243,4.014737,5557.0,0.412368
20,3.8174,3.244546,3.325428,11102.0,0.51935
30,2.8477,2.437704,2.376123,16558.0,0.656521
40,2.247,2.021593,1.892096,22158.0,0.722629
50,1.8932,1.845771,1.757955,27662.0,0.738683
60,1.8155,1.732336,1.651762,32998.0,0.755949
70,1.6148,1.65407,1.566248,38534.0,0.760964
80,1.561,1.593876,1.518115,44052.0,0.767252
90,1.5511,1.550537,1.502471,49608.0,0.770972
100,1.4493,1.525594,1.496354,55084.0,0.775555


TrainOutput(global_step=120, training_loss=2.2029616117477415, metrics={'train_runtime': 320.6775, 'train_samples_per_second': 1.497, 'train_steps_per_second': 0.374, 'total_flos': 412196064313344.0, 'train_loss': 2.2029616117477415, 'epoch': 2.2666666666666666})

# Model Evaluation
Creating predictions.json for the model evaluation

In [16]:
def extract_json(text):
    """Return the first JSON object embedded in ``text``, or ``None``.

    The text is scanned for ``{`` characters from left to right. At each one the
    standard-library decoder tries to parse a JSON value starting there; text
    after the object is ignored. A candidate that is not valid JSON (single
    quotes, truncated output, stray braces) is skipped and the search continues
    at the next ``{`` instead of raising.

    Args:
        text: Arbitrary text, e.g. a decoded model response. ``None`` or an
            empty string is accepted.

    Returns:
        The parsed object (a ``dict``), or ``None`` when no parsable JSON
        object is found.
    """
    if not text:
        return None

    decoder = json.JSONDecoder()
    start = text.find('{')
    while start != -1:
        try:
            # raw_decode parses exactly one JSON value beginning at `start` and
            # handles nesting, strings and escapes itself.
            parsed, _end = decoder.raw_decode(text, start)
            return parsed
        except json.JSONDecodeError:
            start = text.find('{', start + 1)
    return None

# Number of test rows that are sent through generation.
MAX_PREDICTIONS = 14

# Stop ids do not change between rows, so look them up once.
eos_token_id = tokenizer.eos_token_id
eot_token_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")

output_data = []

for idx, row in enumerate(dataset['test']):
    if idx == MAX_PREDICTIONS:
        break

    text = preprocess(row["input"])

    messages = [
        {"role": "system", "content": "You are an JSON analyst. Always respond with valid JSON only."},
        {"role": "user", "content": text}
    ]

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=300,
            temperature=0.1,
            top_p=0.95,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=[eos_token_id, eot_token_id],
            repetition_penalty=1.2,
        )

    # The generated sequence starts with the prompt tokens. Decode only what follows
    # them, so a "{" in the user's post can never be parsed as the model's answer.
    response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=False)
    json_output = extract_json(response)

    output_data.append({
        "input": row["input"],
        "output": json_output
    })
    print(json_output)

with open('predictions.json', 'w') as f:
    json.dump(output_data, f, indent=2)

print("Predictions saved")


{'actors': ['Local Police'], 'confidence': 0.87, 'event_type': 'threat', 'priority': 3, 'rationale': "Post mentions email threat by Local Police in Srinagar. Time phrase maps to 'D+2', so label is threat", 'when': 'D+2', 'where': 'Srinagar'}
{'actors': ['District Admin'], 'confidence': 0.87, 'event_type': 'protest', 'priority': 3, 'rationale': "Post mentions marcha by District Admin in Sector-5. Time phrase maps to 'tomorrow', so label is protest", 'when': 'tomorrow', 'where': 'Sector-5'}
{'actors': ['Local Police'], 'confidence': 0.87, 'event_type': 'curfew', 'priority': 2, 'rationale': "Post mentions curfew by Local Police in Guwahati. Time phrase maps to 'today', so label is curfew", 'when': 'tomorrow', 'where': 'Guwahati'}
{'actors': ['Local Police'], 'confidence': 0.87, 'event_type': 'threat', 'priority': 3, 'rationale': "Post mentions blast by Local Police in Moradabad. Time phrase maps to 'today', so label is threat", 'when': 'tomorrow', 'where': 'Moradabad'}
{'actors': ['Local 

Coding the evaluate function to see how well our model performs on each field

In [25]:
def evaluate(true_data, predicted_data):
    """Score predictions against gold labels, field by field.

    Rows are paired positionally with ``zip`` (the longer input is cut to the
    length of the shorter one). A pair is scored only when both sides have a
    non-empty ``output``; pairs without a parsed prediction are therefore left
    out of every total rather than counted as wrong.

    Args:
        true_data: Iterable of rows whose ``output`` is the gold label dict.
        predicted_data: Iterable of rows whose ``output`` is the predicted dict
            (or ``None`` when no JSON could be extracted).

    Returns:
        A dict with
        * ``event_type`` / ``when`` / ``where``: ``{"accuracy", "correct", "total"}``,
          compared as stripped, lower-cased strings;
        * ``confidence`` / ``priority``: ``{"mae", "count"}`` over the pairs where
          both values are numeric (``mae`` is NaN when there are none);
        * ``overall``: correct / total pooled over the three categorical fields
          (0 when nothing was scored).
    """
    fields = ['event_type', 'when', 'where']
    mae_fields = ['confidence', 'priority']

    # Collect the scorable pairs once; both metric families use the same filter.
    pairs = []
    for true, predicted in zip(true_data, predicted_data):
        true_output = true.get("output")
        predicted_output = predicted.get("output")
        if true_output and predicted_output:
            pairs.append((true_output, predicted_output))

    results = {}

    for field in fields:
        correct = sum(
            str(t_out.get(field)).strip().lower() == str(p_out.get(field)).strip().lower()
            for t_out, p_out in pairs
        )
        total = len(pairs)
        results[field] = {
            "accuracy": correct / total if total else 0,
            "correct": correct,
            "total": total,
        }

    for field in mae_fields:
        abs_errors = []
        for t_out, p_out in pairs:
            try:
                abs_errors.append(abs(float(t_out.get(field)) - float(p_out.get(field))))
            except (TypeError, ValueError):
                # Missing or non-numeric value on either side: skip this pair.
                continue
        mae = float(np.mean(abs_errors)) if abs_errors else float("nan")
        results[field] = {"mae": mae, "count": len(abs_errors)}

    # Pool over the categorical fields only -- the MAE entries have no
    # 'correct' / 'total' keys.
    total_correct = sum(results[field]['correct'] for field in fields)
    total_scored = sum(results[field]['total'] for field in fields)
    results['overall'] = total_correct / total_scored if total_scored else 0

    return results


We load predictions.json and evaluate it against the test split of our dataset

In [30]:
# Read back the predictions written by the generation loop.
with open("/content/predictions.json", mode='r', encoding='utf-8') as f:
    data = json.loads(f.read())

# Rows are paired with the test split by position.
result = evaluate(dataset['test'], data)
result

{'event_type': {'accuracy': 0.9285714285714286, 'correct': 13, 'total': 14},
 'when': {'accuracy': 0.14285714285714285, 'correct': 2, 'total': 14},
 'where': {'accuracy': 1.0, 'correct': 14, 'total': 14},
 'confidence': {'mae': 0.12785714285714286, 'count': 14},
 'priority': {'mae': 0.5, 'count': 14},
 'overall': 0.6904761904761905}

Here we can test out the working of our model

In [31]:
def generate_response(text, tokenizer, model, max_new_tokens=300):
    """Generate the model's JSON answer for one raw post.

    Args:
        text: The raw post; it is cleaned with ``preprocess`` first.
        tokenizer: Tokenizer carrying the chat template used for training.
        model: The causal LM to sample from.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        Whatever ``extract_json`` returns for the generated continuation
        (the parsed JSON object, or ``None`` if none is found).
    """
    text = preprocess(text)
    messages = [
        {"role": "system", "content": "You are an JSON analyst. Always respond with valid JSON only."},
        {"role": "user", "content": text}
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[-1]

    # Stop at either the tokenizer's EOS token or the end-of-turn marker.
    stop_ids = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.1,
            top_p=0.95,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=stop_ids,
            repetition_penalty=1.2,
        )

    # The generated sequence starts with the prompt tokens. Decode only what follows
    # them, so a "{" in the user's post can never be parsed as the model's answer.
    response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=False)
    return extract_json(response)

text = "Drivers' Assoc., Housing Rights Group ne rural-6 me sabha announce kiyya, tmrw evening. traffic hoga! #local âš ï¸"
# Run the fine-tuned model on the sample post and pretty-print the parsed answer.
response = generate_response(text, tokenizer, model)
banner = '*' * 6
print(banner, " Model Response : ", banner)
print(json.dumps(response, indent=2))

******  Model Response :  ******
{
  "actors": [
    "Housing Rights Group"
  ],
  "confidence": 0.87,
  "event_type": "rally",
  "priority": 3,
  "rationale": "Post mentions roadshow by HRG in Rural-6. Time phrase maps to 'tomorrow', so label is rally",
  "when": "tomorrow",
  "where": "Rural-6"
}


Saving the weights of our model

In [20]:
# Folder that receives both the adapter weights and the tokenizer files
# (including the chat template).
lora = "lora_adapters"
model.save_pretrained(save_directory=lora)
tokenizer.save_pretrained(save_directory=lora)

('lora_adapters/tokenizer_config.json',
 'lora_adapters/special_tokens_map.json',
 'lora_adapters/chat_template.jinja',
 'lora_adapters/tokenizer.json')

# Model Merging
Importing base model for merging

In [21]:
# Reload the base checkpoint without a quantization config so the adapters can be
# merged into regular weights. `trust_remote_code` is not passed: it would allow
# code shipped in a Hub repository to run locally, and nothing in this notebook
# needs it (the same checkpoint was loaded above without it).
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
)

In [None]:
# Attach the saved LoRA adapters to the reloaded base model, fold them into its
# weights and switch the result to eval mode.
peft_model = PeftModel.from_pretrained(base_model, lora)
merged_model = peft_model.merge_and_unload().eval()

merge_dir = "./merged_model"
merged_model.save_pretrained(merge_dir, safe_serialization=True)

# The tokenizer (with its chat template) was saved next to the adapters in `lora`.
# `merge_dir` holds only model files at this point, so the tokenizer cannot be
# loaded from there; load it from `lora` and copy it next to the merged weights.
tokenizer = AutoTokenizer.from_pretrained(lora)
tokenizer.save_pretrained(merge_dir)


In [None]:
# Smoke test: the merged model should answer a post similar to the training data.
text = " Nagpur me curfew? kisi ne bola aaj sham. Source?? unverified forwarded msg. details: jagah fix hai par permission nahi mili सूचना मिली par confirm nahi. PS : Rumor"

res = generate_response(text=text, tokenizer=tokenizer, model=merged_model)
print(res)