# Complaint NER Fine-Tuning
This notebook fine-tunes a BERT-based model to extract complainants and accused from complaint texts.

In [1]:
!pip install spacy



In [2]:
import pandas as pd
df=pd.read_csv("/content/generated_complaint_dataset_2500.csv")
df.head()

Unnamed: 0,complaint_text,complainant,accused
0,Ms. Priya Sharma has accused Inspector General...,Ms. Priya Sharma,Inspector General
1,Journalists Association of Delhi filed a repor...,Journalists Association of Delhi,Government Official
2,Complaint submitted by Smt. Sunita Gupta alleg...,Smt. Sunita Gupta,Inspector General
3,Mr. Arvind Yadav raised concerns regarding beh...,Mr. Arvind Yadav,Tehsildar
4,Ms. Priya Sharma has accused Government Offici...,Ms. Priya Sharma,Government Official


In [4]:
import re
from typing import List, Tuple
import spacy

# Load a tokenizer
nlp = spacy.load("en_core_web_sm")

def bio_tagging(text: str, complainant: str, accused: str) -> List[Tuple[str, str]]:
    doc = nlp(text)
    tokens = [token.text for token in doc]

    comp_words = complainant.split()
    acc_words = accused.split()

    tags = ["O"] * len(tokens)

    def label_span(name_words, label_prefix):
        name_len = len(name_words)
        for i in range(len(tokens) - name_len + 1):
            if tokens[i:i+name_len] == name_words:
                tags[i] = f"B-{label_prefix}"
                for j in range(1, name_len):
                    tags[i + j] = f"I-{label_prefix}"
                break
    label_span(comp_words, "COMPLAINANT")
    label_span(acc_words, "ACCUSED")

    return list(zip(tokens, tags))

# Example
text = "John Doe filed a complaint against Jane Smith for fraud."
complainant = "John Doe"
accused = "Jane Smith"

annotated = bio_tagging(text, complainant, accused)

for token, tag in annotated:
    print(f"{token}\t{tag}")


John	B-COMPLAINANT
Doe	I-COMPLAINANT
filed	O
a	O
complaint	O
against	O
Jane	B-ACCUSED
Smith	I-ACCUSED
for	O
fraud	O
.	O


In [6]:
df['bio_tagged']=df.apply(lambda row: bio_tagging(row['complaint_text'], row['complainant'], row['accused']), axis=1)

In [12]:
df.iloc[15]['bio_tagged']

[('Suo', 'O'),
 ('-', 'O'),
 ('Motu', 'O'),
 ('cognizance', 'O'),
 ('with', 'O'),
 ('regard', 'O'),
 ('to', 'O'),
 ('Inspector', 'B-ACCUSED'),
 ('General', 'I-ACCUSED'),
 ('allegedly', 'O'),
 ('harassing', 'O'),
 ('Media', 'B-COMPLAINANT'),
 ('Council', 'I-COMPLAINANT'),
 ('of', 'I-COMPLAINANT'),
 ('India', 'I-COMPLAINANT'),
 ('during', 'O'),
 ('coverage', 'O'),
 ('of', 'O'),
 ('an', 'O'),
 ('event', 'O'),
 ('.', 'O')]

In [14]:
all_annotated=[]
for idx, row in df.iterrows():
    annotations = bio_tagging(row['complaint_text'], row['complainant'], row['accused'])
    all_annotated.append(annotations)

In [15]:
with open("ner_data.conll", "w") as f:
    for sentence in all_annotated:
        for token, tag in sentence:
            f.write(f"{token} {tag}\n")
        f.write("\n")

In [None]:
!pip install transformers datasets seqeval evaluate


In [18]:
import random

def split_conll_file(input_path, train_path, val_path, train_frac=0.8, seed=42):
    with open(input_path, "r", encoding="utf-8") as f:
        content = f.read().strip()
        sentences = content.split("\n\n")  # each sentence is separated by double newline

    random.seed(seed)
    random.shuffle(sentences)

    split_idx = int(len(sentences) * train_frac)
    train_sentences = sentences[:split_idx]
    val_sentences = sentences[split_idx:]

    with open(train_path, "w", encoding="utf-8") as f:
        f.write("\n\n".join(train_sentences) + "\n")

    with open(val_path, "w", encoding="utf-8") as f:
        f.write("\n\n".join(val_sentences) + "\n")

    print(f"Done: {len(train_sentences)} train, {len(val_sentences)} val")

# Example usage
split_conll_file(
    input_path="ner_data.conll",
    train_path="train.conll",
    val_path="val.conll"
)


Done: 2000 train, 500 val


In [22]:
from datasets import load_dataset, Dataset


def parse_conll(filepath):
    data = []
    tokens = []
    ner_tags = []
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                token, tag = line.split()
                tokens.append(token)
                ner_tags.append(tag)
            else:
                # Empty line indicates end of a sentence
                data.append({"tokens": tokens, "ner_tags": ner_tags})
                tokens = []
                ner_tags = []
    # Add the last sentence if the file doesn't end with a blank lin
    if tokens:
         data.append({"tokens": tokens, "ner_tags": ner_tags})
    return data

train_data = parse_conll("/content/train.conll")
val_data = parse_conll("/content/val.conll")


train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)

# Combine into a DatasetDict
dataset = {"train": train_dataset, "validation": val_dataset}

print("Dataset loaded successfully:")
print(dataset)
print(dataset["train"][0])

Dataset loaded successfully:
{'train': Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 2000
}), 'validation': Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 500
})}
{'tokens': ['Complaint', 'submitted', 'by', 'Editor', 'of', 'The', 'Daily', 'News', 'alleging', 'harassment', 'by', 'Zonal', 'Commissioner', '.'], 'ner_tags': ['O', 'O', 'O', 'B-COMPLAINANT', 'I-COMPLAINANT', 'I-COMPLAINANT', 'I-COMPLAINANT', 'I-COMPLAINANT', 'O', 'O', 'O', 'B-ACCUSED', 'I-ACCUSED', 'O']}


In [25]:
from datasets import load_dataset, Dataset, DatasetDict
import random
import pandas as pd
import re
from typing import List, Tuple
import spacy


label_list = list(set(label for labels in dataset['train']['ner_tags'] for label in labels))
label_list.sort()
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}
# Convert string labels to integer ids
def encode_labels(example):
    # Ensure 'ner_tags' key exists before accessing it
    if "ner_tags" in example:
        example["labels"] = [label2id[label] for label in example["ner_tags"]]
    return example


dataset = DatasetDict(dataset)

dataset = dataset.map(encode_labels)

print("Dataset after mapping labels:")
print(dataset)
print(dataset["train"][0])

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Dataset after mapping labels:
DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'labels'],
        num_rows: 2000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'labels'],
        num_rows: 500
    })
})
{'tokens': ['Complaint', 'submitted', 'by', 'Editor', 'of', 'The', 'Daily', 'News', 'alleging', 'harassment', 'by', 'Zonal', 'Commissioner', '.'], 'ner_tags': ['O', 'O', 'O', 'B-COMPLAINANT', 'I-COMPLAINANT', 'I-COMPLAINANT', 'I-COMPLAINANT', 'I-COMPLAINANT', 'O', 'O', 'O', 'B-ACCUSED', 'I-ACCUSED', 'O'], 'labels': [4, 4, 4, 1, 3, 3, 3, 3, 4, 4, 4, 0, 2, 4]}


In [27]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")

def tokenize_and_align_labels(example):

    tokenized_inputs = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    word_ids = tokenized_inputs.word_ids()
    previous_word_idx = None
    for word_idx in word_ids:

        if word_idx is None:
            labels.append(-100)

        elif word_idx != previous_word_idx:

            if word_idx < len(example["labels"]):
                 labels.append(example["labels"][word_idx])
            else:
                 labels.append(-100)

        else:
            if word_idx < len(example["labels"]):
                 labels.append(example["labels"][word_idx])
            else:

                 labels.append(-100)
        previous_word_idx = word_idx
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=False)

print("Tokenized datasets after mapping:")
print(tokenized_datasets)
print(tokenized_datasets["train"][0])

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Tokenized datasets after mapping:
DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 500
    })
})
{'tokens': ['Complaint', 'submitted', 'by', 'Editor', 'of', 'The', 'Daily', 'News', 'alleging', 'harassment', 'by', 'Zonal', 'Commissioner', '.'], 'ner_tags': ['O', 'O', 'O', 'B-COMPLAINANT', 'I-COMPLAINANT', 'I-COMPLAINANT', 'I-COMPLAINANT', 'I-COMPLAINANT', 'O', 'O', 'O', 'B-ACCUSED', 'I-ACCUSED', 'O'], 'labels': [-100, 4, 4, 4, 4, 4, 4, 1, 3, 3, 3, 3, 4, 4, 4, 0, 0, 2, 4, -100], 'input_ids': [101, 3291, 8223, 15858, 1204, 7402, 1118, 9057, 1104, 1109, 5732, 3128, 26099, 17514, 1118, 163, 24059, 6520, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1

In [31]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
import evaluate

model = AutoModelForTokenClassification.from_pretrained(
    "dslim/bert-base-NER",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,

    ignore_mismatched_sizes=True
)

metric = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(axis=-1)

    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

training_args = TrainingArguments(
    output_dir="./ner-model",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
)


data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer, # Pass the tokenizer here
    compute_metrics=compute_metrics,
    # Add the data collator here
    data_collator=data_collator
)

trainer.train()

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at dslim/bert-base-NER and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([5]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768]) 

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.000195,1.0,1.0,1.0,1.0
2,0.031700,9.1e-05,1.0,1.0,1.0,1.0
3,0.031700,6.5e-05,1.0,1.0,1.0,1.0
4,0.000400,5.2e-05,1.0,1.0,1.0,1.0
5,0.000400,4.8e-05,1.0,1.0,1.0,1.0


TrainOutput(global_step=1250, training_loss=0.012928000819683075, metrics={'train_runtime': 204.2842, 'train_samples_per_second': 48.951, 'train_steps_per_second': 6.119, 'total_flos': 142692313099440.0, 'train_loss': 0.012928000819683075, 'epoch': 5.0})

In [46]:
text = "Suo-Motu cognizance with re­ gard to alleged attack and killing of Shri Sayed Adil Wahab, Reporter in Bhopal, M.P. "

inputs = tokenizer(text, return_tensors="pt")

inputs = {k: v.to(model.device) for k, v in inputs.items()}

outputs = model(**inputs)
logits = outputs.logits
predictions = logits.argmax(-1)


predicted_labels = [id2label[p.item()] for p in predictions[0].cpu()]

tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].cpu())

b_output=[]
for token, label in zip(tokens, predicted_labels):
    b_output.append((token,label))

In [48]:
def extract_entities_from_bert_output(lines):
    entities = {"COMPLAINANT": [], "ACCUSED": []}
    current_tokens = []
    current_label = None

    def flush():
        if current_tokens and current_label:
            entity = "".join(current_tokens).replace("##", "")
            entities[current_label].append(entity.strip())
            current_tokens.clear()

    for line in lines:
        if not line:
            flush()
            current_label = None
            continue
        token, label = line

        if token in ["[CLS]", "[SEP]"]:
            continue

        if label.startswith("B-"):
            flush()
            current_label = label[2:]
            current_tokens = [token]
        elif label.startswith("I-") and current_label == label[2:]:
            current_tokens.append(token)
        else:
            flush()
            current_label = None
            current_tokens = []

    flush()
    return entities


In [49]:
entities = extract_entities_from_bert_output(b_output)
print("Complainants:", entities["COMPLAINANT"])
print("Accused:", entities["ACCUSED"])

Complainants: ['ShriSayedAdilWahab', 'M', '.P']
Accused: ['Reporter', 'B', 'hop', 'al']
