In [1]:
!pip install evaluate seqeval wikipedia





In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# ======================================================
# SECTION 1 ‚Äî IMPORTS & SETUP (CORRECT COLAB VERSION)
# ======================================================

import os
import pandas as pd
import numpy as np
import ast
import json
import random
import re
from pathlib import Path
from collections import defaultdict
import warnings

warnings.filterwarnings('ignore')

# ============================================
# COLAB: MOUNT GOOGLE DRIVE
# ============================================
from google.colab import drive
drive.mount('/content/drive')

# ============================================
# Core ML libraries
# ============================================
from datasets import Dataset, DatasetDict, concatenate_datasets
from transformers import (
    XLMRobertaTokenizerFast,
    XLMRobertaForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
import evaluate
import nltk
import wikipedia

# ============================================
# NLTK downloads
# ============================================
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

# ============================================
# Reproducibility
# ============================================
random.seed(42)
np.random.seed(42)

# ============================================
# Set ROOT directory (YOUR DATA IS HERE)
# ============================================
ROOT = Path("/content/drive/MyDrive/NER_RETRAIN")
ROOT.mkdir(parents=True, exist_ok=True)

print("‚úÖ All imports loaded successfully!")
print(f"üìÅ ROOT directory: {ROOT}")
print("üéØ Environment ready for multilingual NER training!")

# ============================================
# DEVICE CHECK
# ============================================
import torch

if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
    print("\nüöÄ GPU detected:", torch.cuda.get_device_name(0))
else:
    DEVICE = torch.device("cpu")
    print("\n‚ö† Running on CPU (slower but fine for testing)")

print("Using device:", DEVICE)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
‚úÖ All imports loaded successfully!
üìÅ ROOT directory: /content/drive/MyDrive/NER_RETRAIN
üéØ Environment ready for multilingual NER training!

üöÄ GPU detected: Tesla T4
Using device: cuda


In [4]:
# ======================================================
# SECTION 2 ‚Äî LOAD BASE DATASETS
# ======================================================

print("üìä Loading base datasets...")

# Set dataset paths inside Google Drive
HINDI_NER_PATH = "/content/drive/MyDrive/NER_RETRAIN/ner.csv"
INDIA_LOC_PATH = "/content/drive/MyDrive/NER_RETRAIN/India District (1).xlsx"

# Load Hindi NER dataset
hindi_df = pd.read_csv(HINDI_NER_PATH)
print(f"‚úÖ Loaded Hindi NER: {len(hindi_df)} examples")
print(f"üìã Hindi columns: {list(hindi_df.columns)}")

# Load India locations dataset
india_df = pd.read_excel(INDIA_LOC_PATH)
print(f"‚úÖ Loaded India locations: {len(india_df)} rows")
print(f"üìã India columns: {list(india_df.columns)}")

# Display sample data
print("\nüìã Hindi NER Sample:")
hindi_df.head(3)


üìä Loading base datasets...
‚úÖ Loaded Hindi NER: 47959 examples
üìã Hindi columns: ['Sentence #', 'Sentence', 'POS', 'Tag']
‚úÖ Loaded India locations: 719 rows
üìã India columns: ['State Code', 'State Name', 'District Code', 'District Name']

üìã Hindi NER Sample:


Unnamed: 0,Sentence #,Sentence,POS,Tag
0,Sentence: 1,Thousands of demonstrators have marched throug...,"['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP'...","['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', '..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"['NNS', 'IN', 'NNS', 'VBN', 'IN', 'DT', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
2,Sentence: 3,They marched from the Houses of Parliament to ...,"['PRP', 'VBD', 'IN', 'DT', 'NNS', 'IN', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


In [5]:
# ======================================================
# SECTION 3 ‚Äî PREPROCESS HINDI NER DATA
# ======================================================

import ast

print("üîß Preprocessing Hindi NER dataset...")

# Convert string list columns ‚Üí Python lists
def safe_parse_list(x):
    try:
        return ast.literal_eval(x)
    except:
        return []

hindi_df["POS"] = hindi_df["POS"].apply(safe_parse_list)
hindi_df["Tag"] = hindi_df["Tag"].apply(safe_parse_list)

# Basic cleaning of sentence
hindi_df["Sentence"] = hindi_df["Sentence"].apply(
    lambda x: x.replace("Sentence: ", "").strip()
)

print("üîç Sample after conversion:")
print(hindi_df.head(2))

# Check token alignment
bad_rows = hindi_df[
    hindi_df.apply(lambda row: len(row["POS"]) != len(row["Tag"]), axis=1)
]

print(f"\n‚ö† Misaligned rows: {len(bad_rows)}")
print("If > 0, we will fix in next step.")


üîß Preprocessing Hindi NER dataset...
üîç Sample after conversion:
    Sentence #                                           Sentence  \
0  Sentence: 1  Thousands of demonstrators have marched throug...   
1  Sentence: 2  Families of soldiers killed in the conflict jo...   

                                                 POS  \
0  [NNS, IN, NNS, VBP, VBN, IN, NNP, TO, VB, DT, ...   
1  [NNS, IN, NNS, VBN, IN, DT, NN, VBD, DT, NNS, ...   

                                                 Tag  
0  [O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo...  
1  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...  

‚ö† Misaligned rows: 0
If > 0, we will fix in next step.


In [6]:
# ======================================================
# SECTION 4 ‚Äî LOAD CoNLL ENGLISH DATASETS (FIXED FOR COLAB)
# ======================================================

print("\nüì• Loading CoNLL datasets...")

def load_conll_file(path):
    sentences, ner_tags = [], []
    tokens, tags = [], []

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()

            if not line:
                # Sentence boundary
                if tokens:
                    sentences.append(tokens)
                    ner_tags.append(tags)
                    tokens, tags = [], []
                continue

            parts = line.split()
            tokens.append(parts[0])
            tags.append(parts[-1])

    return sentences, ner_tags

# CORRECT PATH FOR COLAB
conll_path = "/content/drive/MyDrive/NER_RETRAIN/conll2003"

train_sentences, train_tags = load_conll_file(f"/content/drive/MyDrive/NER_RETRAIN/eng.train")
valid_sentences, valid_tags = load_conll_file(f"/content/drive/MyDrive/NER_RETRAIN/eng.testa")
test_sentences, test_tags = load_conll_file(f"/content/drive/MyDrive/NER_RETRAIN/eng.testb")

print(f"‚úÖ CoNLL Train: {len(train_sentences)} sentences")
print(f"‚úÖ CoNLL Valid: {len(valid_sentences)} sentences")
print(f"‚úÖ CoNLL Test: {len(test_sentences)} sentences")

# Show a sample
print("\nüìã Sample CoNLL sentence:")
print(train_sentences[100])
print(train_tags[100])



üì• Loading CoNLL datasets...
‚úÖ CoNLL Train: 14987 sentences
‚úÖ CoNLL Valid: 3466 sentences
‚úÖ CoNLL Test: 3684 sentences

üìã Sample CoNLL sentence:
['Port', 'conditions', 'from', 'Lloyds', 'Shipping', 'Intelligence', 'Service', '--']
['O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O']


In [7]:
# ======================================================
# SECTION 5 ‚Äî LOAD SYNTHETIC + COMPREHENSIVE DATASET
# ======================================================

print("üì• Loading extended synthetic dataset...")

COMPREHENSIVE_PATH = "/content/drive/MyDrive/NER_RETRAIN/comprehensive_indian_ner_dataset.csv"

# Try loading comprehensive dataset
try:
    comprehensive_df = pd.read_csv(COMPREHENSIVE_PATH)
    print(f"‚úÖ Loaded comprehensive dataset: {len(comprehensive_df)} examples")

except Exception as e:
    comprehensive_df = None
    print(f"‚ö†Ô∏è Could not load comprehensive dataset: {e}")

# Extract all states and districts anyway
indian_states = set(india_df["State Name"].dropna().unique())
indian_districts = set(india_df["District Name"].dropna().unique())
all_indian_locations = list(indian_states.union(indian_districts))

print(f"üìç States: {len(indian_states)}")
print(f"üìç Districts: {len(indian_districts)}")
print(f"üéØ Total unique Indian locations: {len(all_indian_locations)}")

# Clean/normalize tags (B-LOC / I-LOC / O)
def normalize_tag(tag):
    tag = str(tag).upper() if not pd.isna(tag) else "O"

    if any(k in tag for k in ["LOC", "GEO", "GPE"]):
        if tag.startswith("B-") or tag.startswith("B_"):
            return "B-LOC"
        elif tag.startswith("I-") or tag.startswith("I_"):
            return "I-LOC"
    return "O"

print("‚úÖ Tag normalization function ready!")


üì• Loading extended synthetic dataset...
‚úÖ Loaded comprehensive dataset: 3992 examples
üìç States: 36
üìç Districts: 712
üéØ Total unique Indian locations: 748
‚úÖ Tag normalization function ready!


In [8]:
# Process Hindi dataset
print("üîÑ Processing Hindi NER dataset...")

hindi_sentences = []
hindi_tags_normalized = []

for idx, row in hindi_df.iterrows():
    try:
        # Parse sentence and tags
        tokens = row['Sentence'].split()
        tags = ast.literal_eval(row['Tag']) if isinstance(row['Tag'], str) else row['Tag']

        # Ensure equal length
        if len(tokens) == len(tags):
            # Normalize tags
            normalized_tags = [normalize_tag(tag) for tag in tags]
            hindi_sentences.append(tokens)
            hindi_tags_normalized.append(normalized_tags)
    except Exception as e:
        continue  # Skip problematic rows

print(f"‚úÖ Processed {len(hindi_sentences)} Hindi sentences")

# Process CoNLL datasets
print("üîÑ Processing CoNLL datasets...")

conll_sentences = train_sentences + valid_sentences + test_sentences
conll_tags_raw = train_tags + valid_tags + test_tags

conll_tags_normalized = []
for tag_sequence in conll_tags_raw:
    normalized_tags = [normalize_tag(tag) for tag in tag_sequence]
    conll_tags_normalized.append(normalized_tags)

print(f"‚úÖ Processed {len(conll_sentences)} CoNLL sentences")

# Show tag distribution
all_tags_flat = []
for seq in hindi_tags_normalized + conll_tags_normalized:
    all_tags_flat.extend(seq)

tag_counts = pd.Series(all_tags_flat).value_counts()
print(f"\nüìä Tag distribution in processed data:")
print(tag_counts)
print(f"üìà Location coverage: {(tag_counts['B-LOC'] + tag_counts['I-LOC']) / len(all_tags_flat) * 100:.1f}%")

üîÑ Processing Hindi NER dataset...
‚úÖ Processed 47955 Hindi sentences
üîÑ Processing CoNLL datasets...
‚úÖ Processed 22137 CoNLL sentences

üìä Tag distribution in processed data:
O        1277795
B-LOC      64158
I-LOC       9283
Name: count, dtype: int64
üìà Location coverage: 5.4%


In [9]:
# ======================================================
# ADVANCED SYNTHETIC DATA GENERATION + DATASET PREP
# (Run this in your Colab after the preprocessing cells)
# ======================================================

import random
from pathlib import Path
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import XLMRobertaTokenizerFast
import json

ROOT = Path("/content/drive/MyDrive/NER_RETRAIN")
ROOT.mkdir(parents=True, exist_ok=True)

print("üöÄ Starting ADVANCED synthetic generation and dataset prep...")

# --- 1) Build list of unique states & districts from india_df ---
india_df_clean = india_df.copy()
india_df_clean["State Name"] = india_df_clean["State Name"].astype(str).str.strip()
india_df_clean["District Name"] = india_df_clean["District Name"].astype(str).str.strip()

# create pairs (district, state)
pairs = india_df_clean[["District Name", "State Name"]].dropna().drop_duplicates().values.tolist()
print(f"üìç Unique district-state pairs: {len(pairs)}")

# --- 2) Templates (5 variations) ---
templates = [
    "{district} is located in {state}.",
    "People living in {district} district of {state} often say ...",
    "{state} includes {district} as one of its districts.",
    "{district}, {state}, is known for its culture and history.",
    "Travelers often visit {district} in {state} during holidays."
]

# --- 3) Generate synthetic sentences (5 variations per pair) ---
synthetic_sentences = []
synthetic_tags = []

for district, state in pairs:
    district = str(district).strip()
    state = str(state).strip()
    for t in templates:
        sent = t.format(district=district, state=state)
        tokens = sent.split()
        tags = []
        # create tagging: mark whole district as B-LOC I-LOC... and state as B-LOC I-LOC...
        # We detect token-by-token equality to district/state (exact token match)
        # For multi-token names (e.g., "North Andaman"), we detect contiguous token spans.
        # Simple approach: find span of district tokens and state tokens in sentence tokens
        def find_span(name_tokens, tokens):
            n = len(name_tokens)
            for i in range(len(tokens) - n + 1):
                if tokens[i:i+n] == name_tokens:
                    return i, i+n  # start inclusive, end exclusive
            return None

        toks = tokens
        district_tokens = district.split()
        state_tokens = state.split()

        # default O
        tags = ["O"] * len(toks)

        # mark district
        span = find_span(district_tokens, toks)
        if span:
            s, e = span
            tags[s] = "B-LOC"
            for idx in range(s+1, e):
                tags[idx] = "I-LOC"

        # mark state (don't overwrite if overlapping‚Äîwe keep district marking precedence)
        span2 = find_span(state_tokens, toks)
        if span2:
            s, e = span2
            # only write if those positions are still O
            if tags[s] == "O":
                tags[s] = "B-LOC"
            else:
                # if already B-LOC (rare overlap), keep it
                tags[s] = tags[s]
            for idx in range(s+1, e):
                if tags[idx] == "O":
                    tags[idx] = "I-LOC"

        synthetic_sentences.append(toks)
        synthetic_tags.append(tags)

print(f"‚úÖ Synthetic sentences generated: {len(synthetic_sentences)} (expected {len(pairs)*len(templates)})")

# --- 4) Merge data
# Note: expects variables from your previous preprocessing:
# - hindi_sentences: list[list[str]]
# - hindi_tags_normalized: list[list[str]]
# - train_sentences, valid_sentences, test_sentences: lists of token lists
# - train_tags, valid_tags, test_tags: lists of tag lists (normalized)
# The cell earlier produced conll_tags_normalized; if you have different names adjust them.

# If conll tags variables are named differently use this fallback:
try:
    conll_train_tags = train_tags
    conll_valid_tags = valid_tags
    conll_test_tags = test_tags
except NameError:
    # if raw conll tags were normalized into conll_tags_normalized and combined, split:
    try:
        raise
    except:
        pass

# Build training pool:
train_tokens = hindi_sentences + train_sentences + synthetic_sentences
train_labels = hindi_tags_normalized + train_tags + synthetic_tags

# Validation and test from CoNLL (keep original splits)
valid_tokens = valid_sentences
valid_labels = valid_tags
test_tokens = test_sentences
test_labels = test_tags

print(f"üìö Train sentences: {len(train_tokens)}")
print(f"üìö Valid sentences: {len(valid_tokens)}")
print(f"üìö Test sentences: {len(test_tokens)}")

# --- 5) Build tag set and mappings (ensure consistent ordering) ---
unique_tags = sorted({t for seq in (train_labels + valid_labels + test_labels) for t in seq})
tag2id = {t: i for i, t in enumerate(unique_tags)}
id2tag = {i: t for t, i in tag2id.items()}

print("üîñ Tags:", unique_tags)
print("üî¢ tag2id:", tag2id)

# --- 6) Tokenizer & alignment for XLM-R ---
tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")

def tokenize_and_align_labels(examples_tokens, examples_labels):
    tokenized_inputs = tokenizer(
        examples_tokens,
        is_split_into_words=True,
        truncation=True,
        padding=False  # no padding here; HF Trainer will handle batching
    )
    all_labels = []
    for i, labels in enumerate(examples_labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Special token
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # Start of a new word
                label_ids.append(tag2id.get(labels[word_idx], tag2id.get("O")))
            else:
                # Same word, set to I-... if original tag was B-LOC, change to I-LOC for subword token
                orig_tag = labels[word_idx]
                if orig_tag == "B-LOC":
                    label_ids.append(tag2id.get("I-LOC", tag2id.get("O")))
                else:
                    label_ids.append(tag2id.get(orig_tag, tag2id.get("O")))
            previous_word_idx = word_idx
        all_labels.append(label_ids)
    tokenized_inputs["labels"] = all_labels
    return tokenized_inputs

# --- 7) Create HF Datasets (tokenize on-the-fly) ---
print("‚ú≥ Tokenizing & aligning labels for train (this may take a bit)...")
train_tokenized = tokenize_and_align_labels(train_tokens, train_labels)
print("‚ú≥ Tokenizing & aligning labels for valid...")
valid_tokenized = tokenize_and_align_labels(valid_tokens, valid_labels)
print("‚ú≥ Tokenizing & aligning labels for test...")
test_tokenized = tokenize_and_align_labels(test_tokens, test_labels)

# Convert to datasets.Dataset
train_ds = Dataset.from_dict(train_tokenized)
valid_ds = Dataset.from_dict(valid_tokenized)
test_ds = Dataset.from_dict(test_tokenized)

dataset_dict = DatasetDict({"train": train_ds, "validation": valid_ds, "test": test_ds})

# --- 8) Save small samples and mappings ---
sample_csv = ROOT / "synthetic_sample_head.csv"
pd.DataFrame({
    "tokens": [" ".join(t) for t in train_tokens[:8]],
    "labels": [" ".join(l) for l in train_labels[:8]]
}).to_csv(sample_csv, index=False)

with open(ROOT / "tag2id.json", "w") as f:
    json.dump(tag2id, f, ensure_ascii=False, indent=2)
with open(ROOT / "id2tag.json", "w") as f:
    json.dump(id2tag, f, ensure_ascii=False, indent=2)

print("üíæ Saved sample and tag maps to Drive:", ROOT)

# --- 9) Print quick stats & sample ---
print("\n=== Final Dataset Sizes ===")
print("Train:", len(dataset_dict["train"]))
print("Validation:", len(dataset_dict["validation"]))
print("Test:", len(dataset_dict["test"]))

print("\n=== Example (tokenized) ===")
for i in range(2):
    ex = dataset_dict["train"][i]
    words = tokenizer.convert_ids_to_tokens(ex["input_ids"])
    print("\nTokens:", words)
    print("Labels:", ex["labels"])

print("\n‚úÖ ADVANCED synthetic generation + tokenization completed.")
print("Next: define TrainingArguments & Trainer (I can prepare that cell for you).")


üöÄ Starting ADVANCED synthetic generation and dataset prep...
üìç Unique district-state pairs: 719
‚úÖ Synthetic sentences generated: 3595 (expected 3595)
üìö Train sentences: 66537
üìö Valid sentences: 3466
üìö Test sentences: 3684
üîñ Tags: ['B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']
üî¢ tag2id: {'B-LOC': 0, 'B-MISC': 1, 'B-ORG': 2, 'B-PER': 3, 'I-LOC': 4, 'I-MISC': 5, 'I-ORG': 6, 'I-PER': 7, 'O': 8}
‚ú≥ Tokenizing & aligning labels for train (this may take a bit)...
‚ú≥ Tokenizing & aligning labels for valid...
‚ú≥ Tokenizing & aligning labels for test...
üíæ Saved sample and tag maps to Drive: /content/drive/MyDrive/NER_RETRAIN

=== Final Dataset Sizes ===
Train: 66537
Validation: 3466
Test: 3684

=== Example (tokenized) ===

Tokens: ['<s>', '‚ñÅTho', 'usan', 'ds', '‚ñÅof', '‚ñÅdemonstra', 'tors', '‚ñÅhave', '‚ñÅmarche', 'd', '‚ñÅthrough', '‚ñÅLondon', '‚ñÅto', '‚ñÅprotest', '‚ñÅthe', '‚ñÅwar', '‚ñÅin', '‚ñÅIraq', '‚ñÅand', '‚ñÅdemand', 

In [40]:
!pip install -U transformers datasets evaluate accelerate seqeval

# ======================================================
# FINAL TRAINING BLOCK ‚Äî MULTILINGUAL XLM-R NER MODEL
# ======================================================

from transformers import (
    XLMRobertaForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
import evaluate
import numpy as np
import torch

SAVE_DIR = "/content/drive/MyDrive/NER_RETRAIN/xlmr_location_ner"

print("üìÅ Final model will be saved to:", SAVE_DIR)

# ------------------------------------------------------
# Auto-select batch size depending on GPU/CPU
# ------------------------------------------------------
if torch.cuda.is_available():
    BATCH_SIZE = 16
    FP16 = True
    print("üöÄ GPU detected ‚Äî Using batch size =", BATCH_SIZE)
else:
    BATCH_SIZE = 2
    FP16 = False
    print("‚ö†Ô∏è CPU detected ‚Äî Using batch size =", BATCH_SIZE)

# ------------------------------------------------------
# Load model backbone
# ------------------------------------------------------
model = XLMRobertaForTokenClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id
)

data_collator = DataCollatorForTokenClassification(tokenizer)

# ------------------------------------------------------
# Define eval metric (seqeval)
# ------------------------------------------------------
seqeval = evaluate.load("seqeval")

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    preds = np.argmax(logits, axis=-1)

    true_preds = []
    true_labels = []

    for pred, lab in zip(preds, labels):
        for p, l in zip(pred, lab):
            if l != -100:  # ignore special tokens
                true_preds.append(id2tag[p])
                true_labels.append(id2tag[l])

    # seqeval expects list of lists, so wrap in one sequence
    true_preds = [true_preds]
    true_labels = [true_labels]

    results = seqeval.compute(predictions=true_preds, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"]
    }

# ------------------------------------------------------
# TrainingArguments
# ------------------------------------------------------
training_args = TrainingArguments(
    output_dir=SAVE_DIR,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=200,
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    num_train_epochs=2,
    fp16=FP16,
    load_best_model_at_end=True,
    push_to_hub=False
)

# ------------------------------------------------------
# Trainer
# ------------------------------------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# ------------------------------------------------------
# START TRAINING
# ------------------------------------------------------
print("\nüöÄ Starting training...\n")
trainer.train()

# ------------------------------------------------------
# SAVE FINAL MODEL
# ------------------------------------------------------
trainer.save_model(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

print("\nüéâ Training complete!")
print("üíæ Final model saved at:", SAVE_DIR)


üìÅ Final model will be saved to: /content/drive/MyDrive/NER_RETRAIN/xlmr_location_ner
üöÄ GPU detected ‚Äî Using batch size = 16


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).



üöÄ Starting training...



Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [1]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [2]:
import os
from getpass import getpass

# This will NOT echo the key when you paste it
os.environ["GROQ_API_KEY"] = getpass("Paste your new GROQ API key: ")


Paste your new GROQ API key: ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑


In [3]:
!pip install -q groq


[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/137.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[91m‚ï∏[0m[90m‚îÅ[0m [32m133.1/137.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m137.3/137.3 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
from groq import Groq

client = Groq(api_key=os.environ["GROQ_API_KEY"])


In [7]:
import torch
import json

MODEL_PATH = "/content/drive/MyDrive/NER_RETRAIN/xlmr_location_ner"

tokenizer = XLMRobertaTokenizerFast.from_pretrained(MODEL_PATH)
model = XLMRobertaForTokenClassification.from_pretrained(MODEL_PATH)
model.eval()
model.to(DEVICE) # Move the model to the correct device

with open("/content/drive/MyDrive/NER_RETRAIN/id2tag.json") as f:
    id2tag = {int(k):v for k,v in json.load(f).items()}

def extract_locations(text):
    toks = tokenizer(text, return_tensors="pt")
    toks = {k: v.to(DEVICE) for k, v in toks.items()} # Move input tensors to the correct device
    with torch.no_grad():
        logits = model(**toks).logits[0]

    preds = torch.argmax(logits, dim=-1).tolist()
    tokens = tokenizer.convert_ids_to_tokens(toks["input_ids"][0])

    locations = []
    buf = ""

    for tok, p in zip(tokens, preds):
        label = id2tag[p]
        tok = tok.replace(" ", "")

        if label == "B-LOC":
            if buf:
                locations.append(buf)
            buf = tok
        elif label == "I-LOC":
            buf += " " + tok
        else:
            if buf:
                locations.append(buf)
                buf = ""

    if buf:
        locations.append(buf)

    return list(set(locations))

NameError: name 'XLMRobertaTokenizerFast' is not defined

In [49]:
def analyze_incident_with_llm(text, locations):
    prompt = f"""
You are an AI incident-analysis engine for Indian news.

Given a news text and a list of detected locations, extract:

1. classification: "GOOD", "BAD", or "NEUTRAL"
2. event_type: (accident, crime, political, natural disaster, achievement, other)
3. severity: low / medium / high
4. deaths: number of deaths if mentioned
5. injured: number injured
6. main_incident_location: which location is the event-site
7. other_locations: everything else
8. summary: one sentence summary

TEXT: {text}
LOCATIONS: {locations}

Return ONLY valid JSON.
"""

    response = client.chat.completions.create(
        model="llama-3.1-8b-instant", # Changed to a valid model
        messages=[{"role": "user", "content": prompt}],
        temperature=0.1
    )

    return response.choices[0].message.content

In [50]:
india_df = pd.read_excel("/content/drive/MyDrive/NER_RETRAIN/India District (1).xlsx")

def lookup_location_info(location):
    # Try district match
    row = india_df[india_df["District Name"].str.lower() == location.lower()]
    if not row.empty:
        r = row.iloc[0]
        return {
            "district": r["District Name"],
            "state": r["State Name"],
            "district_code": int(r["District Code"]),
            "state_code": int(r["State Code"])
        }

    # Try state match
    row = india_df[india_df["State Name"].str.lower() == location.lower()]
    if not row.empty:
        r = row.iloc[0]
        return {
            "district": None,
            "state": r["State Name"],
            "district_code": None,
            "state_code": int(r["State Code"])
        }

    return None


In [51]:
import re
import json

def extract_json(text):
    """
    Extract the first valid JSON object from any LLM output.
    """

    # Find JSON block with regex
    match = re.search(r"\{[\s\S]*\}", text)

    if match:
        json_str = match.group(0)

        # Try to load JSON
        try:
            return json.loads(json_str)
        except:
            pass

    # If still failing, try fixing common issues:
    text = text.strip()

    # Remove markdown fences
    text = text.replace("```json", "").replace("```", "")

    # Attempt direct load
    try:
        return json.loads(text)
    except:
        raise ValueError("‚ùå Could not parse JSON from LLM output:\n" + text)


def extract_full_incident(text):
    # Step 1: Location detection
    locations = extract_locations(text)

    # Step 2: LLM incident analysis
    llm_json = analyze_incident_with_llm(text, locations)

    # Step 3: Robust JSON extraction
    data = extract_json(llm_json)

    # Step 4: Excel mapping for main incident location
    main_loc = data.get("main_incident_location")
    if main_loc:
        data["location_details"] = lookup_location_info(main_loc)

    return data

In [52]:
text = "Chennai and Bangalore have a water war and 3 people died in Dindigul."

result = extract_full_incident(text)
result


{'classification': 'BAD',
 'event_type': 'accident',
 'severity': 'high',
 'deaths': 3,
 'injured': None,
 'main_incident_location': 'Dindigul',
 'other_locations': ['Chennai', 'Bangalore'],
 'summary': 'A water war between Chennai and Bangalore resulted in 3 deaths in Dindigul.',
 'location_details': {'district': 'Dindigul',
  'state': 'TAMIL NADU',
  'district_code': 612,
  'state_code': 33}}

In [53]:
text = "After weeks of tension between farmers in Erode and textile groups in Tiruppur over shared river access, a violent clash broke out on Thursday evening. While the dispute began near Kodumudi, the most serious incident occurred 40 km away in Karur district, where one protester succumbed to injuries sustained during stone-pelting. Police from Coimbatore were rushed to both locations, and the state government said the situation was now under control."

result = extract_full_incident(text)
result

{'classification': 'BAD',
 'event_type': 'accident',
 'severity': 'high',
 'deaths': 1,
 'injured': None,
 'main_incident_location': 'Karur',
 'other_locations': 'Erode, Tiruppur, Kodumudi, Coimbatore',
 'summary': "A violent clash broke out between farmers and textile groups in Karur district, resulting in one protester's death.",
 'location_details': {'district': 'Karur',
  'state': 'TAMIL NADU',
  'district_code': 613,
  'state_code': 33}}

In [54]:
text = "Heavy rains that started in Chikkamagaluru triggered flooding downstream in Shivamogga district, but the worst damage was reported in Davanagere where two houses collapsed early Monday morning. Although no deaths were reported, four people suffered serious injuries. Teams from Bengaluru were deployed to assist, while authorities in Hassan remained on alert for possible landslides."

result = extract_full_incident(text)
result

{'classification': 'BAD',
 'event_type': 'natural disaster',
 'severity': 'high',
 'deaths': 0,
 'injured': 4,
 'main_incident_location': 'Davanagere',
 'other_locations': ['Chikkamagaluru', 'Shivamogga', 'Bengaluru', 'Hassan'],
 'summary': 'Heavy rains triggered flooding and house collapses in Davanagere, with no reported deaths but four serious injuries.',
 'location_details': {'district': 'Davanagere',
  'state': 'KARNATAKA',
  'district_code': 567,
  'state_code': 29}}

In [55]:
text = "A businessman from Hyderabad travelling to Visakhapatnam was allegedly kidnapped near Vijayawada but was later found safe in Guntur after a police chase that extended into Ongole. Early reports mistakenly claimed the abduction happened in Hyderabad, but officials later clarified that the crime occurred near Nandigama toll plaza."

result = extract_full_incident(text)
result

{'classification': 'BAD',
 'event_type': 'crime',
 'severity': 'high',
 'deaths': 0,
 'injured': 0,
 'main_incident_location': 'Nandigama toll plaza',
 'other_locations': ['Visakhapatnam',
  'Ongole',
  'Vijayawada',
  'Guntur',
  'Hyderabad'],
 'summary': 'A businessman was kidnapped near Vijayawada but was later found safe in Guntur after a police chase.',
 'location_details': None}

In [56]:
text = "In a significant achievement, students from Kochi won a national robotics competition in New Delhi on the same day a tragic boat accident in Alappuzha claimed the lives of two tourists. Authorities said rescue operations were quick, preventing further casualties."

result = extract_full_incident(text)
result

{'classification': 'GOOD',
 'event_type': 'achievement',
 'severity': 'low',
 'deaths': 2,
 'injured': 0,
 'main_incident_location': 'Alappuzha',
 'other_locations': ['Kochi', 'New Delhi'],
 'summary': 'Students from Kochi won a national robotics competition in New Delhi, while a tragic boat accident in Alappuzha claimed two lives.',
 'location_details': {'district': 'Alappuzha',
  'state': 'KERALA',
  'district_code': 598,
  'state_code': 32}}