In [2]:
#cell1
# If you're in Colab, this cell is all you need for installs.
# If you're on local/other envs, keep the versions pinned.

!pip -q uninstall -y transformers tokenizers simpletransformers sentencepiece seqeval || true

# CPU Torch (works on Colab and most Linux/macOS/Win with Py3.12)
!pip -q install --upgrade pip
!pip -q install "torch>=2.3,<2.5" --index-url https://download.pytorch.org/whl/cpu

# Libraries with wheels for Python 3.12 that work well together
!pip -q install simpletransformers==0.64.3 \
                transformers==4.37.2 \
                tokenizers==0.15.2 \
                sentencepiece==0.1.99 \
                pandas==2.2.2 \
                scikit-learn==1.5.1 \
                seqeval==1.2.2

# Some older libs expect this alias
import collections, collections.abc
collections.Iterable = collections.abc.Iterable

import transformers, simpletransformers
from importlib.metadata import version
print("transformers:", transformers.__version__)
print("simpletransformers:", version("simpletransformers"))


[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchtune 0.6.1 requires sentencepiece, which is not installed.
torchtune 0.6.1 requires tokenizers, which is not installed.
sentence-transformers 5.1.0 requires transformers<5.0.0,>=4.41.0, which is not installed.
peft 0.17.1 requires transformers, which is not installed.
torchvision 0.23.0+cu126 requires torch==2.8.0, but you have torch 2.4.1+cpu which is incompatible.
torchaudio 2.8.0+cu126 requires torch==2.8.0, but you have torch 2.4.1+cpu which is incompatible.[0m[31m
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sentence-transformers 5.1.0 requires transformers<5.0.0,>=4.41.0, but you have transformers 4.37.2 which is incompatible.
umap-learn 0.5.9.post2 requir

In [3]:
#cell2
import os, requests

# Prefer your uploaded file (adjust paths to where your data lives when running)
CANDIDATE_PATHS = ["/content/telugu_sentences.txt"]

# Optional Google Doc fallback (make sure it's shared as "Anyone with the link: Viewer")
DOC_ID = "11bLuNA7yHQQOXWWBzYxzQGqH_zl4dE2YwnaOm6e0pwM"
DOC_TXT_URL = f"https://docs.google.com/document/d/{DOC_ID}/export?format=txt"

def load_telugu_text():
    """Return the raw corpus text from the first source that works.

    Sources are tried in this order:
      1. each path in ``CANDIDATE_PATHS`` (the first existing file wins),
      2. the Google Doc plain-text export at ``DOC_TXT_URL``,
      3. a three-line built-in sample.

    Returns:
        str: the corpus text, without a leading byte-order mark.
    """
    for p in CANDIDATE_PATHS:
        if os.path.exists(p):
            # "utf-8-sig" decodes UTF-8 and drops a leading BOM if present;
            # the context manager guarantees the handle is closed.
            with open(p, encoding="utf-8-sig") as fh:
                text = fh.read()
            print(f"Loaded dataset from: {p}")
            return text
    try:
        r = requests.get(DOC_TXT_URL, timeout=15)
        if r.status_code == 200 and r.text.strip():
            print("Loaded dataset via Google Doc TXT export.")
            return r.text.lstrip("\ufeff")
        print(f"Google Doc export unusable (HTTP {r.status_code}); falling back.")
    except Exception as exc:
        # Still best-effort, but say why the download failed instead of hiding it.
        print(f"Google Doc export failed ({exc!r}); falling back.")
    print("Using small built-in fallback sample (paste your full data in a file for real runs).")
    return """రాముడు - సీతను - రక్షించాడు.
భారతదేశ చరిత్ర - చాలా గొప్పది.
నాకు - పుస్తకాలు చదవడం - ఇష్టం.
"""

raw_text = load_telugu_text()
lines = [ln.strip() for ln in raw_text.splitlines() if ln.strip()]
print("Example lines:", lines[:5])
print("Total lines:", len(lines))


Loaded dataset from: /content/telugu_sentences.txt
Example lines: ['\ufeffరాముడు - సీతను - రక్షించాడు.', 'భారతదేశ చరిత్ర - చాలా గొప్పది.', 'నాకు - పుస్తకాలు చదవడం - ఇష్టం.', 'అతను - ప్రతిరోజూ ఉదయం - యోగా - చేస్తాడు.', 'పిల్లలు - పార్కులో - ఆడుతారు.']
Total lines: 3115


In [5]:
#cell3
import re
import unicodedata

# OPTIONAL: set to True if you want punctuation as separate tokens (cleaner metrics)
DETACH_PUNCT = True
# Telugu + common punctuation you may see
END_PUNCT = "।.!?…"

def strip_invisibles(s: str) -> str:
    """Return ``s`` without any Unicode format characters (category ``Cf``).

    The byte-order mark U+FEFF is itself category ``Cf``, so this single
    filter removes it along with zero-width marks.

    NOTE(review): ZWJ/ZWNJ (U+200D/U+200C) are also ``Cf`` and are removed
    here; confirm that is intended for this corpus.
    """
    visible = filter(lambda ch: unicodedata.category(ch) != "Cf", s)
    return "".join(visible)

def normalize_hyphens(s: str) -> str:
    """Turn every dash into a standalone ``-`` token with single spacing.

    En/em dashes are mapped to ASCII ``-``; each ``-`` (with any surrounding
    whitespace) becomes `` - ``; whitespace runs are then collapsed and the
    ends trimmed.
    """
    ascii_dashes = s.translate(str.maketrans({"–": "-", "—": "-"}))
    # Splitting on the padded hyphen and re-joining is the same as replacing it.
    spaced = " - ".join(re.split(r"\s*-\s*", ascii_dashes))
    return re.sub(r"\s+", " ", spaced).strip()

def attach_hyphens(s: str) -> str:
    """Glue each standalone ``-`` token onto the token before it.

    A ``-`` with nothing before it (sentence-initial) is kept as its own token.
    """
    merged = []
    for piece in s.split():
        if piece == "-" and merged:
            merged[-1] += "-"
            continue
        merged.append(piece)
    return " ".join(merged)

def detach_end_punct(s: str) -> str:
    """Split one trailing end-punctuation character off each token.

    Does nothing when ``DETACH_PUNCT`` is false. Only the last character is
    examined, and only for tokens longer than one character, so a token that
    is just a punctuation mark is left as is.
    """
    if DETACH_PUNCT:
        pieces = []
        for tok in s.split():
            if len(tok) > 1 and tok[-1] in END_PUNCT:
                # word + punctuation -> two tokens
                pieces.extend((tok[:-1], tok[-1]))
            else:
                pieces.append(tok)
        return " ".join(pieces)
    return s

# Build sentences (stable de-dup + full normalization)
# Every non-blank line runs through the four normalizers in order. A result is
# kept only the first time it is seen, so `sentences` preserves input order.
seen, sentences = set(), []
for ln in raw_text.splitlines():
    ln = ln.strip()
    if not ln:
        continue
    ln = strip_invisibles(ln)   # drop BOM / format characters
    ln = normalize_hyphens(ln)  # every dash becomes a standalone "-" token
    ln = attach_hyphens(ln)     # glue each "-" onto the preceding token
    ln = detach_end_punct(ln)   # split trailing end punctuation into its own token
    # De-duplicate on the normalized form (this happens before the split cell).
    if ln and ln not in seen:
        seen.add(ln)
        sentences.append(ln)

print("Total sentences after preprocess:", len(sentences))
print(sentences[:5])


Total sentences after preprocess: 2209
['రాముడు- సీతను- రక్షించాడు .', 'భారతదేశ చరిత్ర- చాలా గొప్పది .', 'నాకు- పుస్తకాలు చదవడం- ఇష్టం .', 'అతను- ప్రతిరోజూ ఉదయం- యోగా- చేస్తాడు .', 'పిల్లలు- పార్కులో- ఆడుతారు .']


In [6]:
#cell4
import pandas as pd
from sklearn.model_selection import train_test_split

# sentence-wise split
# Whole sentences are split (not tokens), so all tokens of a sentence land in
# the same partition. 20% of sentences go to test; 10% of the remainder goes
# to validation. Both calls use random_state=42.
train_s, test_s = train_test_split(sentences, test_size=0.20, random_state=42, shuffle=True)
train_s, val_s  = train_test_split(train_s,  test_size=0.10, random_state=42, shuffle=True)  # 10% of train for val

def make_ner_df(sent_list, labeled=True):
    """Explode preprocessed sentences into one row per token.

    A token ending in "-" marks a hyphen boundary after that word. The hyphen
    is the prediction target, so it is removed from ``words`` and, when
    ``labeled`` is true, recorded in ``labels`` instead.

    Args:
        sent_list: iterable of whitespace-tokenised sentence strings.
        labeled: if True, add a ``labels`` column ("HYPHEN" / "NO-HYPHEN").

    Returns:
        pandas.DataFrame with columns ``sentence_id`` and ``words``, plus
        ``labels`` when ``labeled``. ``sentence_id`` is the sentence's
        position in ``sent_list``. The columns exist even when there are no
        rows, so downstream column access works on empty input.
    """
    columns = ["sentence_id", "words"] + (["labels"] if labeled else [])
    rows = []
    for sid, s in enumerate(sent_list):
        for tok in s.split():
            # One word derivation for both modes, so the labelled and
            # unlabelled frames built from the same sentences stay
            # row-for-row identical. For tokens shaped "word" or "word-"
            # this matches what both modes produced before.
            word = tok.replace("-", "")
            if not word:
                # Hyphen-only token (e.g. a sentence-initial "-"): it has no
                # word to label, and an empty string would be read back from
                # CSV as NaN.
                continue
            row = {"sentence_id": sid, "words": word}
            if labeled:
                row["labels"] = "HYPHEN" if tok.endswith("-") else "NO-HYPHEN"
            rows.append(row)
    return pd.DataFrame(rows, columns=columns)

# Token-level frames for each partition. The two test frames are built from
# the same `test_s`: one carries the gold labels, the other only the words.
train_df       = make_ner_df(train_s, labeled=True)
val_df         = make_ner_df(val_s,   labeled=True)
test_df_lbl    = make_ner_df(test_s,  labeled=True)   # ground truth
test_df_unlbl  = make_ner_df(test_s,  labeled=False)  # inputs

# optional: persist
# (the evaluation cells read test_data.csv / test_data_labelled.csv back in)
train_df.to_csv("train_data.csv", index=False)
val_df.to_csv("val_data.csv", index=False)
test_df_lbl.to_csv("test_data_labelled.csv", index=False)
test_df_unlbl.to_csv("test_data.csv", index=False)

# Sentence counts per partition, then token counts for train and test.
len(train_s), len(val_s), len(test_s), len(train_df), len(test_df_unlbl)


(1590, 177, 442, 8321, 2314)

In [10]:
#cell5
import torch, gc
from simpletransformers.ner import NERModel, NERArgs
from collections import defaultdict
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support, confusion_matrix

USE_CUDA = torch.cuda.is_available()
LABELS = ["HYPHEN", "NO-HYPHEN"]

def low_mem_args(seed: int = 42):
    """Build the ``NERArgs`` used for every model in this notebook.

    Args:
        seed: value stored in ``manual_seed``.

    Returns:
        A fresh ``NERArgs`` with the settings below applied. Callers set
        ``output_dir`` themselves afterwards.
    """
    settings = {
        "num_train_epochs": 3,
        "train_batch_size": 2,
        "eval_batch_size": 2,
        # NOTE(review): longer inputs are presumably truncated at this length,
        # which would explain the token/label count mismatch repaired at
        # prediction time. Confirm.
        "max_seq_length": 32,
        "gradient_accumulation_steps": 2,
        "learning_rate": 4e-5,
        "warmup_ratio": 0.1,
        "overwrite_output_dir": True,
        "labels_list": LABELS,
        "save_model_every_epoch": False,
        "reprocess_input_data": True,
        "use_multiprocessing": False,
        "use_multiprocessing_for_evaluation": False,
        "evaluate_during_training": True,
        # NOTE(review): the early_stopping_* values presumably only take
        # effect when early stopping itself is enabled, which is not done
        # here. Confirm against the simpletransformers docs.
        "early_stopping_patience": 2,
        "early_stopping_metric": "f1_score",
        "early_stopping_metric_minimize": False,
        "fp16": False,  # keep False on CPU
        "manual_seed": seed,
    }
    a = NERArgs()
    for key, value in settings.items():
        setattr(a, key, value)
    return a

def cleanup(model=None, *objs):
    """Run a garbage-collection pass and release cached CUDA memory.

    ``model`` and ``objs`` are accepted so existing ``cleanup(model)`` calls
    keep working, but a function cannot unbind names in its caller's scope.
    The objects are only freed once the caller has dropped its own references
    (for example ``mbert = None``) before or after calling this.

    Args:
        model: unused; kept for call-site compatibility.
        *objs: unused; kept for call-site compatibility.

    Returns:
        None.
    """
    # The former `del model` / `del o` statements only removed local aliases
    # (behind a bare `except:`), so they never released any memory.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

def sentences_from_df(df):
    """Join each sentence's words with single spaces.

    Sentences are returned in order of first appearance of ``sentence_id``.
    """
    grouped_words = df.groupby("sentence_id", sort=False)["words"]
    return [" ".join(str(w) for w in words) for _, words in grouped_words]

def flatten_preds(predictions):
    """Flatten per-sentence lists of one-entry dicts into a flat list of values.

    Each inner dict contributes its first value.
    """
    flat = []
    for sent in predictions:
        for d in sent:
            flat.append(list(d.values())[0])
    return flat

def add_pred_col(df_unlbl, labels, colname):
    """Return a copy of ``df_unlbl`` with ``labels`` stored under ``colname``.

    The input frame is not modified. An ``AssertionError`` is raised when the
    number of labels differs from the number of rows.
    """
    n_rows, n_labels = len(df_unlbl), len(labels)
    assert n_rows == n_labels
    annotated = df_unlbl.copy()
    annotated[colname] = labels
    return annotated

def reconstruct(df_with_pred, pred_col):
    """Rebuild hyphenated sentence strings from a token frame.

    A "-" is appended to every word whose ``pred_col`` value is "HYPHEN";
    words are then joined with spaces per ``sentence_id``, in order of first
    appearance.
    """
    by_sentence = defaultdict(list)
    for _, row in df_with_pred.iterrows():
        sid = row["sentence_id"]
        word = str(row["words"])
        if row[pred_col] == "HYPHEN":
            word += "-"
        by_sentence[sid].append(word)
    return [" ".join(words) for words in by_sentence.values()]

def report(df_pred_true, pred_col):
    """Print token accuracy, HYPHEN precision/recall/F1 and the full class report.

    Gold labels are read from the ``labels`` column and predictions from
    ``pred_col``.
    """
    gold = df_pred_true["labels"].tolist()
    guessed = df_pred_true[pred_col].tolist()
    token_acc = accuracy_score(gold, guessed)
    # With labels restricted to ["HYPHEN"], the "macro" average is just that
    # one class's precision/recall/F1.
    scores = precision_recall_fscore_support(
        gold, guessed, labels=["HYPHEN"], average="macro", zero_division=0
    )
    p, r, f1 = scores[0], scores[1], scores[2]
    print(f"[{pred_col}] Token Acc: {token_acc:.4f} | HYPHEN P/R/F1: {p:.4f}/{r:.4f}/{f1:.4f}")
    print(classification_report(gold, guessed, digits=4, zero_division=0))


In [11]:
#cell6
# Class-balance check: share of training tokens labelled HYPHEN.
ratio_hyphen = (train_df["labels"]=="HYPHEN").mean()
print(f"HYPHEN ratio in train tokens: {ratio_hyphen:.3f}")


HYPHEN ratio in train tokens: 0.467


In [12]:
#cell7
from simpletransformers.ner import NERModel

# Fine-tune multilingual BERT on the HYPHEN / NO-HYPHEN token-labelling task.
mbert_args = low_mem_args()
mbert_args.output_dir = "mbert_te_output"

mbert = NERModel(
    "bert", "bert-base-multilingual-cased",
    args=mbert_args.__dict__,  # settings are handed over as a plain dict
    use_cuda=USE_CUDA
)
# val_df is the evaluation set for the evaluate_during_training option set in low_mem_args().
mbert.train_model(train_df, eval_data=val_df)



Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1590 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/795 [00:00<?, ?it/s]

  0%|          | 0/177 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/89 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/795 [00:00<?, ?it/s]

  0%|          | 0/177 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/89 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/795 [00:00<?, ?it/s]

  0%|          | 0/177 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/89 [00:00<?, ?it/s]

(1191,
 defaultdict(list,
             {'global_step': [397, 794, 1191],
              'train_loss': [0.41220855712890625,
               0.11649908870458603,
               0.03575238958001137],
              'eval_loss': [0.3120755636959933,
               0.2837889921574152,
               0.29706761108847396],
              'precision': [np.float64(0.7060702875399361),
               np.float64(0.7213114754098361),
               np.float64(0.7161716171617162)],
              'recall': [np.float64(0.7478849407783418),
               np.float64(0.7445008460236887),
               np.float64(0.7343485617597293)],
              'f1_score': [np.float64(0.7263763352506164),
               np.float64(0.7327227310574522),
               np.float64(0.7251461988304094)]}))

In [24]:
#cell8
# --- Robust tokenized predictor: guarantees 1:1 labels per token ---

import unicodedata

END_PUNCT = {"।", ".", "!", "?", "…"}
EXTRA_DROP = {
    ".", "।", "!", "?", "…", ",", "؛", ":", ";",
    "—", "–", "-", "―",
    '"', "'", "“", "”", "‘", "’",
    "(", ")", "[", "]", "{", "}", "<", ">", "|", "/", "\\"
}

def is_noise_token(tok: str) -> bool:
    """Tell whether ``tok`` is blank or consists only of punctuation/symbols.

    The token is stringified and stripped first. It counts as noise when it
    is empty, listed in ``END_PUNCT`` / ``EXTRA_DROP``, or when every
    character has a Unicode category starting with "P" or "S".
    """
    text = str(tok).strip()
    if not text:
        return True
    if text in END_PUNCT or text in EXTRA_DROP:
        return True
    # All punctuation/symbols → drop
    return all(unicodedata.category(ch)[0] in ("P", "S") for ch in text)

def filter_noise_df(df):
    """Drop rows whose word is blank or pure punctuation/symbols.

    Returns:
        (filtered, keep_rows): ``filtered`` is ``df`` restricted to the kept
        rows, re-indexed from 0. ``keep_rows`` holds the original index
        labels of those rows, so a parallel frame can be filtered the same
        way with ``.loc[keep_rows]``.
    """
    keep_rows = [
        idx
        for _, grp in df.groupby("sentence_id", sort=False)
        for idx, w in zip(grp.index.tolist(), grp["words"].tolist())
        if not is_noise_token(w)
    ]
    filtered = df.loc[keep_rows].reset_index(drop=True)
    return filtered, keep_rows

def wordlists_from_df(df):
    """Return one list of stripped, non-empty word strings per sentence.

    Sentences follow the order of first appearance of ``sentence_id``.
    """
    sentences = []
    for _, words in df.groupby("sentence_id", sort=False)["words"]:
        stripped = (str(w).strip() for w in words)
        sentences.append([w for w in stripped if w != ""])
    return sentences

def predict_flat_labels_tokenized_robust(model, df_unlbl, repair_mode="pad_no_hyphen", verbose_first_mismatch=True):
    """Predict one label per row of ``df_unlbl``, repairing count mismatches.

    Sentences are passed to ``model.predict`` as token lists
    (``split_on_space=False``). When a sentence comes back with a different
    number of labels than tokens:
      - surplus labels are always cut off the end;
      - missing labels are appended at the end, using the last predicted
        label when ``repair_mode == "repeat_last"`` (and one exists),
        otherwise "NO-HYPHEN".
    Only the first mismatch is printed, and only when
    ``verbose_first_mismatch`` is true.

    Returns:
        Flat list of labels in row order; its length is asserted to equal
        ``len(df_unlbl)``.
    """
    wlists = wordlists_from_df(df_unlbl)
    preds, _ = model.predict(wlists, split_on_space=False)

    flat = []
    mismatch_reported = False
    for sent_idx, (tokens, pred_sent) in enumerate(zip(wlists, preds)):
        labels = [list(d.values())[0] for d in pred_sent]
        n_tok, n_lab = len(tokens), len(labels)
        if n_tok != n_lab:
            if verbose_first_mismatch and not mismatch_reported:
                print(f"[repair] mismatch at sent {sent_idx}: tokens={n_tok}, preds={n_lab}")
                print("TOKENS:", tokens)
                print("PRED_KEYS:", [list(d.keys())[0] for d in pred_sent])
                mismatch_reported = True
            if n_lab > n_tok:
                # Too many labels: keep the first n_tok.
                del labels[n_tok:]
            else:
                # Too few labels: fill the tail.
                use_last = repair_mode == "repeat_last" and labels
                filler = labels[-1] if use_last else "NO-HYPHEN"
                labels.extend([filler] * (n_tok - n_lab))
        flat.extend(labels)

    assert len(flat) == len(df_unlbl), f"Flat preds {len(flat)} != df rows {len(df_unlbl)} after repair"
    return flat


In [25]:
#cell9
import pandas as pd

# Load original test frames
# NOTE(review): read_csv applies its default NA parsing, so a word spelled like
# an NA marker (or an empty word) would come back as NaN. Confirm none occur.
test_unlbl_orig = pd.read_csv("test_data.csv")
test_lbl_orig   = pd.read_csv("test_data_labelled.csv")

# (A) Filter punctuation/symbol-only tokens ONCE so all models evaluate identically
# The labelled frame is filtered with the same row labels, which relies on both
# CSVs having the same row order (both were written from `test_s`).
test_unlbl, kept_idx = filter_noise_df(test_unlbl_orig)
test_lbl             = test_lbl_orig.loc[kept_idx].reset_index(drop=True)

# (B) Predict with robust tokenized helper (guaranteed 1:1)
mbert_flat = predict_flat_labels_tokenized_robust(mbert, test_unlbl, repair_mode="pad_no_hyphen")

# (C) Attach predictions & evaluate
mbert_df = test_unlbl.copy()
mbert_df["mbert_pred"] = mbert_flat

mbert_eval = mbert_df.copy()
mbert_eval["labels"] = test_lbl["labels"].values  # positional assignment of gold labels
report(mbert_eval, "mbert_pred")

# (D) Sentence-level exact match on the filtered tokens
# Gold sentences are rebuilt with the same helper by temporarily renaming the
# gold column to the name reconstruct() is asked to read.
mbert_sent_pred = reconstruct(mbert_df, "mbert_pred")
true_sentences  = reconstruct(test_lbl.rename(columns={"labels":"mbert_pred"}), "mbert_pred")
mbert_sent_acc = sum(p==t for p,t in zip(mbert_sent_pred, true_sentences)) / len(true_sentences)
print(f"[mBERT] Sentence-level exact match (filtered): {mbert_sent_acc:.4f}")


  0%|          | 0/442 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/221 [00:00<?, ?it/s]

[repair] mismatch at sent 394: tokens=7, preds=6
TOKENS: ['ఉదయం', 'తేలికపాటి', 'సూర్యరశ్మి', 'శరీరానికి', 'చాలా', 'ప్రయోజనకరంగా', 'ఉంటుంది']
PRED_KEYS: ['ఉదయం', 'తేలికపాటి', 'సూర్యరశ్మి', 'శరీరానికి', 'చాలా', 'ప్రయోజనకరంగా']
[mbert_pred] Token Acc: 0.9056 | HYPHEN P/R/F1: 0.9068/0.9134/0.9101
              precision    recall  f1-score   support

      HYPHEN     0.9068    0.9134    0.9101      1086
   NO-HYPHEN     0.9043    0.8970    0.9006       990

    accuracy                         0.9056      2076
   macro avg     0.9055    0.9052    0.9054      2076
weighted avg     0.9056    0.9056    0.9056      2076

[mBERT] Sentence-level exact match (filtered): 0.6380


In [26]:
# Drop the notebook's own reference first: cleanup() only ever sees a local
# alias, so the model cannot be collected while `mbert` still points at it.
mbert = None
cleanup()   # free RAM/VRAM


In [27]:
# Fine-tune IndicBERT (loaded with the "albert" model type) with the same
# arguments and data as the other models.
indic_args = low_mem_args()
indic_args.output_dir = "indicbert_te_output"

indic = NERModel(
    "albert", "ai4bharat/indic-bert",
    args=indic_args.__dict__,  # settings are handed over as a plain dict
    use_cuda=USE_CUDA
)
indic.train_model(train_df, eval_data=val_df)




config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/135M [00:00<?, ?B/s]

Some weights of AlbertForTokenClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


spiece.model:   0%|          | 0.00/5.65M [00:00<?, ?B/s]

  0%|          | 0/1590 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/795 [00:00<?, ?it/s]

  0%|          | 0/177 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/89 [00:00<?, ?it/s]



Running Epoch 1 of 3:   0%|          | 0/795 [00:00<?, ?it/s]

  0%|          | 0/177 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/89 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/795 [00:00<?, ?it/s]

  0%|          | 0/177 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/89 [00:00<?, ?it/s]

(1191,
 defaultdict(list,
             {'global_step': [397, 794, 1191],
              'train_loss': [0.4829985797405243,
               0.06896091252565384,
               0.2569805681705475],
              'eval_loss': [0.2956727788773146,
               0.2587251200372081,
               0.28417483379206293],
              'precision': [np.float64(0.7),
               np.float64(0.7022653721682848),
               np.float64(0.7147385103011094)],
              'recall': [np.float64(0.7343485617597293),
               np.float64(0.7343485617597293),
               np.float64(0.7631133671742809)],
              'f1_score': [np.float64(0.7167630057803468),
               np.float64(0.7179487179487181),
               np.float64(0.7381342062193126)]}))

In [29]:
# === RESYNC EVAL STATE — run this before the IndicBERT / BERT / XLM-R evaluation cells: they use test_unlbl_filt and test_lbl_filt, which are defined in this cell ===
import pandas as pd, unicodedata, gc, torch
from collections import defaultdict
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support

# --- noise filters (standalone punctuation/symbol tokens) ---
END_PUNCT = {"।", ".", "!", "?", "…"}
EXTRA_DROP = {
    ".", "।", "!", "?", "…", ",", "؛", ":", ";",
    "—", "–", "-", "―",
    '"', "'", "“", "”", "‘", "’",
    "(", ")", "[", "]", "{", "}", "<", ">", "|", "/", "\\"
}

def is_noise_token(tok: str) -> bool:
    """Tell whether ``tok`` is blank or consists only of punctuation/symbols.

    The token is stringified and stripped first. It counts as noise when it
    is empty, listed in ``END_PUNCT`` / ``EXTRA_DROP``, or when every
    character has a Unicode category starting with "P" or "S".
    """
    text = str(tok).strip()
    if not text:
        return True
    if text in END_PUNCT or text in EXTRA_DROP:
        return True
    return all(unicodedata.category(ch)[0] in ("P", "S") for ch in text)

def filter_noise_df(df):
    """Drop rows whose word is blank or pure punctuation/symbols.

    Returns:
        (filtered, keep_rows): ``filtered`` is ``df`` restricted to the kept
        rows, re-indexed from 0. ``keep_rows`` holds the original index
        labels of those rows, so a parallel frame can be filtered the same
        way with ``.loc[keep_rows]``.
    """
    keep_rows = [
        idx
        for _, grp in df.groupby("sentence_id", sort=False)
        for idx, w in zip(grp.index.tolist(), grp["words"].tolist())
        if not is_noise_token(w)
    ]
    filtered = df.loc[keep_rows].reset_index(drop=True)
    return filtered, keep_rows

def wordlists_from_df(df):
    """Return one list of stripped, non-empty word strings per sentence.

    Sentences follow the order of first appearance of ``sentence_id``.
    """
    sentences = []
    for _, words in df.groupby("sentence_id", sort=False)["words"]:
        stripped = (str(w).strip() for w in words)
        sentences.append([w for w in stripped if w != ""])
    return sentences

# robust predictor that guarantees 1:1 labels per token
def predict_flat_labels_tokenized_robust(model, df_unlbl, repair_mode="pad_no_hyphen", verbose_first_mismatch=True):
    """Predict one label per row of ``df_unlbl``, repairing count mismatches.

    Sentences are passed to ``model.predict`` as token lists. Surplus labels
    are cut off the end. Missing labels are appended at the end, using the
    last predicted label when ``repair_mode == "repeat_last"`` (and one
    exists), otherwise "NO-HYPHEN". Only the first mismatch is printed.

    Returns:
        Flat list of labels in row order; its length is asserted to equal
        ``len(df_unlbl)``.
    """
    wlists = wordlists_from_df(df_unlbl)
    preds, _ = model.predict(wlists, split_on_space=False)
    flat = []
    mismatch_reported = False
    for sent_idx, (tokens, pred_sent) in enumerate(zip(wlists, preds)):
        labels = [list(d.values())[0] for d in pred_sent]
        n_tok, n_lab = len(tokens), len(labels)
        if n_tok != n_lab:
            if verbose_first_mismatch and not mismatch_reported:
                print(f"[repair] mismatch at sent {sent_idx}: tokens={n_tok}, preds={n_lab}")
                print("TOKENS:", tokens)
                print("PRED_KEYS:", [list(d.keys())[0] for d in pred_sent])
                mismatch_reported = True
            if n_lab > n_tok:
                # Too many labels: keep the first n_tok.
                del labels[n_tok:]
            else:
                # Too few labels: fill the tail.
                use_last = repair_mode == "repeat_last" and labels
                filler = labels[-1] if use_last else "NO-HYPHEN"
                labels.extend([filler] * (n_tok - n_lab))
        flat.extend(labels)
    assert len(flat) == len(df_unlbl), f"Flat preds {len(flat)} != df rows {len(df_unlbl)} after repair"
    return flat

def reconstruct(df_with_pred, pred_col):
    """Rebuild hyphenated sentence strings from a token frame.

    A "-" is appended to every word whose ``pred_col`` value is "HYPHEN";
    words are then joined with spaces per ``sentence_id``, in order of first
    appearance.
    """
    by_sentence = defaultdict(list)
    for _, row in df_with_pred.iterrows():
        sid = row["sentence_id"]
        word = str(row["words"])
        if row[pred_col] == "HYPHEN":
            word += "-"
        by_sentence[sid].append(word)
    return [" ".join(words) for words in by_sentence.values()]

def report(df_pred_true, pred_col):
    """Print token accuracy, HYPHEN precision/recall/F1 and the full class report.

    Gold labels are read from the ``labels`` column and predictions from
    ``pred_col``.
    """
    gold = df_pred_true["labels"].tolist()
    guessed = df_pred_true[pred_col].tolist()
    token_acc = accuracy_score(gold, guessed)
    # With labels restricted to ["HYPHEN"], the "macro" average is just that
    # one class's precision/recall/F1.
    scores = precision_recall_fscore_support(
        gold, guessed, labels=["HYPHEN"], average="macro", zero_division=0
    )
    p, r, f1 = scores[0], scores[1], scores[2]
    print(f"[{pred_col}] Token Acc: {token_acc:.4f} | HYPHEN P/R/F1: {p:.4f}/{r:.4f}/{f1:.4f}")
    print(classification_report(gold, guessed, digits=4, zero_division=0))

# --- reload original CSVs and rebuild filtered eval frames ---
test_unlbl_orig = pd.read_csv("test_data.csv")
test_lbl_orig   = pd.read_csv("test_data_labelled.csv")

# Filter the unlabelled frame, then apply the same kept row labels to the
# labelled frame so the two stay row-aligned.
test_unlbl_filt, kept_idx = filter_noise_df(test_unlbl_orig)
test_lbl_filt             = test_lbl_orig.loc[kept_idx].reset_index(drop=True)

# true sentences (from gold labels) for sentence-level metric
# (the gold column is renamed only so reconstruct() can be pointed at it)
true_sentences = reconstruct(test_lbl_filt.rename(columns={"labels":"gold"}), "gold")

print(f"Ready ✅  tokens={len(test_unlbl_filt)}  sentences={test_unlbl_filt['sentence_id'].nunique()}")


Ready ✅  tokens=2076  sentences=442


In [30]:
# Predict on the filtered test tokens (one label per row).
indic_flat = predict_flat_labels_tokenized_robust(indic, test_unlbl_filt, repair_mode="pad_no_hyphen")

# Predictions next to the input words.
indic_df = test_unlbl_filt.copy()
indic_df["indic_pred"] = indic_flat

# Add gold labels positionally and print token-level metrics.
indic_eval = indic_df.copy()
indic_eval["labels"] = test_lbl_filt["labels"].values
report(indic_eval, "indic_pred")

# Sentence-level exact match: a sentence counts only if every hyphen decision matches gold.
indic_sent_pred = reconstruct(indic_df, "indic_pred")
indic_sent_acc = sum(p==t for p,t in zip(indic_sent_pred, true_sentences)) / len(true_sentences)
print(f"[IndicBERT] Sentence-level exact match (filtered): {indic_sent_acc:.4f}")


  0%|          | 0/442 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/221 [00:00<?, ?it/s]

[indic_pred] Token Acc: 0.9056 | HYPHEN P/R/F1: 0.9002/0.9217/0.9108
              precision    recall  f1-score   support

      HYPHEN     0.9002    0.9217    0.9108      1086
   NO-HYPHEN     0.9118    0.8879    0.8997       990

    accuracy                         0.9056      2076
   macro avg     0.9060    0.9048    0.9053      2076
weighted avg     0.9057    0.9056    0.9055      2076

[IndicBERT] Sentence-level exact match (filtered): 0.6471


In [31]:
# Drop the notebook's own reference first: cleanup() only ever sees a local
# alias, so the model cannot be collected while `indic` still points at it.
indic = None
cleanup()


In [32]:
# Fine-tune bert-base-cased with the same arguments and data as the other models.
# NOTE(review): presumably included as a non-multilingual baseline. Confirm intent.
bertc_args = low_mem_args()
bertc_args.output_dir = "bert_cased_te_output"

bertc = NERModel(
    "bert", "bert-base-cased",
    args=bertc_args.__dict__,  # settings are handed over as a plain dict
    use_cuda=USE_CUDA
)
bertc.train_model(train_df, eval_data=val_df)




config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

  0%|          | 0/1590 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/795 [00:00<?, ?it/s]

  0%|          | 0/177 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/89 [00:00<?, ?it/s]



Running Epoch 1 of 3:   0%|          | 0/795 [00:00<?, ?it/s]

  0%|          | 0/177 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/89 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/795 [00:00<?, ?it/s]

  0%|          | 0/177 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/89 [00:00<?, ?it/s]

(1191,
 defaultdict(list,
             {'global_step': [397, 794, 1191],
              'train_loss': [0.4241238832473755,
               0.29971620440483093,
               0.30058836936950684],
              'eval_loss': [0.4238242356294996,
               0.4201585896564334,
               0.4174613272875882],
              'precision': [np.float64(0.5),
               np.float64(0.5121457489878543),
               np.float64(0.5238095238095238)],
              'recall': [np.float64(0.5685279187817259),
               np.float64(0.428087986463621),
               np.float64(0.4653130287648054)],
              'f1_score': [np.float64(0.5320665083135393),
               np.float64(0.4663594470046083),
               np.float64(0.492831541218638)]}))

In [33]:
# Predict on the filtered test tokens (one label per row).
bertc_flat = predict_flat_labels_tokenized_robust(bertc, test_unlbl_filt, repair_mode="pad_no_hyphen")

# Predictions next to the input words.
bertc_df = test_unlbl_filt.copy()
bertc_df["bert_pred"] = bertc_flat

# Add gold labels positionally and print token-level metrics.
bertc_eval = bertc_df.copy()
bertc_eval["labels"] = test_lbl_filt["labels"].values
report(bertc_eval, "bert_pred")

# Sentence-level exact match: a sentence counts only if every hyphen decision matches gold.
bertc_sent_pred = reconstruct(bertc_df, "bert_pred")
bertc_sent_acc = sum(p==t for p,t in zip(bertc_sent_pred, true_sentences)) / len(true_sentences)
print(f"[BERT-cased] Sentence-level exact match (filtered): {bertc_sent_acc:.4f}")


  0%|          | 0/442 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/221 [00:00<?, ?it/s]

[bert_pred] Token Acc: 0.7558 | HYPHEN P/R/F1: 0.7232/0.8637/0.7872
              precision    recall  f1-score   support

      HYPHEN     0.7232    0.8637    0.7872      1086
   NO-HYPHEN     0.8100    0.6374    0.7134       990

    accuracy                         0.7558      2076
   macro avg     0.7666    0.7505    0.7503      2076
weighted avg     0.7646    0.7558    0.7520      2076

[BERT-cased] Sentence-level exact match (filtered): 0.3371


In [34]:
# Drop the notebook's own reference first: cleanup() only ever sees a local
# alias, so the model cannot be collected while `bertc` still points at it.
bertc = None
cleanup()


In [35]:
# Fine-tune XLM-RoBERTa (base) with the same arguments and data as the other models.
xlmr_args = low_mem_args()
xlmr_args.output_dir = "xlmr_te_output"

xlmr = NERModel(
    "xlmroberta", "xlm-roberta-base",
    args=xlmr_args.__dict__,  # settings are handed over as a plain dict
    use_cuda=USE_CUDA
)
xlmr.train_model(train_df, eval_data=val_df)




config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

  0%|          | 0/1590 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/795 [00:00<?, ?it/s]

  0%|          | 0/177 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/89 [00:00<?, ?it/s]



Running Epoch 1 of 3:   0%|          | 0/795 [00:00<?, ?it/s]

  0%|          | 0/177 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/89 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/795 [00:00<?, ?it/s]

  0%|          | 0/177 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/89 [00:00<?, ?it/s]

(1191,
 defaultdict(list,
             {'global_step': [397, 794, 1191],
              'train_loss': [0.33044350147247314,
               0.07952063530683517,
               0.01659347489476204],
              'eval_loss': [0.27022324033667533,
               0.2581298170674048,
               0.27810846574557424],
              'precision': [np.float64(0.7154605263157895),
               np.float64(0.7192118226600985),
               np.float64(0.7217675941080196)],
              'recall': [np.float64(0.7360406091370558),
               np.float64(0.7411167512690355),
               np.float64(0.7461928934010152)],
              'f1_score': [np.float64(0.725604670558799),
               np.float64(0.73),
               np.float64(0.7337770382695508)]}))

In [36]:
# Predict on the filtered test tokens (one label per row).
xlmr_flat = predict_flat_labels_tokenized_robust(xlmr, test_unlbl_filt, repair_mode="pad_no_hyphen")

# Predictions next to the input words.
xlmr_df = test_unlbl_filt.copy()
xlmr_df["xlmr_pred"] = xlmr_flat

# Add gold labels positionally and print token-level metrics.
xlmr_eval = xlmr_df.copy()
xlmr_eval["labels"] = test_lbl_filt["labels"].values
report(xlmr_eval, "xlmr_pred")

# Sentence-level exact match: a sentence counts only if every hyphen decision matches gold.
xlmr_sent_pred = reconstruct(xlmr_df, "xlmr_pred")
xlmr_sent_acc = sum(p==t for p,t in zip(xlmr_sent_pred, true_sentences)) / len(true_sentences)
print(f"[XLM-R] Sentence-level exact match (filtered): {xlmr_sent_acc:.4f}")


  0%|          | 0/442 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/221 [00:00<?, ?it/s]

[xlmr_pred] Token Acc: 0.9070 | HYPHEN P/R/F1: 0.9161/0.9052/0.9106
              precision    recall  f1-score   support

      HYPHEN     0.9161    0.9052    0.9106      1086
   NO-HYPHEN     0.8973    0.9091    0.9032       990

    accuracy                         0.9070      2076
   macro avg     0.9067    0.9071    0.9069      2076
weighted avg     0.9072    0.9070    0.9071      2076

[XLM-R] Sentence-level exact match (filtered): 0.6493


In [37]:
# Drop the notebook's own reference first: cleanup() only ever sees a local
# alias, so the model cannot be collected while `xlmr` still points at it.
xlmr = None
cleanup()


In [38]:
def show_errors(name, pred_sents, true_sents, max_show=15):
    """Print predicted vs. gold text for sentences that differ.

    Sentences are compared pairwise in order. Printing stops once
    ``max_show`` mismatches have been shown; the check happens after each
    print, so at least one mismatch is shown if any exists.
    """
    printed = 0
    for idx, pair in enumerate(zip(pred_sents, true_sents)):
        pred, gold = pair
        if pred == gold:
            continue
        print(f"[{name}] Sent {idx}")
        print("PRED:", pred)
        print("TRUE:", gold)
        print("-"*60)
        printed += 1
        if printed >= max_show:
            return

# For each model, list sentences whose reconstructed hyphenation differs from
# gold (each call stops after the default max_show of 15 mismatches).
print("Diffs (if any):")
show_errors("mBERT",     mbert_sent_pred,  true_sentences)
show_errors("IndicBERT", indic_sent_pred,  true_sentences)
show_errors("BERT",      bertc_sent_pred,  true_sentences)
show_errors("XLM-R",     xlmr_sent_pred,   true_sentences)


Diffs (if any):
[mBERT] Sent 3
PRED: కొత్త విషయాలు- నేర్చుకోవడం- నాకు- చాలా ఇష్టం
TRUE: కొత్త విషయాలు- నేర్చుకోవడం- నాకు- చాలా- ఇష్టం
------------------------------------------------------------
[mBERT] Sent 6
PRED: పిల్లల సంరక్షణ- చాలా ముఖ్యం
TRUE: పిల్లల సంరక్షణ- చాలా- ముఖ్యం
------------------------------------------------------------
[mBERT] Sent 7
PRED: మేము- కొత్త పెంపుడు- జంతువును- తీసుకున్నాం
TRUE: మేము- కొత్త- పెంపుడు జంతువును- తీసుకున్నాం
------------------------------------------------------------
[mBERT] Sent 8
PRED: నేను- నా స్నేహితులను- చూడటానికి- వెళ్తాను
TRUE: నేను- నా- స్నేహితులను- చూడటానికి- వెళ్తాను
------------------------------------------------------------
[mBERT] Sent 10
PRED: అతను- తన తల్లిదండ్రులకు- సేవ- చేస్తాడు
TRUE: అతను- తన తల్లిదండ్రులకు- సేవ చేస్తాడు
------------------------------------------------------------
[mBERT] Sent 19
PRED: వేసవిలో- చెరకు రసం- తాగడం- చాలా- మంచిది
TRUE: వేసవిలో- చెరకు రసం తాగడం- చాలా మంచిది
-----------------------------------------

In [39]:
# One row per filtered test token: gold label plus every model's prediction.
# Columns are assigned positionally via .values.
# NOTE(review): mbert_df was built from `test_unlbl`, not `test_unlbl_filt`;
# both come from the same filter on the same CSV, so rows presumably line up.
# Confirm.
combo = test_unlbl_filt.copy()
combo["true_label"]  = test_lbl_filt["labels"].values
combo["mbert_pred"]  = mbert_df["mbert_pred"].values
combo["indic_pred"]  = indic_df["indic_pred"].values
combo["bert_pred"]   = bertc_df["bert_pred"].values
combo["xlmr_pred"]   = xlmr_df["xlmr_pred"].values
combo.to_csv("test_predictions_combined_telugu_filtered.csv", index=False)
print("Saved -> test_predictions_combined_telugu_filtered.csv")


Saved -> test_predictions_combined_telugu_filtered.csv


In [40]:
import numpy as np

def metrics_for(df_true, pred_col):
    """Return ``(accuracy, precision, recall, f1)`` for one prediction column.

    Accuracy is over all tokens; precision/recall/F1 are for the "HYPHEN"
    class only. Gold labels come from the ``labels`` column.
    """
    gold = df_true["labels"].tolist()
    guessed = df_true[pred_col].tolist()
    acc = accuracy_score(gold, guessed)
    scores = precision_recall_fscore_support(gold, guessed, labels=["HYPHEN"], average="macro", zero_division=0)
    p, r, f1 = scores[0], scores[1], scores[2]
    return acc, p, r, f1

# Collect one summary row per model: token metrics plus sentence-level exact match.
rows = []
for name, df_eval, sent_pred in [
    ("mBERT",     mbert_eval,  mbert_sent_pred),
    ("IndicBERT", indic_eval,  indic_sent_pred),
    ("BERT",      bertc_eval,  bertc_sent_pred),
    ("XLM-R",     xlmr_eval,   xlmr_sent_pred),
]:
    # The prediction column is found by name: the first column ending in "_pred".
    acc, p, r, f1 = metrics_for(df_eval, [c for c in df_eval.columns if c.endswith("_pred")][0])
    s_acc = sum(p_==t_ for p_, t_ in zip(sent_pred, true_sentences)) / len(true_sentences)
    rows.append([name, acc, p, r, f1, s_acc])

summary = pd.DataFrame(rows, columns=["Model", "Token_Acc", "HYPHEN_P", "HYPHEN_R", "HYPHEN_F1", "Sentence_Exact"])
# Best HYPHEN F1 first.
summary = summary.sort_values("HYPHEN_F1", ascending=False).reset_index(drop=True)
summary


Unnamed: 0,Model,Token_Acc,HYPHEN_P,HYPHEN_R,HYPHEN_F1,Sentence_Exact
0,IndicBERT,0.905588,0.90018,0.921731,0.910828,0.647059
1,XLM-R,0.907033,0.916123,0.905157,0.910607,0.649321
2,mBERT,0.905588,0.906764,0.913444,0.910092,0.638009
3,BERT,0.75578,0.723207,0.86372,0.787243,0.337104
