In [None]:
# Mount Google Drive at /content/drive; the dataset CSV and the model save
# directory configured later in this notebook both live under that mount point.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:

%pip uninstall torch torchvision torchaudio -y -q
%pip cache purge -q
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 -q
%pip install transformers datasets evaluate seqeval -q

import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")
else:
    print("⚠️ GPU not detected — go to Runtime > Change runtime type > GPU")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m780.4/780.4 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.3/7.3 MB[0m [31m119.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m91.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m85.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m59.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m113.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━

In [None]:
# ===============================
# CELL 2: Imports
# ===============================
import os
import json
import numpy as np
import pandas as pd
from ast import literal_eval
from collections import Counter

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

import torch
from datasets import Dataset
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)


In [None]:
# ===============================
# CELL 3: Device Setup
# ===============================
# Prefer the GPU when one is visible; otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Using device: {DEVICE}")
if DEVICE.type == "cuda":
    # Only query the device name when CUDA was actually selected.
    print(f"GPU: {torch.cuda.get_device_name(0)}")

CUDA available: True
Using device: cuda
GPU: Tesla T4


In [None]:
# ===============================
# CELL 4: Config & Paths
# ===============================

# Input CSV on the mounted Drive; read with pandas in the dataset-loading cell.
DATA_CSV = "/content/drive/MyDrive/URTOX_v2.csv"


# Drive folder that receives the final model and tokenizer after training.
SAVE_DIR = "/content/drive/MyDrive/Urtox_attempt1"
# Hugging Face checkpoint used for both the tokenizer and the token-classification model.
MODEL_NAME = "xlm-roberta-base"
# Sequences are truncated / padded to this many sub-word tokens.
MAX_LENGTH = 128
# Per-device batch sizes, epoch count and learning rate passed to TrainingArguments.
TRAIN_BATCH_SIZE = 32
EVAL_BATCH_SIZE = 32
NUM_EPOCHS = 8
LEARNING_RATE = 2e-5

# Create the save folder up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(SAVE_DIR, exist_ok=True)

In [None]:
# ===============================
# CELL 5: Load Dataset
# ===============================
# Read the annotated corpus and show a quick overview (size, schema, first rows).
df = pd.read_csv(DATA_CSV)
print(f"✅ Dataset loaded: {df.shape[0]} rows")
print(f"Columns: {list(df.columns)}")
print(df.iloc[:2])

def ensure_list(example):
    """Convert stringified ``tokens`` / ``BIO_tags`` fields back into Python objects.

    Each of the two fields is parsed with ``ast.literal_eval`` when it holds a
    string; values of any other type are left untouched. If parsing raises
    ``ValueError`` or ``SyntaxError``, a warning is printed and the field is
    replaced by an empty list.

    Args:
        example: Mapping with at least the keys ``"tokens"`` and ``"BIO_tags"``;
            an optional ``"id"`` is used only in the warning message.

    Returns:
        The same ``example`` object, updated in place.
    """
    for field in ("tokens", "BIO_tags"):
        raw_value = example[field]
        if not isinstance(raw_value, str):
            # Already materialised (or some other type) -- nothing to parse.
            continue
        try:
            parsed = literal_eval(raw_value)
        except (ValueError, SyntaxError) as e:
            print(f"Warning: Could not parse {field} for id {example.get('id', 'N/A')}: {e}. Setting to empty list.")
            parsed = []
        example[field] = parsed
    return example

# Wrap the DataFrame as a Hugging Face Dataset and parse the stringified list
# columns row by row in a single chained expression.
dataset = Dataset.from_pandas(df).map(ensure_list)

✅ Dataset loaded: 14337 rows
Columns: ['id', 'text', 'label', 'sub_label', 'toxic_spans', 'tokens', 'toxic_list', 'BIO_tags']
     id                                               text      label  \
0  5003  ایک جملہ ھی حضرت علی علیہ السلام کی فضیلت کے ل...  non_toxic   
1  5005  علامہ یٰسین صاحب آپ نے جس زاویے سے حضرت علی عل...  non_toxic   

  sub_label toxic_spans                                             tokens  \
0    normal        \n[]  ['ایک', 'جملہ', 'ھی', 'حضرت', 'علی', 'علیہ', '...   
1    normal          []  ['علامہ', 'یٰسین', 'صاحب', 'آپ', 'نے', 'جس', '...   

  toxic_list                                           BIO_tags  
0     ['[]']  ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...  
1     ['[]']  ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...  


Map:   0%|          | 0/14337 [00:00<?, ? examples/s]



In [None]:
# ===============================
# CELL 6: Label Mappings & Tokenizer
# ===============================
# BIO tag inventory; a tag's position in this list is its integer class id.
label_list = ["O", "B-Toxic", "I-Toxic"]
label2id = dict(zip(label_list, range(len(label_list))))
id2label = dict(enumerate(label_list))

# Tokenizer matching the pretrained checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# ===============================
# CELL 7: Tokenize & Align Labels
# ===============================
def tokenize_and_align_labels(example):
    """Tokenize one pre-split example and align its BIO tags to sub-word tokens.

    The word list in ``example["tokens"]`` is tokenized with the module-level
    ``tokenizer`` (truncated and padded to ``MAX_LENGTH``). A label id is then
    produced for every sub-word token:

    * special / padding tokens (``word_id`` is ``None``) get ``-100``;
    * the first sub-token of a word gets the id of that word's BIO tag, or
      ``-100`` when the word index has no entry in ``BIO_tags``;
    * continuation sub-tokens inherit the label of the word's first sub-token,
      except that a ``B-X`` label is continued as ``I-X``. Repeating ``B-X``
      on every piece would turn a single word into several consecutive
      entity starts, which fragments spans for the entity-level metric and
      teaches the model to emit ``B-`` in the middle of a word.

    Args:
        example: Mapping with ``"tokens"`` (list of words) and ``"BIO_tags"``
            (list of tag strings that are keys of ``label2id``).

    Returns:
        The tokenizer output with an added ``"labels"`` list that has one entry
        per sub-word token.
    """
    tokens = example["tokens"]
    labels = example["BIO_tags"]
    tokenized_input = tokenizer(
        tokens,
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH
    )

    # Map every B-X id to its I-X id (when that tag exists) so continuation
    # pieces stay inside the span instead of opening a new one.
    begin_to_inside = {
        idx: label2id["I-" + name[2:]]
        for name, idx in label2id.items()
        if name.startswith("B-") and ("I-" + name[2:]) in label2id
    }

    word_ids = tokenized_input.word_ids()
    label_ids = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            # Special or padding token: ignored by the loss.
            label_ids.append(-100)
        elif word_idx == previous_word_idx:
            # Continuation piece: inherit the word's label, demoting B-X to I-X.
            # An ignored (-100) parent is not in the mapping and stays ignored.
            inherited = label_ids[-1]
            label_ids.append(begin_to_inside.get(inherited, inherited))
        elif word_idx < len(labels):
            label_ids.append(label2id[labels[word_idx]])
        else:
            # More words than tags: no gold label available, so ignore the token.
            label_ids.append(-100)
        previous_word_idx = word_idx
    tokenized_input["labels"] = label_ids
    return tokenized_input

# Tokenize example by example (the alignment function expects a single row).
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=False)

# Keep only the model inputs; every other column is dropped before training.
cols_to_remove = [
    name
    for name in tokenized_dataset.column_names
    if name not in ("input_ids", "attention_mask", "labels")
]
tokenized_dataset = tokenized_dataset.remove_columns(cols_to_remove)

Map:   0%|          | 0/14337 [00:00<?, ? examples/s]

In [None]:
# ===============================
# CELL 8: Split Dataset
# ===============================
# Seeded 80/20 split of the tokenized data.
dataset_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, eval_dataset = dataset_split["train"], dataset_split["test"]

# Have both splits return torch tensors for the three model-input columns.
for _split in (train_dataset, eval_dataset):
    _split.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

print(f"Train size: {len(train_dataset)}, Eval size: {len(eval_dataset)}")

Train size: 11469, Eval size: 2868


In [None]:
# ===============================
# CELL 9: Load Model
# ===============================
# Load the pretrained encoder with a token-classification head sized to the
# tag inventory, and place it on the selected device in the same expression.
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    label2id=label2id,
    id2label=id2label,
    num_labels=len(label_list),
).to(DEVICE)
print("✅ Model loaded and moved to", DEVICE)

# ===============================
# CELL 10: Training Arguments
# ===============================
training_args = TrainingArguments(
    # --- Output locations and reporting ---
    output_dir="/content/toxic_model_results",
    logging_dir="/content/logs",
    logging_steps=50,
    report_to="none",

    # --- Optimisation ---
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_ratio=0.1,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    fp16=torch.cuda.is_available(),    # Enable mixed precision on GPU

    # --- Evaluation and checkpointing (same step interval for both) ---
    do_eval=True,
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,

    # --- Model selection: reload the checkpoint with the highest F1 ---
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

XLMRobertaForTokenClassification LOAD REPORT from: xlm-roberta-base
Key                         | Status     | 
----------------------------+------------+-
lm_head.dense.weight        | UNEXPECTED | 
roberta.pooler.dense.weight | UNEXPECTED | 
lm_head.dense.bias          | UNEXPECTED | 
lm_head.layer_norm.bias     | UNEXPECTED | 
roberta.pooler.dense.bias   | UNEXPECTED | 
lm_head.bias                | UNEXPECTED | 
lm_head.layer_norm.weight   | UNEXPECTED | 
classifier.weight           | MISSING    | 
classifier.bias             | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.
warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
`logging_dir` is deprecated and will be removed in v5.2. Please set `TENSORBOARD_LOGGING_DIR` instead.


✅ Model loaded and moved to cuda


In [None]:
# ===============================
# CELL 11: Data Collator & Metrics
# ===============================
# Batch collator for token classification, built from the notebook's tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer)
# seqeval metric loaded through `evaluate`; used by compute_metrics below.
seqeval = evaluate.load("seqeval")

def compute_metrics(pred):
    """Compute seqeval precision, recall, F1 and accuracy for one evaluation pass.

    Args:
        pred: Pair ``(predictions, labels)``. ``predictions`` holds per-token
            class scores (arg-maxed over axis 2) and ``labels`` holds the gold
            label ids, with ``-100`` marking positions to ignore.

    Returns:
        Dict with the keys ``precision``, ``recall``, ``f1`` and ``accuracy``,
        taken from seqeval's ``overall_*`` results.
    """
    scores, gold_ids = pred
    predicted_ids = np.argmax(scores, axis=2)

    true_labels = []
    true_predictions = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions that carry a real gold label.
        kept = [(p, g) for p, g in zip(predicted_row, gold_row) if g != -100]
        true_labels.append([id2label[g] for _, g in kept])
        true_predictions.append([id2label[p] for p, _ in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    metric_names = ("precision", "recall", "f1", "accuracy")
    return {name: results[f"overall_{name}"] for name in metric_names}

# ===============================
# CELL 12: Trainer & Train
# ===============================
# Wire the model, data, collator and metric function into the Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

print("🚀 Starting training...")
trainer.train()

# ===============================
# CELL 13: Save Model
# ===============================
# Persist the trainer's current model and the tokenizer side by side in
# SAVE_DIR so the pair can be reloaded with from_pretrained(SAVE_DIR).
trainer.save_model(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
print(f"✅ Training complete. Model saved at {SAVE_DIR}")

# ===============================
# CELL 14: Reload Model for Inference
# ===============================
# Re-derive the device so this section also works after a kernel restart.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Reload the saved tokenizer and model, move the model to the device and
# switch it to evaluation mode in one chained expression.
tokenizer = AutoTokenizer.from_pretrained(SAVE_DIR)
model = AutoModelForTokenClassification.from_pretrained(SAVE_DIR).to(DEVICE).eval()
print("✅ Model reloaded for inference on", DEVICE)

# ===============================
# CELL 15: Evaluate on Test Set
# ===============================
print("\n📊 Evaluating on test set...")
eval_results = trainer.evaluate()

# Floats are shown with four decimals; every other value is printed as-is.
print("\n🎯 Evaluation Results:")
for metric_name, metric_value in eval_results.items():
    if isinstance(metric_value, float):
        print(f"  {metric_name}: {metric_value:.4f}")
    else:
        print(f"  {metric_name}: {metric_value}")

# ===============================
# CELL 16: Inference Function
# ===============================
def predict_toxic_spans(text):
    """Tag a whitespace-split text with the fine-tuned model and return its spans.

    The text is split on whitespace, tokenized as pre-split words (truncated and
    padded to ``MAX_LENGTH``) and run through the module-level ``model``. Only
    the prediction on the first sub-token of each word is used. A ``B-X`` tag
    opens a span, a following ``I-X`` tag with the same ``X`` extends it, and
    anything else closes the open span.

    Args:
        text: Input string.

    Returns:
        List of ``{"text": <space-joined words>, "label": <X>}`` dicts in order
        of appearance; an empty list when ``text`` contains no words.
    """
    words = text.strip().split()
    if len(words) == 0:
        return []

    encoding = tokenizer(
        words,
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
        return_tensors="pt",
    )

    # word_ids() lives on the encoding object, so read it before building the
    # plain dict of device tensors.
    word_ids = encoding.word_ids(batch_index=0)
    model_inputs = {name: tensor.to(DEVICE) for name, tensor in encoding.items()}

    with torch.no_grad():
        logits = model(**model_inputs).logits
        predictions = torch.argmax(logits, dim=2)[0].cpu().numpy()

    predicted_labels = [model.config.id2label[p] for p in predictions]

    toxic_spans = []
    open_span = None
    last_word_id = None

    for label, word_id in zip(predicted_labels, word_ids):
        # Skip special/padding tokens and non-initial sub-tokens of a word.
        if word_id is None or word_id == last_word_id:
            continue

        word = words[word_id]
        starts_span = label.startswith("B-")
        extends_span = (
            not starts_span
            and label.startswith("I-")
            and open_span is not None
            and open_span["label"] == label[2:]
        )

        if extends_span:
            open_span["words"].append(word)
        else:
            # Any other tag terminates the span in progress, if there is one.
            if open_span is not None:
                toxic_spans.append(open_span)
            open_span = {"label": label[2:], "words": [word]} if starts_span else None

        last_word_id = word_id

    if open_span is not None:
        toxic_spans.append(open_span)

    results = []
    for span in toxic_spans:
        results.append({"text": " ".join(span["words"]), "label": span["label"]})
    return results

# ===============================
# CELL 17: Test on Urdu Sentences
# ===============================
# Hand-written Urdu examples (a mix of toxic and neutral) for a quick sanity check.
test_sentences = [
    "تم بہت احمق ہو اور کوئی تمہیں پسند نہیں کرتا",
    "یہ ایک عام جملہ ہے جس میں کوئی زہریلا پن نہیں ہے",
    "چپ کرو تم بیوقوف ہو",
    "میں آپ کی رائے سے احترام کے ساتھ اختلاف کرتا ہوں",
    "اپنے آپ کو مار ڈالو ہارے ہوئے",
    "تم واقعی میں کمینے ہو",
    "آپ کا دن اچھا گزرے",
    "کنجر کی اولاد تمہاری ماں کو بھی شرم نہیں آتی",
]

print("\n🧪 Urdu Sample Testing:\n")
for i, sentence in enumerate(test_sentences, start=1):
    print(f"مثال {i}: {sentence}")
    spans = predict_toxic_spans(sentence)
    if not spans:
        print("  ✅ کوئی زہریلا پن نہیں ملا")
    else:
        print("  ⚠️ زہریلے الفاظ:")
        for span in spans:
            print(f"    - '{span['text']}' [{span['label']}]")
    print()

# ===============================
# CELL 18: Save Predictions on Test Set
# ===============================
print("\n💾 Saving test-set predictions...")
test_predictions = []

# Recover which DataFrame rows ended up in the held-out split.
# `train_test_split` shuffles, and a Hugging Face `Dataset` has no public
# `indices` attribute, so a `hasattr(eval_dataset, "indices")` check is always
# False and would silently fall back to rows 0..N-1 (mostly training rows).
# The row mapping of a split is kept in the private `_indices` table; it is
# None only when the dataset has no index mapping, in which case its rows are
# already in sequential order.
_indices_table = getattr(eval_dataset, "_indices", None)
if _indices_table is not None:
    original_indices = [int(idx) for idx in _indices_table.column(0).to_pylist()]
else:
    print("⚠️ eval_dataset has no index mapping; assuming it covers rows 0..N-1 of df in order.")
    original_indices = list(range(len(eval_dataset)))

for original_idx in original_indices:
    try:
        row = df.iloc[original_idx]
        if "text" in df.columns:
            original_text = row["text"]
        else:
            # No raw text column: rebuild the sentence from its token list.
            row_tokens = row["tokens"]
            if isinstance(row_tokens, str):
                row_tokens = literal_eval(row_tokens)
            original_text = " ".join(row_tokens)
    except Exception as exc:
        # Best effort: keep going, but make the substitution visible.
        print(f"Warning: could not read text for row {original_idx}: {exc}")
        original_text = f"[Row {original_idx}]"

    # Missing or non-string cells (e.g. NaN) would break `.strip()` inside
    # predict_toxic_spans and produce invalid JSON, so normalise them to str.
    if not isinstance(original_text, str):
        original_text = "" if pd.isna(original_text) else str(original_text)

    spans = predict_toxic_spans(original_text)
    test_predictions.append({
        "id": int(original_idx),
        "text": original_text,
        "toxic_spans": spans,
        "num_toxic_spans": len(spans)
    })

output_path = "/content/drive/MyDrive/urdu_test_predictions.json"
with open(output_path, "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps the Urdu text readable in the output file.
    json.dump(test_predictions, f, indent=2, ensure_ascii=False)

print(f"✅ Predictions saved to {output_path}")

# ===============================
# CELL 19: Summary Statistics
# ===============================
# Aggregate counts over the saved per-text predictions.
total_texts       = len(test_predictions)
texts_with_tox    = sum(p["num_toxic_spans"] > 0 for p in test_predictions)
total_toxic_spans = sum(p["num_toxic_spans"]     for p in test_predictions)

# Guard the ratios so an empty prediction list prints zeros instead of
# raising ZeroDivisionError.
toxic_pct     = texts_with_tox / total_texts * 100 if total_texts else 0.0
avg_spans     = total_toxic_spans / total_texts if total_texts else 0.0

print("\n📈 Summary Statistics:")
print(f"  Total texts analyzed : {total_texts}")
# The percentage is wrapped in exactly one pair of parentheses (the earlier
# format string emitted a stray extra ")").
print(f"  Texts with toxicity  : {texts_with_tox} ({toxic_pct:.1f}%)")
print(f"  Total toxic spans    : {total_toxic_spans}")
print(f"  Avg spans per text   : {avg_spans:.2f}")

# Count how often each span label was predicted across all texts.
all_labels   = [s["label"] for p in test_predictions for s in p["toxic_spans"]]
label_counts = Counter(all_labels)

print("\n🏷️ Label Distribution:")
if label_counts:
    for label, count in label_counts.most_common():
        print(f"  {label}: {count}")
else:
    print("  No toxic spans detected in test set.")

print("\n✅ Evaluation & Inference Complete!")

🚀 Starting training...


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
100,0.308106,0.243419,0.0,0.0,0.0,0.941667
200,0.197436,0.147262,0.423712,0.440627,0.432004,0.950728
300,0.153652,0.116218,0.555413,0.54573,0.550529,0.960675
400,0.129294,0.110328,0.645694,0.513391,0.571992,0.963445
500,0.110978,0.100217,0.643362,0.597524,0.619597,0.965629
600,0.103839,0.101123,0.617543,0.661698,0.638858,0.964571
700,0.101213,0.098789,0.615929,0.68191,0.647242,0.964927
800,0.090799,0.109689,0.695957,0.552299,0.615861,0.965918
900,0.093949,0.099451,0.626181,0.686458,0.654936,0.966581
1000,0.111972,0.098441,0.629066,0.679131,0.653141,0.966168


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.output.LayerNorm.weight', 'roberta.encoder.layer.0.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention.output.LayerNorm.weight', 'roberta.encoder.layer.1.attention.output.LayerNorm.bias', 'roberta.encoder.layer.1.output.LayerNorm.weight', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.output.LayerNorm.weight', 'roberta.encoder.layer.2.attention.output.LayerNorm.bias', 'roberta.encoder.layer.2.output.LayerNorm.weight', 'roberta.encoder.layer.2.output.LayerNorm.bias', 'roberta.encoder.layer.3.attention.output.LayerNorm.weight', 'roberta.encoder.layer.3.attention.output.LayerNorm.bias', 'roberta.encoder.layer.3.output.LayerNorm.weight', 'roberta.encoder.layer.3.output.Laye

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Training complete. Model saved at /content/drive/MyDrive/Urtox_attempt1


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

✅ Model reloaded for inference on cuda

📊 Evaluating on test set...



🎯 Evaluation Results:
  eval_loss: 0.1097
  eval_precision: 0.6423
  eval_recall: 0.7155
  eval_f1: 0.6769
  eval_accuracy: 0.9673
  eval_runtime: 9.9835
  eval_samples_per_second: 287.2740
  eval_steps_per_second: 9.0150
  epoch: 8.0000

🧪 Urdu Sample Testing:

مثال 1: تم بہت احمق ہو اور کوئی تمہیں پسند نہیں کرتا
  ⚠️ زہریلے الفاظ:
    - 'احمق' [Toxic]

مثال 2: یہ ایک عام جملہ ہے جس میں کوئی زہریلا پن نہیں ہے
  ⚠️ زہریلے الفاظ:
    - 'زہریلا پن' [Toxic]

مثال 3: چپ کرو تم بیوقوف ہو
  ⚠️ زہریلے الفاظ:
    - 'بیوقوف' [Toxic]

مثال 4: میں آپ کی رائے سے احترام کے ساتھ اختلاف کرتا ہوں
  ✅ کوئی زہریلا پن نہیں ملا

مثال 5: اپنے آپ کو مار ڈالو ہارے ہوئے
  ⚠️ زہریلے الفاظ:
    - 'مار ڈالو' [Toxic]
    - 'ہارے ہوئے' [Toxic]

مثال 6: تم واقعی میں کمینے ہو
  ⚠️ زہریلے الفاظ:
    - 'کمینے' [Toxic]

مثال 7: آپ کا دن اچھا گزرے
  ✅ کوئی زہریلا پن نہیں ملا

مثال 8: کنجر کی اولاد تمہاری ماں کو بھی شرم نہیں آتی
  ⚠️ زہریلے الفاظ:
    - 'کنجر کی اولاد' [Toxic]


💾 Saving test-set predictions...
✅ Predic