In [None]:
# Mount Google Drive; later cells read the dataset CSV from, and save the model to, /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from datasets import Dataset
import pandas as pd

# Load the BIO-tagged CSV from Drive; later cells read its "tokens" and "BIO_tags" columns.
df = pd.read_csv("/content/drive/MyDrive/Dataset1_75k_biotagged.csv")
# Wrap the DataFrame as a Hugging Face Dataset so it can be processed with .map() below.
dataset = Dataset.from_pandas(df)


In [None]:
# Tag inventory for the BIO scheme; a tag's position in the list is its class id.
label_list = ["O", "B-Toxic", "I-Toxic"]
# Forward (tag -> id) and reverse (id -> tag) lookups derived from that ordering.
label2id = dict(zip(label_list, range(len(label_list))))
id2label = dict(enumerate(label_list))


In [None]:
# ===============================
# ✅ CELL 1: Model Training + Auto-Save to Drive
# ===============================

!pip install -q evaluate seqeval

import os
import shutil
from ast import literal_eval

import evaluate
import numpy as np
from datasets import Dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)

# ‚úÖ 0. Mount Google Drive
print("üìÅ Mounting Google Drive...")
from google.colab import drive
# Mount explicitly without forcing a remount; Drive is needed later for the model copy.
drive.mount('/content/drive', force_remount=False)
print("‚úÖ Drive mounted successfully!\n")

# Load the tokenizer that matches the base checkpoint
print("üî§ Loading tokenizer: xlm-roberta-base")
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Load the base model with a token-classification head sized to label_list;
# the label maps are stored in the model config and reused at inference time
# (the inference cell reads model.config.id2label).
print("ü§ñ Loading base model: xlm-roberta-base")
model = AutoModelForTokenClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

# Training arguments (checkpoints go to ./toxic_model_results, logs to ./logs)
training_args = TrainingArguments(
    output_dir="./toxic_model_results",
    do_eval=True,
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs',
    save_total_limit=2,
    report_to="none"
)

# Data collator used by the Trainer to assemble token-classification batches
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Load the seqeval metric used by compute_metrics below
seqeval = evaluate.load("seqeval")

# ‚úÖ 5. Metric computation
def compute_metrics(pred):
    """Score token-classification predictions with seqeval.

    Args:
        pred: A ``(predictions, labels)`` pair. ``predictions`` holds per-token
            class scores (argmax is taken over axis 2); ``labels`` holds the
            gold class ids, with ``-100`` marking positions to ignore.

    Returns:
        dict with ``precision``, ``recall``, ``f1`` and ``accuracy`` taken from
        seqeval's ``overall_*`` results.
    """
    scores, gold_ids = pred
    predicted_ids = np.argmax(scores, axis=2)

    # Gold tag sequences with the ignored (-100) positions dropped.
    true_labels = []
    for gold_row in gold_ids:
        true_labels.append([id2label[gold] for gold in gold_row if gold != -100])

    # Predicted tag sequences, kept only where the gold label is not ignored,
    # so both sequences stay aligned position by position.
    true_predictions = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_tags = []
        for predicted, gold in zip(predicted_row, gold_row):
            if gold != -100:
                kept_tags.append(id2label[predicted])
        true_predictions.append(kept_tags)

    overall = seqeval.compute(predictions=true_predictions, references=true_labels)
    metric_names = ("precision", "recall", "f1", "accuracy")
    return {name: overall[f"overall_{name}"] for name in metric_names}

# ‚úÖ 6. Ensure list conversion
def ensure_list(example):
    """Turn stringified ``tokens`` / ``BIO_tags`` values back into Python lists.

    Values that are already non-string objects are left untouched. The example
    is modified in place and also returned, as ``Dataset.map`` expects.

    Args:
        example: Mapping with ``"tokens"`` and ``"BIO_tags"`` entries.

    Returns:
        The same ``example`` object with both entries parsed via ``literal_eval``
        where they were strings.
    """
    for column in ("tokens", "BIO_tags"):
        raw_value = example[column]
        if isinstance(raw_value, str):
            example[column] = literal_eval(raw_value)
    return example

print("üìä Preparing dataset...")
# Parse stringified list columns row by row so "tokens" / "BIO_tags" are real lists.
dataset = dataset.map(ensure_list)

# ‚úÖ 7. Tokenization + Label Alignment
def tokenize_and_align_labels(example):
    """Tokenize one pre-split example and align its word-level BIO tags.

    The word list is tokenized (truncated and padded to 128 positions). Only the
    first sub-token of each word receives that word's label id; special/padding
    positions and continuation sub-tokens receive ``-100``.

    Args:
        example: Mapping with ``"tokens"`` (list of words) and ``"BIO_tags"``
            (one tag per word, keys of ``label2id``).

    Returns:
        The tokenizer output with an added ``"labels"`` list.
    """
    words = example["tokens"]
    word_tags = example["BIO_tags"]

    encoded = tokenizer(
        words,
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=128,
    )

    aligned_ids = []
    last_word = None
    for current_word in encoded.word_ids():
        # A position is labelled only when it opens a new word.
        opens_new_word = current_word is not None and current_word != last_word
        aligned_ids.append(label2id[word_tags[current_word]] if opens_new_word else -100)
        last_word = current_word

    encoded["labels"] = aligned_ids
    return encoded

# ‚úÖ 8. Map tokenization
print("üîÑ Tokenizing dataset...")
# One example at a time (batched=False), matching tokenize_and_align_labels' signature.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=False)

# Drop the raw word/tag columns now that input ids and label ids exist
tokenized_dataset = tokenized_dataset.remove_columns(["tokens", "BIO_tags"])

# 80/20 train/eval split with a fixed seed so the split is repeatable
dataset_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset_split["train"]
eval_dataset = dataset_split["test"]

print(f"üìà Train samples: {len(train_dataset)}, Eval samples: {len(eval_dataset)}")

# Expose only the model inputs and labels, as torch tensors
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
eval_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# ‚úÖ 12. Initialize Trainer
print("\nüöÄ Initializing Trainer...")
# Wire model, data, collator and metric function together.
# NOTE(review): the captured cell output shows a warning pointing at this call;
# presumably the `tokenizer=` argument is deprecated in the installed
# transformers version - confirm before changing it.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Start training
print("\n" + "="*60)
print("üèãÔ∏è  STARTING TRAINING")
print("="*60 + "\n")
trainer.train()

# ‚úÖ 14. Save model locally
LOCAL_PATH = "./fine_tuned_toxic_span_model"
print(f"\nüíæ Saving model locally to: {LOCAL_PATH}")
trainer.save_model(LOCAL_PATH)
# The tokenizer is saved next to the weights so the inference cell can load both from one folder.
tokenizer.save_pretrained(LOCAL_PATH)
print("‚úÖ Model saved locally")

# Save to Google Drive so the model survives a runtime reset.
DRIVE_PATH = "/content/drive/MyDrive/fine_tuned_toxic_span_model"
# The new copy is staged here first; the previous Drive model is only removed
# once the staged copy has been written completely.
DRIVE_STAGING_PATH = DRIVE_PATH + ".tmp"
print(f"\n‚òÅÔ∏è  Copying model to Google Drive: {DRIVE_PATH}")

drive_copy_ok = False
try:
    # Clear leftovers from an earlier interrupted attempt.
    if os.path.exists(DRIVE_STAGING_PATH):
        shutil.rmtree(DRIVE_STAGING_PATH)

    # Copy to the staging folder; a failure here leaves the old model intact.
    shutil.copytree(LOCAL_PATH, DRIVE_STAGING_PATH)

    # Swap the staged copy into place.
    if os.path.exists(DRIVE_PATH):
        print("   Removing old version...")
        shutil.rmtree(DRIVE_PATH)
    os.rename(DRIVE_STAGING_PATH, DRIVE_PATH)

    drive_copy_ok = True
    print("‚úÖ Model saved to Google Drive!")

except Exception as e:
    # Best effort: training results are still available locally, so report and continue.
    print(f"‚ö†Ô∏è  Error saving to Drive: {str(e)}")

# Final evaluation on the held-out split
print("\nüìä Final Evaluation:")
eval_results = trainer.evaluate()
for key, value in eval_results.items():
    print(f"  {key}: {value:.4f}")

print("\n" + "="*60)
print("‚úÖ TRAINING COMPLETE!")
print("="*60)
print(f"\nüìç Model saved to:")
print(f"   Local: {LOCAL_PATH}")
# Only advertise the Drive location when the copy actually succeeded.
if drive_copy_ok:
    print(f"   Drive: {DRIVE_PATH}")
else:
    print("   Drive: copy FAILED (see error above) - only the local copy is up to date")
print("\nüéâ Ready for inference! Run Cell 2 next.")
print("="*60)

üìÅ Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
‚úÖ Drive mounted successfully!

üî§ Loading tokenizer: xlm-roberta-base


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

ü§ñ Loading base model: xlm-roberta-base


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script: 0.00B [00:00, ?B/s]

üìä Preparing dataset...


Map:   0%|          | 0/7500 [00:00<?, ? examples/s]

üîÑ Tokenizing dataset...


Map:   0%|          | 0/7500 [00:00<?, ? examples/s]

üìà Train samples: 6000, Eval samples: 1500

üöÄ Initializing Trainer...


  trainer = Trainer(



üèãÔ∏è  STARTING TRAINING



Step,Training Loss
500,0.1452
1000,0.1084
1500,0.0988
2000,0.0794
2500,0.0657
3000,0.061
3500,0.0462



üíæ Saving model locally to: ./fine_tuned_toxic_span_model
‚úÖ Model saved locally

‚òÅÔ∏è  Copying model to Google Drive: /content/drive/MyDrive/fine_tuned_toxic_span_model
‚úÖ Model saved to Google Drive!

üìä Final Evaluation:


  eval_loss: 0.1320
  eval_precision: 0.5556
  eval_recall: 0.5439
  eval_f1: 0.5497
  eval_accuracy: 0.9672
  eval_runtime: 10.3756
  eval_samples_per_second: 144.5700
  eval_steps_per_second: 18.1190
  epoch: 5.0000

‚úÖ TRAINING COMPLETE!

üìç Model saved to:
   Local: ./fine_tuned_toxic_span_model
   Drive: /content/drive/MyDrive/fine_tuned_toxic_span_model

üéâ Ready for inference! Run Cell 2 next.


In [None]:
# ===============================
# ✅ CELL 2: Urdu Toxic Span Detection - Inference (With Highlighting)
# ===============================

import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
import numpy as np
import os
from IPython.display import display, HTML

# ‚úÖ 1. Smart model loading (works after reconnection)
def load_model():
    """Locate the fine-tuned model on disk and load it for inference.

    The Google Drive folder is checked before the session-local folder; the
    first one that exists is used.

    Returns:
        tuple: ``(model, tokenizer, device)`` with the model in eval mode and
        moved to CUDA when available, otherwise CPU.

    Raises:
        FileNotFoundError: If neither folder exists.
    """
    print("üîç Searching for trained model...")

    # Persistent Drive copy first, then the temporary local copy.
    candidate_dirs = (
        "/content/drive/MyDrive/fine_tuned_toxic_span_model",
        "./fine_tuned_toxic_span_model",
    )
    model_path = next(
        (folder for folder in candidate_dirs if os.path.exists(folder)),
        None,
    )

    if model_path is None:
        print("\n‚ùå Model not found!")
        print("üìã Mount Google Drive if needed:")
        print("   from google.colab import drive")
        print("   drive.mount('/content/drive')")
        raise FileNotFoundError("Model not found")

    print(f"‚úÖ Found model at: {model_path}")

    print("üìÇ Loading model...")
    # local_files_only=True: load strictly from the folder found above.
    tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
    model = AutoModelForTokenClassification.from_pretrained(model_path, local_files_only=True)
    model.eval()

    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)
    model.to(device)

    print(f"‚úÖ Model loaded on {device}\n")
    return model, tokenizer, device

# Load the model once; the inference helpers below read these three names as globals.
model, tokenizer, device = load_model()

# ‚úÖ 2. Inference function with highlighting
def predict_toxic_spans_with_highlight(text):
    """
    Predicts toxic spans and returns highlighted text.
    """
    # Tokenize
    encoding = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    tokens = {k: v.to(device) for k, v in encoding.items()}

    # Get predictions
    with torch.no_grad():
        outputs = model(**tokens)
        predictions = torch.argmax(outputs.logits, dim=2)

    # Decode predictions
    predicted_labels = [model.config.id2label[p.item()] for p in predictions[0]]
    word_ids = encoding.word_ids()
    tokens_text = tokenizer.convert_ids_to_tokens(encoding["input_ids"][0])

    # Build highlighted text
    highlighted_parts = []
    current_word = []
    current_label = "O"
    prev_word_id = None

    for idx, (token, label, word_id) in enumerate(zip(tokens_text, predicted_labels, word_ids)):
        # Skip special tokens
        if token in ["<s>", "</s>", "<pad>"]:
            continue

        # Check if we're starting a new word
        if word_id != prev_word_id and current_word:
            # Process previous word
            word_text = tokenizer.convert_tokens_to_string(current_word)
            if current_label.startswith("B-") or current_label.startswith("I-"):
                label_name = current_label.split("-")[1] if "-" in current_label else "TOXIC"
                highlighted_parts.append(f'<span style="background-color: #ff6b6b; color: white; padding: 2px 4px; border-radius: 3px; font-weight: bold;" title="{label_name}">{word_text}</span>')
            else:
                highlighted_parts.append(word_text)

            current_word = []
            current_label = "O"

        # Add token to current word
        if word_id is not None:
            current_word.append(token)
            if label.startswith("B-") or label.startswith("I-"):
                current_label = label

        prev_word_id = word_id

    # Process last word
    if current_word:
        word_text = tokenizer.convert_tokens_to_string(current_word)
        if current_label.startswith("B-") or current_label.startswith("I-"):
            label_name = current_label.split("-")[1] if "-" in current_label else "TOXIC"
            highlighted_parts.append(f'<span style="background-color: #ff6b6b; color: white; padding: 2px 4px; border-radius: 3px; font-weight: bold;" title="{label_name}">{word_text}</span>')
        else:
            highlighted_parts.append(word_text)

    # Join parts
    highlighted_text = " ".join(highlighted_parts)

    # Also return toxic spans info
    toxic_spans = []
    current_span = None

    for idx, (token, label, word_id) in enumerate(zip(tokens_text, predicted_labels, word_ids)):
        if label.startswith("B-"):
            if current_span:
                toxic_spans.append(current_span)
            current_span = {
                "label": label[2:],
                "tokens": [token],
            }
        elif label.startswith("I-") and current_span:
            current_span["tokens"].append(token)
        elif current_span:
            toxic_spans.append(current_span)
            current_span = None

    if current_span:
        toxic_spans.append(current_span)

    # Format toxic spans
    toxic_info = []
    for span in toxic_spans:
        span_text = tokenizer.convert_tokens_to_string(span["tokens"])
        toxic_info.append({"text": span_text.strip(), "label": span["label"]})

    return highlighted_text, toxic_info

# ‚úÖ 3. Display function for better visualization
def display_toxic_analysis(text):
    """
    Displays text with toxic spans highlighted in color.
    """
    highlighted_text, toxic_info = predict_toxic_spans_with_highlight(text)

    # Create HTML output
    html_output = f"""
    <div style="font-family: 'Arial', sans-serif; padding: 15px; background-color: #f8f9fa; border-radius: 8px; margin: 10px 0;">
       <div style="font-size: 16px; line-height: 1.8; direction: rtl; text-align: right; color: #212529;">
            {highlighted_text}
        </div>
    """

    if toxic_info:
        html_output += """
        <div style="margin-top: 15px; padding-top: 10px; border-top: 2px solid #dee2e6;">
            <strong style="color: #dc3545;">‚ö†Ô∏è ÿ≤€Åÿ±€åŸÑ€í ÿßŸÑŸÅÿßÿ∏ (Toxic spans detected):</strong>
            <ul style="margin-top: 8px;">
        """
        for span in toxic_info:
            html_output += f'<li style="color: #495057;"><strong>{span["text"]}</strong> <span style="color: #6c757d;">[{span["label"]}]</span></li>'
        html_output += "</ul>"
    else:
        html_output += """
        <div style="margin-top: 15px; padding-top: 10px; border-top: 2px solid #dee2e6; color: #28a745;">
            <strong>‚úÖ ⁄©Ÿàÿ¶€å ÿ≤€Åÿ±€åŸÑÿß ŸæŸÜ ŸÜ€Å€å⁄∫ ŸÖŸÑÿß (No toxicity detected)</strong>
        </div>
        """

    html_output += "</div>"

    display(HTML(html_output))
    return toxic_info

# ‚úÖ 4. Test with Urdu example sentences
# Sample inputs for a quick smoke test of the model; the list mixes
# native-script sentences with two Latin-script (romanised) ones.
test_sentences = [
    "ÿ™ŸÖ ÿ®€Åÿ™ ÿßÿ≠ŸÖŸÇ €ÅŸà ÿßŸàÿ± ⁄©Ÿàÿ¶€å ÿ™ŸÖ€Å€å⁄∫ Ÿæÿ≥ŸÜÿØ ŸÜ€Å€å⁄∫ ⁄©ÿ±ÿ™ÿß",
    "ŸÖ⁄©ŸÖŸÑ ÿ∑Ÿàÿ± Ÿæÿ± ŸÜÿßÿß€ÅŸÑ ÿå €ÅŸÖÿßÿ±€í ÿ®€åŸàÿ±Ÿà⁄©ÿ±€åŸπÿ≥ ⁄©€í Ÿæÿßÿ≥ ⁄©Ÿàÿ¶€å ÿπÿ∞ÿ± ŸÜ€Å€å⁄∫ €Å€í ÿßŸàÿ± Ÿà€Å ŸàÿßŸÇÿπ€å ŸÖ€å⁄∫ ⁄Ø⁄æŸπ€åÿß ŸæŸÜ ŸÜ€Å€å⁄∫ ÿØ€åÿ™€í €Å€å⁄∫!",
    "ÿßÿ≥ ŸÑÿ¶€í ÿ¨ÿ≥ ÿßŸìÿØŸÖ€å ŸÜ€í ÿßÿ≥ ÿ±ÿßÿ™ ŸÖ€å⁄∫ ÿπÿ®ÿßÿØÿ™ ⁄©ÿß ÿß€Åÿ™ŸÖÿßŸÖ ⁄©€åÿß ⁄ØŸà€åÿß ÿßÿ≥ ŸÜ€í ÿßŸÑŸÑ€Å ÿ™ÿπÿßŸÑ€åŸ∞ ⁄©ÿß ÿ¥⁄©ÿ±€å€Å ÿßÿØÿß ⁄©€åÿß ÿßŸàÿ± ÿßŸæŸÜ€í ÿπŸÖŸÑ ÿ≥€í €å€Å ÿ´ÿßÿ®ÿ™ ⁄©€åÿß ⁄©€Å ÿßÿ≥ ⁄©€í ÿØŸÑ ŸÖ€å⁄∫ ŸÇÿ±ÿßŸìŸÜ ŸÖÿ¨€åÿØ ⁄©€å ÿµÿ≠€åÿ≠ ŸÇÿØÿ±ŸàŸÇ€åŸÖÿ™ ⁄©ÿß ÿßÿ≠ÿ≥ÿßÿ≥ ŸÖŸàÿ¨ŸàÿØ €Å€í€î",
    "hmaray news anchors kafi achay hain ,ic bat pr tareef bnti hai judge sahab",
    "Mujhe is faasiq ganwaar, ujhad, bad-tahzeeb, bad-atwaar, ghabi, paagal se kabhi hamdardi nahi thi.",
    "ŸÖ€å⁄∫ ÿ¢Ÿæ ⁄©€å ÿ±ÿßÿ¶€í ÿ≥€í ÿßÿ≠ÿ™ÿ±ÿßŸÖ ⁄©€í ÿ≥ÿßÿ™⁄æ ÿßÿÆÿ™ŸÑÿßŸÅ ⁄©ÿ±ÿ™ÿß €ÅŸà⁄∫",
    "ÿß€å⁄© ÿ≥ÿßÿ®ŸÇ ŸÅŸàÿ¨€åŸà⁄∫ ⁄©€å ÿ≠€åÿ´€åÿ™ ÿ≥€í ÿå ŸÖÿ¨⁄æ€í ŸÑ⁄Øÿ™ÿß €Å€í ⁄©€Å ÿßŸÜ ÿ®€Åÿ™ €Å€å ÿÆÿ®€åÿ´ ÿßŸÑŸÜŸÅÿ≥ ŸÜŸàÿ¨ŸàÿßŸÜŸà⁄∫ ⁄©€å ŸÅŸàÿ¨ ŸÖ€å⁄∫ ⁄©Ÿàÿ¶€å ÿ¨⁄Ø€Å ŸÜ€Å€å⁄∫ €Å€í€î",
    "ÿ¢Ÿæ ⁄©ÿß ÿØŸÜ ÿß⁄Ü⁄æÿß ⁄Øÿ≤ÿ±€í",
]

print("üß™ Testing on Urdu sample sentences:\n")
print("="*60)

# Render each sample with a numbered banner; numbering starts at 1.
for i, sentence in enumerate(test_sentences, 1):
    print(f"\n{'='*60}")
    print(f"ŸÖÿ´ÿßŸÑ {i} (Example {i}):")
    print(f"{'='*60}")
    display_toxic_analysis(sentence)

# ‚úÖ 5. Text-based output (for non-Jupyter environments)
def print_toxic_analysis_text(text):
    """
    Prints analysis in text format with ANSI colors (for terminals).
    """
    highlighted_text, toxic_info = predict_toxic_spans_with_highlight(text)

    # Remove HTML tags for plain text version
    import re
    plain_text = re.sub(r'<span[^>]*>(.*?)</span>', r'[\1]', highlighted_text)
    plain_text = re.sub(r'<[^>]+>', '', plain_text)

    print(f"\nText: {text}")
    print(f"Highlighted: {plain_text}")

    if toxic_info:
        print("\n‚ö†Ô∏è  Toxic spans detected:")
        for span in toxic_info:
            print(f"  - '{span['text']}' [{span['label']}]")
    else:
        print("\n‚úÖ No toxicity detected")

# ‚úÖ 6. Interactive inference with highlighting
print("\n\n" + "="*60)
print("üéÆ ÿßŸÜŸπÿ±ÿß€å⁄©ŸπŸà ŸÖŸà⁄à (Interactive Mode)")
print("="*60)
print("ÿßÿ±ÿØŸà ŸÖÿ™ŸÜ ÿØÿ±ÿ¨ ⁄©ÿ±€å⁄∫ / Enter Urdu text to analyze")
print("Type 'quit' to exit\n")

# Read-analyse loop. NOTE: input() blocks, so this cell waits for the user and
# will stall an unattended "Run all" until an exit keyword is entered.
while True:
    user_input = input("\nText: ")
    # Exit keywords are compared case-insensitively.
    if user_input.lower() in ['quit', 'exit', 'q', 'ÿ®ŸÜÿØ']:
        break

    # Ignore blank lines instead of sending them to the model.
    if not user_input.strip():
        continue

    print("\n" + "-"*60)
    display_toxic_analysis(user_input)
    print("-"*60)

print("\n‚úÖ ÿ™ÿ¨ÿ≤€å€Å ŸÖ⁄©ŸÖŸÑ! (Analysis complete!)")

üîç Searching for trained model...
‚úÖ Found model at: /content/drive/MyDrive/fine_tuned_toxic_span_model
üìÇ Loading model...
‚úÖ Model loaded on cuda

üß™ Testing on Urdu sample sentences:


ŸÖÿ´ÿßŸÑ 1 (Example 1):



ŸÖÿ´ÿßŸÑ 2 (Example 2):



ŸÖÿ´ÿßŸÑ 3 (Example 3):



ŸÖÿ´ÿßŸÑ 4 (Example 4):



ŸÖÿ´ÿßŸÑ 5 (Example 5):



ŸÖÿ´ÿßŸÑ 6 (Example 6):



ŸÖÿ´ÿßŸÑ 7 (Example 7):



ŸÖÿ´ÿßŸÑ 8 (Example 8):




üéÆ ÿßŸÜŸπÿ±ÿß€å⁄©ŸπŸà ŸÖŸà⁄à (Interactive Mode)
ÿßÿ±ÿØŸà ŸÖÿ™ŸÜ ÿØÿ±ÿ¨ ⁄©ÿ±€å⁄∫ / Enter Urdu text to analyze
Type 'quit' to exit


Text: ⁄Ø⁄æŸπ€åÿß ŸæŸÜ ⁄©€í ÿ™ŸÖÿßŸÖ ÿ®€å⁄©ÿßÿ± ⁄à⁄æ€åÿ±€î

------------------------------------------------------------


------------------------------------------------------------

Text: ÿ¥€åÿπ€Å ÿßŸàÿ± ÿ≥ŸÜ€å ÿØŸàŸÜŸà⁄∫ ÿßÿ≥ŸÑÿßŸÖ ⁄©€í ŸÖÿßŸÜŸÜ€í ŸàÿßŸÑ€í €Å€å⁄∫€î

------------------------------------------------------------


------------------------------------------------------------


In [None]:
# ===============================
# ✅ CELL 3: Audio Processing - Beep Toxic Words (Fast Version)
# ===============================

print("üì¶ Installing dependencies...")
!pip install -q openai-whisper pydub 2>&1 | grep -v "already satisfied" || true
!apt-get install -y ffmpeg > /dev/null 2>&1
print("‚úÖ Dependencies installed!\n")

import torch
from pydub import AudioSegment
from pydub.generators import Sine
import json
import os
import whisper
from IPython.display import Audio, display

# ‚úÖ 1. Load Whisper model (ONCE at the start)
print("üé§ Loading Whisper model (tiny - fast)...")
# Loaded once at cell level; transcribe_audio() reads this global on every call.
whisper_model = whisper.load_model("tiny")  # Options: tiny, base, small
print("‚úÖ Whisper model loaded!\n")

# ‚úÖ 2. Upload audio file
def upload_audio():
    """Prompt for a file upload through Colab and return the uploaded file's name.

    Returns:
        str | None: Name of the first uploaded file, or ``None`` when nothing
        was uploaded.
    """
    from google.colab import files

    print("üì§ Upload your Urdu audio file...")
    uploaded = files.upload()

    # Empty mapping: the dialog produced no file.
    if not uploaded:
        return None

    # Only the first uploaded file is used.
    audio_filename = next(iter(uploaded.keys()))
    print(f"‚úÖ Uploaded: {audio_filename}\n")
    return audio_filename

# ‚úÖ 3. Transcribe with word timestamps
def transcribe_audio(audio_path):
    """Transcribe an audio file with the module-level Whisper model.

    Transcription is requested in Urdu (``language="ur"``) with word-level
    timestamps.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        tuple: ``(full_text, word_segments)`` where ``full_text`` is the
        stripped transcript and ``word_segments`` is a list of
        ``{"word", "start", "end"}`` dicts, one per timed word.
    """
    print(f"üéôÔ∏è  Transcribing: {audio_path}")

    result = whisper_model.transcribe(
        audio_path,
        language="ur",
        word_timestamps=True,
        verbose=False,
    )

    # Flatten the per-segment word lists; segments without "words" contribute nothing.
    word_segments = [
        {
            "word": timed_word["word"].strip(),
            "start": timed_word["start"],
            "end": timed_word["end"],
        }
        for segment in result["segments"]
        if "words" in segment
        for timed_word in segment["words"]
    ]

    full_text = result["text"].strip()

    print("‚úÖ Transcription complete!")
    print(f"üìù Text: {full_text}\n")

    return full_text, word_segments

# ‚úÖ 4. Detect toxic words
def detect_toxic_words(text, word_segments):
    """Detect toxic words and match to timestamps."""
    print("üîç Detecting toxic words...")

    # Use your trained model
    _, toxic_info = predict_toxic_spans_with_highlight(text)

    if not toxic_info:
        print("‚úÖ No toxic words detected!\n")
        return []

    print(f"‚ö†Ô∏è  Found {len(toxic_info)} toxic span(s)")

    # Match toxic words to timestamps
    toxic_timestamps = []

    for toxic in toxic_info:
        toxic_text = toxic["text"].strip().lower()

        # Find in word segments
        for segment in word_segments:
            segment_word = segment["word"].strip().lower()

            # Match (exact or contains)
            if toxic_text in segment_word or segment_word in toxic_text:
                toxic_timestamps.append({
                    "word": segment["word"],
                    "start": segment["start"],
                    "end": segment["end"],
                    "label": toxic["label"]
                })
                print(f"   üî¥ '{segment['word']}' at {segment['start']:.1f}s [{toxic['label']}]")

    print()
    return toxic_timestamps

# ‚úÖ 5. Generate beep
def generate_beep(duration_ms, frequency_hz=1000, attenuation_db=10):
    """Generate a sine-wave beep.

    Args:
        duration_ms: Length of the beep in milliseconds. Negative values are
            treated as 0 and yield an empty segment instead of an error.
        frequency_hz: Tone frequency; defaults to the original 1000 Hz.
        attenuation_db: How many dB to reduce the tone by; defaults to the
            original 10 dB.

    Returns:
        pydub ``AudioSegment`` containing the beep.
    """
    tone = Sine(frequency_hz).to_audio_segment(duration=max(0, duration_ms))
    return tone - attenuation_db

# ‚úÖ 6. Beep toxic words
def beep_toxic_words(audio_path, toxic_timestamps, output_path="cleaned_audio.wav"):
    """Replace the flagged time ranges of an audio file with beeps.

    Args:
        audio_path: Path of the source audio file.
        toxic_timestamps: List of dicts with ``"word"``, ``"start"`` and
            ``"end"`` (seconds). Ranges are clamped to the clip; empty or
            inverted ranges are skipped.
        output_path: Where the result is written (always exported as WAV).

    Returns:
        str: ``output_path``.
    """
    print(f"üîä Loading audio: {audio_path}")
    audio = AudioSegment.from_file(audio_path)

    if not toxic_timestamps:
        print("‚úÖ No beeping needed - saving original\n")
        audio.export(output_path, format="wav")
        return output_path

    print(f"üîá Beeping {len(toxic_timestamps)} word(s)...")

    # len() of a pydub segment is its duration in milliseconds.
    total_ms = len(audio)

    # Process from the end of the clip backwards
    toxic_timestamps = sorted(toxic_timestamps, key=lambda x: x["start"], reverse=True)

    for toxic in toxic_timestamps:
        # Clamp to the clip so a timestamp past the end cannot lengthen the output.
        start_ms = min(max(int(toxic["start"] * 1000), 0), total_ms)
        end_ms = min(max(int(toxic["end"] * 1000), 0), total_ms)

        if end_ms <= start_ms:
            print(f"   - Skipped '{toxic['word']}' (empty or out-of-range interval)")
            continue

        # Beep has the same duration as the removed slice, so later offsets stay valid.
        beep = generate_beep(end_ms - start_ms)
        audio = audio[:start_ms] + beep + audio[end_ms:]
        print(f"   ‚úì Beeped '{toxic['word']}' at {toxic['start']:.1f}s")

    print(f"\nüíæ Saving cleaned audio: {output_path}")
    audio.export(output_path, format="wav")
    print("‚úÖ Done!\n")

    return output_path

# ‚úÖ 7. Complete pipeline
def process_audio(audio_path):
    """Run the full audio pipeline: transcribe, detect, beep, report, download.

    Side effects: writes ``cleaned_audio.wav`` and ``audio_report.json`` in the
    working directory, shows audio players for the original and cleaned clips,
    and triggers a Colab download of the cleaned file.

    Args:
        audio_path: Path of the audio file to process.

    Returns:
        str: Path of the cleaned audio file.
    """
    banner = "=" * 60
    print(banner)
    print("üéµ PROCESSING AUDIO")
    print(banner + "\n")

    # Transcribe -> detect -> beep
    transcript, timed_words = transcribe_audio(audio_path)
    flagged_words = detect_toxic_words(transcript, timed_words)
    cleaned_audio = beep_toxic_words(audio_path, flagged_words, "cleaned_audio.wav")

    # Summary
    print(banner)
    print("‚úÖ RESULTS")
    print(banner)
    print(f"üìù Transcript: {transcript}")
    print(f"‚ö†Ô∏è  Toxic words: {len(flagged_words)}")
    print(banner + "\n")

    # Audio players for before/after comparison
    players = (
        ("üéß Original Audio:", audio_path),
        ("\nüéß Cleaned Audio (toxic words beeped):", cleaned_audio),
    )
    for caption, clip_path in players:
        print(caption)
        display(Audio(clip_path, autoplay=False))

    # JSON report (ensure_ascii=False keeps non-ASCII transcript text readable)
    report = {
        "transcript": transcript,
        "toxic_words": flagged_words,
        "total_toxic": len(flagged_words),
    }
    with open("audio_report.json", "w", encoding="utf-8") as report_file:
        report_file.write(json.dumps(report, indent=2, ensure_ascii=False))

    print("\nüìÑ Report saved: audio_report.json")

    # Offer the cleaned file for download
    print("\n‚¨áÔ∏è  Download cleaned audio:")
    from google.colab import files
    files.download(cleaned_audio)

    return cleaned_audio

# ‚úÖ 8. MAIN EXECUTION
print("\n" + "="*60)
print("üé§ URDU AUDIO TOXIC WORD BEEPING")
print("="*60 + "\n")

print("Choose option:")
print("1. Upload audio file")
print("2. Use file path")
print("3. Record audio (from microphone)\n")

choice = input("Enter choice (1/2/3): ").strip()

if choice == "1":
    # Upload through the Colab file dialog
    audio_file = upload_audio()
    if audio_file:
        process_audio(audio_file)

elif choice == "2":
    # Use a file that already exists on the runtime
    audio_file = input("Enter audio file path: ").strip()
    if os.path.exists(audio_file):
        process_audio(audio_file)
    else:
        print(f"‚ùå File not found: {audio_file}")

elif choice == "3":
    # Record from the browser microphone, then run the normal pipeline on the recording.
    from base64 import b64decode
    from google.colab import output
    from IPython.display import Javascript

    RECORD_SECONDS = 10
    # The browser chooses the container (typically WebM or Ogg); downstream
    # loading goes through ffmpeg, which presumably detects the real format
    # from the file content - confirm on the target browser.
    RECORDED_AUDIO_PATH = "recorded_audio.webm"

    # Defines a global JS function record(ms) that resolves to a base64 data URL.
    RECORD = """
    const sleep  = time => new Promise(resolve => setTimeout(resolve, time))
    const b2text = blob => new Promise(resolve => {
      const reader = new FileReader()
      reader.onloadend = e => resolve(e.srcElement.result)
      reader.readAsDataURL(blob)
    })

    var record = time => new Promise(async resolve => {
      stream = await navigator.mediaDevices.getUserMedia({ audio: true })
      recorder = new MediaRecorder(stream)
      chunks = []
      recorder.ondataavailable = e => chunks.push(e.data)
      recorder.start()
      await sleep(time)
      recorder.onstop = async ()=>{
        blob = new Blob(chunks)
        text = await b2text(blob)
        resolve(text)
      }
      recorder.stop()
    })
    """

    print("\nüéôÔ∏è  Recording audio...")
    print(f"This will record for {RECORD_SECONDS} seconds. Speak now!")
    print("Allow microphone access if the browser asks for it.")

    try:
        display(Javascript(RECORD))
        # Blocks until the JS promise resolves with "data:<mime>;base64,<payload>".
        recording_data_url = output.eval_js(f"record({RECORD_SECONDS * 1000})")
        recording_bytes = b64decode(recording_data_url.split(",", 1)[1])
    except Exception as e:
        # Typical causes: microphone permission denied or no input device.
        print(f"Recording failed: {e}")
        print("Please use option 1 (upload) or 2 (file path) instead.")
    else:
        with open(RECORDED_AUDIO_PATH, "wb") as recording_file:
            recording_file.write(recording_bytes)
        print(f"Recording saved: {RECORDED_AUDIO_PATH}\n")
        process_audio(RECORDED_AUDIO_PATH)

else:
    print("‚ùå Invalid choice!")

print("\n‚úÖ Script complete!")