In [1]:
!pip install transformers datasets -q

In [2]:
import os
import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForTokenClassification
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from ast import literal_eval

print("✅ Imports done")
print(f"✅ GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else '❌ No GPU!'}")

✅ Imports done
✅ GPU: ❌ No GPU!


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import os

folder = "/content/drive/MyDrive/Urtox_attempt1"

if os.path.exists(folder):
    print("✅ Folder exists!")
    print("Files inside:")
    for f in os.listdir(folder):
        print(f"  {f}")
else:
    print("❌ Folder does not exist!")

✅ Folder exists!
Files inside:
  training_args.bin
  config.json
  tokenizer_config.json
  model.safetensors
  tokenizer.json


In [7]:
SAVE_DIR  = "/content/drive/MyDrive/Urtox_attempt1"
DEVICE    = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MAX_LENGTH = 128

tokenizer  = AutoTokenizer.from_pretrained(SAVE_DIR)
text_model = AutoModelForTokenClassification.from_pretrained(SAVE_DIR)
text_model = text_model.to(DEVICE)
text_model.eval()

print("✅ Text model reloaded!")
print(f"Labels: {text_model.config.id2label}")

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

✅ Text model reloaded!
Labels: {0: 'O', 1: 'B-Toxic', 2: 'I-Toxic'}


In [16]:
df = pd.read_csv("/content/drive/MyDrive/urdu_toxic_audio_dataset.csv")

def ensure_list(x):
    if isinstance(x, str):
        try:
            return literal_eval(x)
        except:
            return []
    return x

df['tokens']   = df['tokens'].apply(ensure_list)
df['BIO_tags'] = df['BIO_tags'].apply(ensure_list)

# Same split as training (seed=42, test_size=0.2)
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
test_df = test_df.reset_index(drop=True)

print(f"✅ Test set size: {len(test_df)}")
print(f"Label distribution:\n{test_df['label'].value_counts()}")

✅ Test set size: 2868
Label distribution:
label
toxic        1532
non_toxic    1336
Name: count, dtype: int64


In [9]:
label2id = {"O": 0, "B-Toxic": 1, "I-Toxic": 2}
id2label = {v: k for k, v in label2id.items()}

text_probs_list  = []
text_labels_list = []

print(f"Getting text probabilities for {len(test_df)} rows...")

for idx, row in test_df.iterrows():
    words  = row['tokens']
    label  = row['label']

    if not words:
        continue

    encoding = tokenizer(
        words,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_LENGTH,
        padding="max_length"
    )

    model_inputs = {k: v.to(DEVICE) for k, v in encoding.items()}

    with torch.no_grad():
        outputs = text_model(**model_inputs)
        # Mean pool logits across tokens → sentence level
        logits  = outputs.logits[0]  # (seq_len, num_labels)
        # Get B-Toxic probability as toxic score
        probs   = torch.softmax(logits, dim=-1)
        # Take max B-Toxic prob across all tokens as sentence toxic score
        toxic_prob     = probs[:, 1].max().item()   # B-Toxic
        non_toxic_prob = 1 - toxic_prob

    text_probs_list.append([non_toxic_prob, toxic_prob])
    text_labels_list.append(1 if label == 'toxic' else 0)

    if idx % 500 == 0:
        print(f"✅ {idx}/{len(test_df)} done")

text_probs  = np.array(text_probs_list)
text_labels = np.array(text_labels_list)

np.save("/content/drive/MyDrive/text_probs.npy",  text_probs)
np.save("/content/drive/MyDrive/text_labels.npy", text_labels)

print(f"\n✅ Saved!")
print(f"text_probs shape:  {text_probs.shape}")
print(f"text_labels shape: {text_labels.shape}")

Getting text probabilities for 2868 rows...
✅ 0/2868 done
✅ 500/2868 done
✅ 1000/2868 done
✅ 1500/2868 done
✅ 2000/2868 done
✅ 2500/2868 done

✅ Saved!
text_probs shape:  (2868, 2)
text_labels shape: (2868,)


In [10]:
audio_probs  = np.load("/content/drive/MyDrive/audio_probs.npy")
audio_labels = np.load("/content/drive/MyDrive/audio_labels.npy")
le_classes   = np.load("/content/drive/MyDrive/label_classes.npy", allow_pickle=True)

print(f"✅ Audio probs shape:  {audio_probs.shape}")
print(f"✅ Audio labels shape: {audio_labels.shape}")
print(f"✅ Classes: {le_classes}")

✅ Audio probs shape:  (2841, 2)
✅ Audio labels shape: (2841,)
✅ Classes: ['non_toxic' 'toxic']


In [11]:
# Make sure both are same length
min_len = min(len(text_probs), len(audio_probs))

text_probs_aligned  = text_probs[:min_len]
audio_probs_aligned = audio_probs[:min_len]
true_labels         = text_labels[:min_len]  # use text labels as ground truth

print(f"✅ Aligned size: {min_len}")

✅ Aligned size: 2841


In [12]:
# Text F1: 67% → weight 0.5
# Audio F1: 70% → weight 0.5
# Equal weights since both are similar

results = {}

for text_w, audio_w in [(0.5, 0.5), (0.6, 0.4), (0.7, 0.3), (0.4, 0.6)]:
    combined = text_w * text_probs_aligned + audio_w * audio_probs_aligned
    preds    = combined.argmax(axis=1)
    f1       = f1_score(true_labels, preds, average='weighted')
    results[f"text={text_w}_audio={audio_w}"] = f1
    print(f"Text: {text_w} | Audio: {audio_w} | F1: {f1:.4f}")

best_combo = max(results, key=results.get)
print(f"\n🏆 Best combination: {best_combo} → F1: {results[best_combo]:.4f}")

Text: 0.5 | Audio: 0.5 | F1: 0.7726
Text: 0.6 | Audio: 0.4 | F1: 0.7934
Text: 0.7 | Audio: 0.3 | F1: 0.7933
Text: 0.4 | Audio: 0.6 | F1: 0.7370

🏆 Best combination: text=0.6_audio=0.4 → F1: 0.7934


In [13]:
# Use best weights from above
best_text_w  = 0.6   # update based on Cell 9 output
best_audio_w = 0.4   # update based on Cell 9 output

combined_probs = best_text_w * text_probs_aligned + best_audio_w * audio_probs_aligned
final_preds    = combined_probs.argmax(axis=1)

print("📊 FINAL MULTIMODAL RESULTS:")
print(classification_report(
    true_labels,
    final_preds,
    target_names=['non_toxic', 'toxic']
))
print(f"🎯 Final Weighted F1: {f1_score(true_labels, final_preds, average='weighted'):.4f}")

print("\n📊 Individual Model Comparison:")
print(f"  Text model alone  (XLM-RoBERTa): 0.67")
print(f"  Audio model alone (Wav2Vec2):     0.70")
print(f"  Multimodal Fusion:                {f1_score(true_labels, final_preds, average='weighted'):.4f}")

📊 FINAL MULTIMODAL RESULTS:
              precision    recall  f1-score   support

   non_toxic       0.74      0.87      0.80      1327
       toxic       0.87      0.73      0.79      1514

    accuracy                           0.79      2841
   macro avg       0.80      0.80      0.79      2841
weighted avg       0.80      0.79      0.79      2841

🎯 Final Weighted F1: 0.7934

📊 Individual Model Comparison:
  Text model alone  (XLM-RoBERTa): 0.67
  Audio model alone (Wav2Vec2):     0.70
  Multimodal Fusion:                0.7934


In [14]:
results_summary = {
    "text_model_f1"  : 0.67,
    "audio_model_f1" : 0.70,
    "fusion_f1"      : float(f1_score(true_labels, final_preds, average='weighted')),
    "best_weights"   : {"text": best_text_w, "audio": best_audio_w},
    "test_size"      : min_len
}

with open("/content/drive/MyDrive/fusion_results.json", "w") as f:
    json.dump(results_summary, f, indent=2)

print("✅ Results saved!")
print(json.dumps(results_summary, indent=2))

✅ Results saved!
{
  "text_model_f1": 0.67,
  "audio_model_f1": 0.7,
  "fusion_f1": 0.7933790318514666,
  "best_weights": {
    "text": 0.6,
    "audio": 0.4
  },
  "test_size": 2841
}


In [17]:
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2Model

# Load audio model
processor     = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
wav2vec_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base").to(DEVICE)
wav2vec_model.eval()

# Load audio classifier
class AudioToxicClassifier(nn.Module):
    def __init__(self, input_dim=768, hidden_dim=256, num_classes=2):
        super().__init__()
        self.classifier = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, num_classes)
        )
    def forward(self, x):
        return self.classifier(x)

audio_classifier = AudioToxicClassifier().to(DEVICE)
audio_classifier.load_state_dict(
    torch.load("/content/drive/MyDrive/audio_toxic_classifier.pt", map_location=DEVICE)
)
audio_classifier.eval()
print("✅ Audio classifier loaded!")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/211 [00:00<?, ?it/s]

Wav2Vec2Model LOAD REPORT from: facebook/wav2vec2-base
Key                          | Status     |  | 
-----------------------------+------------+--+-
quantizer.codevectors        | UNEXPECTED |  | 
project_q.weight             | UNEXPECTED |  | 
project_q.bias               | UNEXPECTED |  | 
quantizer.weight_proj.weight | UNEXPECTED |  | 
project_hid.weight           | UNEXPECTED |  | 
quantizer.weight_proj.bias   | UNEXPECTED |  | 
project_hid.bias             | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


model.safetensors:   0%|          | 0.00/380M [00:00<?, ?B/s]

✅ Audio classifier loaded!


In [20]:
def predict_full_pipeline(audio_path):
    print(f"🎙️ Audio: {audio_path}\n")

    # ============================================
    # STEP 1: Audio Model Prediction
    # ============================================
    import torchaudio
    waveform, sr = torchaudio.load(audio_path)
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(sr, 16000)
        waveform  = resampler(waveform)

    waveform = waveform.squeeze()
    if len(waveform) > 16000 * 10:
        waveform = waveform[:16000 * 10]

    inputs = processor(
        waveform.numpy(),
        sampling_rate=16000,
        return_tensors="pt",
        padding=True
    ).to(DEVICE)

    with torch.no_grad():
        outputs   = wav2vec_model(**inputs)
        embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

    tensor      = torch.tensor(embedding, dtype=torch.float32).unsqueeze(0).to(DEVICE)
    audio_out   = audio_classifier(tensor)
    audio_probs = torch.softmax(audio_out, dim=1).detach().cpu().numpy()[0]
    # ============================================
    # STEP 2: Text Model Prediction
    # ============================================
    # Since this is TTS audio, text = original text
    # For real audio you would transcribe first
    # Here we get text from the dataset by audio filename
    audio_id  = os.path.basename(audio_path).replace('.mp3', '')
    match     = df[df['id'].astype(str) == str(audio_id)]

    if len(match) == 0:
        print("❌ Could not find matching text for this audio")
        return

    text   = match.iloc[0]['text']
    tokens = match.iloc[0]['tokens']
    print(f"📝 Text: {text}\n")

    encoding = tokenizer(
        tokens,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        max_length=128,
        padding="max_length"
    )
    word_ids     = encoding.word_ids(batch_index=0)
    model_inputs = {k: v.to(DEVICE) for k, v in encoding.items()}

    with torch.no_grad():
        text_out   = text_model(**model_inputs)
        logits     = text_out.logits[0]
        probs      = torch.softmax(logits, dim=-1)
        toxic_prob = probs[:, 1].max().item()
        text_probs = [1 - toxic_prob, toxic_prob]

    # ============================================
    # STEP 3: Late Fusion
    # ============================================
    combined = 0.6 * np.array(text_probs) + 0.4 * np.array(audio_probs)
    final_pred = combined.argmax()
    final_label = 'TOXIC' if final_pred == 1 else 'NON TOXIC'
    confidence  = combined[final_pred] * 100

    # ============================================
    # STEP 4: Toxic Span Detection
    # ============================================
    toxic_spans = []
    current_span = None
    prev_word_id = None
    predictions  = torch.argmax(text_out.logits, dim=2)[0].cpu().numpy()
    pred_labels  = [text_model.config.id2label[p] for p in predictions]

    for label, word_id in zip(pred_labels, word_ids):
        if word_id is None or word_id == prev_word_id:
            continue
        word = tokens[word_id]
        if label == 'B-Toxic':
            if current_span:
                toxic_spans.append(current_span)
            current_span = [word]
        elif label == 'I-Toxic' and current_span:
            current_span.append(word)
        else:
            if current_span:
                toxic_spans.append(current_span)
                current_span = None
        prev_word_id = word_id

    if current_span:
        toxic_spans.append(current_span)

    # ============================================
    # FINAL OUTPUT
    # ============================================
    print(f"{'='*45}")
    print(f"🔍 Prediction  : {final_label}")
    print(f"📊 Confidence  : {confidence:.1f}%")
    print(f"🎵 Audio prob  : toxic={audio_probs[1]*100:.1f}% | non_toxic={audio_probs[0]*100:.1f}%")
    print(f"📝 Text prob   : toxic={text_probs[1]*100:.1f}%  | non_toxic={text_probs[0]*100:.1f}%")
    print(f"{'='*45}")

    if toxic_spans:
        print(f"⚠️  Toxic Spans Found:")
        for span in toxic_spans:
            print(f"   → '{' '.join(span)}'")
    else:
        print(f"✅ No toxic spans detected")

# ============================================
# TEST IT
# ============================================
import torchaudio

# Test with a known toxic sample
toxic_sample = df[df['label'] == 'toxic'].iloc[0]
test_path    = toxic_sample['audio_path']

predict_full_pipeline(test_path)

🎙️ Audio: /content/drive/MyDrive/urdu_toxic_audio _og/8914.mp3

📝 Text: مفتی صاحب نے گھوں کو چھیڑ دیا

🔍 Prediction  : TOXIC
📊 Confidence  : 57.5%
🎵 Audio prob  : toxic=0.2% | non_toxic=99.8%
📝 Text prob   : toxic=95.6%  | non_toxic=4.4%
⚠️  Toxic Spans Found:
   → 'گھوں کو چھیڑ دیا'


In [21]:
def predict_full_pipeline(audio_path):
    print(f"🎙️ Audio: {audio_path}\n")

    # ============================================
    # STEP 1: Audio Model Prediction
    # ============================================
    import torchaudio
    waveform, sr = torchaudio.load(audio_path)
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(sr, 16000)
        waveform  = resampler(waveform)

    waveform = waveform.squeeze()
    if len(waveform) > 16000 * 10:
        waveform = waveform[:16000 * 10]

    inputs = processor(
        waveform.numpy(),
        sampling_rate=16000,
        return_tensors="pt",
        padding=True
    ).to(DEVICE)

    with torch.no_grad():
        outputs   = wav2vec_model(**inputs)
        embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

    tensor      = torch.tensor(embedding, dtype=torch.float32).unsqueeze(0).to(DEVICE)
    audio_out   = audio_classifier(tensor)
    audio_probs = torch.softmax(audio_out, dim=1).detach().cpu().numpy()[0]
    # ============================================
    # STEP 2: Text Model Prediction
    # ============================================
    # Since this is TTS audio, text = original text
    # For real audio you would transcribe first
    # Here we get text from the dataset by audio filename
    audio_id  = os.path.basename(audio_path).replace('.mp3', '')
    match     = df[df['id'].astype(str) == str(audio_id)]

    if len(match) == 0:
        print("❌ Could not find matching text for this audio")
        return

    text   = match.iloc[0]['text']
    tokens = match.iloc[0]['tokens']
    print(f"📝 Text: {text}\n")

    encoding = tokenizer(
        tokens,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        max_length=128,
        padding="max_length"
    )
    word_ids     = encoding.word_ids(batch_index=0)
    model_inputs = {k: v.to(DEVICE) for k, v in encoding.items()}

    with torch.no_grad():
        text_out   = text_model(**model_inputs)
        logits     = text_out.logits[0]
        probs      = torch.softmax(logits, dim=-1)
        toxic_prob = probs[:, 1].max().item()
        text_probs = [1 - toxic_prob, toxic_prob]

    # ============================================
    # STEP 3: Late Fusion
    # ============================================
    combined = 0.6 * np.array(text_probs) + 0.4 * np.array(audio_probs)
    final_pred = combined.argmax()
    final_label = 'TOXIC' if final_pred == 1 else 'NON TOXIC'
    confidence  = combined[final_pred] * 100

    # ============================================
    # STEP 4: Toxic Span Detection
    # ============================================
    toxic_spans = []
    current_span = None
    prev_word_id = None
    predictions  = torch.argmax(text_out.logits, dim=2)[0].cpu().numpy()
    pred_labels  = [text_model.config.id2label[p] for p in predictions]

    for label, word_id in zip(pred_labels, word_ids):
        if word_id is None or word_id == prev_word_id:
            continue
        word = tokens[word_id]
        if label == 'B-Toxic':
            if current_span:
                toxic_spans.append(current_span)
            current_span = [word]
        elif label == 'I-Toxic' and current_span:
            current_span.append(word)
        else:
            if current_span:
                toxic_spans.append(current_span)
                current_span = None
        prev_word_id = word_id

    if current_span:
        toxic_spans.append(current_span)

    # ============================================
    # FINAL OUTPUT
    # ============================================
    print(f"{'='*45}")
    print(f"🔍 Prediction  : {final_label}")
    print(f"📊 Confidence  : {confidence:.1f}%")
    print(f"🎵 Audio prob  : toxic={audio_probs[1]*100:.1f}% | non_toxic={audio_probs[0]*100:.1f}%")
    print(f"📝 Text prob   : toxic={text_probs[1]*100:.1f}%  | non_toxic={text_probs[0]*100:.1f}%")
    print(f"{'='*45}")

    if toxic_spans:
        print(f"⚠️  Toxic Spans Found:")
        for span in toxic_spans:
            print(f"   → '{' '.join(span)}'")
    else:
        print(f"✅ No toxic spans detected")

# ============================================
# TEST IT
# ============================================
import torchaudio

# Test with a known toxic sample
toxic_sample = df[df['label'] == 'toxic'].iloc[0]
test_path    = toxic_sample['audio_path']

predict_full_pipeline("/content/drive/MyDrive/11693.mp3")

🎙️ Audio: /content/drive/MyDrive/11693.mp3

📝 Text: تم لوگوں نے دنیا  چوتیا سمجھا ہوا ہے حق حبیث انسان ()

🔍 Prediction  : TOXIC
📊 Confidence  : 86.2%
🎵 Audio prob  : toxic=71.3% | non_toxic=28.7%
📝 Text prob   : toxic=96.1%  | non_toxic=3.9%
⚠️  Toxic Spans Found:
   → 'چوتیا'


In [22]:
def predict_full_pipeline(audio_path):
    print(f"🎙️ Audio: {audio_path}\n")

    # ============================================
    # STEP 1: Audio Model Prediction
    # ============================================
    import torchaudio
    waveform, sr = torchaudio.load(audio_path)
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(sr, 16000)
        waveform  = resampler(waveform)

    waveform = waveform.squeeze()
    if len(waveform) > 16000 * 10:
        waveform = waveform[:16000 * 10]

    inputs = processor(
        waveform.numpy(),
        sampling_rate=16000,
        return_tensors="pt",
        padding=True
    ).to(DEVICE)

    with torch.no_grad():
        outputs   = wav2vec_model(**inputs)
        embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

    tensor      = torch.tensor(embedding, dtype=torch.float32).unsqueeze(0).to(DEVICE)
    audio_out   = audio_classifier(tensor)
    audio_probs = torch.softmax(audio_out, dim=1).detach().cpu().numpy()[0]
    # ============================================
    # STEP 2: Text Model Prediction
    # ============================================
    # Since this is TTS audio, text = original text
    # For real audio you would transcribe first
    # Here we get text from the dataset by audio filename
    audio_id  = os.path.basename(audio_path).replace('.mp3', '')
    match     = df[df['id'].astype(str) == str(audio_id)]

    if len(match) == 0:
        print("❌ Could not find matching text for this audio")
        return

    text   = match.iloc[0]['text']
    tokens = match.iloc[0]['tokens']
    print(f"📝 Text: {text}\n")

    encoding = tokenizer(
        tokens,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        max_length=128,
        padding="max_length"
    )
    word_ids     = encoding.word_ids(batch_index=0)
    model_inputs = {k: v.to(DEVICE) for k, v in encoding.items()}

    with torch.no_grad():
        text_out   = text_model(**model_inputs)
        logits     = text_out.logits[0]
        probs      = torch.softmax(logits, dim=-1)
        toxic_prob = probs[:, 1].max().item()
        text_probs = [1 - toxic_prob, toxic_prob]

    # ============================================
    # STEP 3: Late Fusion
    # ============================================
    combined = 0.6 * np.array(text_probs) + 0.4 * np.array(audio_probs)
    final_pred = combined.argmax()
    final_label = 'TOXIC' if final_pred == 1 else 'NON TOXIC'
    confidence  = combined[final_pred] * 100

    # ============================================
    # STEP 4: Toxic Span Detection
    # ============================================
    toxic_spans = []
    current_span = None
    prev_word_id = None
    predictions  = torch.argmax(text_out.logits, dim=2)[0].cpu().numpy()
    pred_labels  = [text_model.config.id2label[p] for p in predictions]

    for label, word_id in zip(pred_labels, word_ids):
        if word_id is None or word_id == prev_word_id:
            continue
        word = tokens[word_id]
        if label == 'B-Toxic':
            if current_span:
                toxic_spans.append(current_span)
            current_span = [word]
        elif label == 'I-Toxic' and current_span:
            current_span.append(word)
        else:
            if current_span:
                toxic_spans.append(current_span)
                current_span = None
        prev_word_id = word_id

    if current_span:
        toxic_spans.append(current_span)

    # ============================================
    # FINAL OUTPUT
    # ============================================
    print(f"{'='*45}")
    print(f"🔍 Prediction  : {final_label}")
    print(f"📊 Confidence  : {confidence:.1f}%")
    print(f"🎵 Audio prob  : toxic={audio_probs[1]*100:.1f}% | non_toxic={audio_probs[0]*100:.1f}%")
    print(f"📝 Text prob   : toxic={text_probs[1]*100:.1f}%  | non_toxic={text_probs[0]*100:.1f}%")
    print(f"{'='*45}")

    if toxic_spans:
        print(f"⚠️  Toxic Spans Found:")
        for span in toxic_spans:
            print(f"   → '{' '.join(span)}'")
    else:
        print(f"✅ No toxic spans detected")

# ============================================
# TEST IT
# ============================================
import torchaudio

# Test with a known toxic sample
toxic_sample = df[df['label'] == 'toxic'].iloc[0]
test_path    = toxic_sample['audio_path']

predict_full_pipeline("/content/drive/MyDrive/14021.mp3")

🎙️ Audio: /content/drive/MyDrive/14021.mp3

📝 Text: اللہ تعالیٰ غوث پاک کو رتبہ عطا کی۔۔ اس کے لئے کیا مذہبی حوالہ پے۔

🔍 Prediction  : NON TOXIC
📊 Confidence  : 75.0%
🎵 Audio prob  : toxic=61.5% | non_toxic=38.5%
📝 Text prob   : toxic=0.7%  | non_toxic=99.3%
✅ No toxic spans detected


In [23]:
!pip install pydub openai-whisper -q
!apt-get install ffmpeg -q

import whisper
from pydub import AudioSegment
import torch
import torchaudio
import numpy as np
import os

# ============================================
# SETUP
# ============================================
whatsapp_files = [
    "/content/drive/MyDrive/WhatsApp Ptt 2026-02-21 at 5.25.51 PM.ogg",
    "/content/drive/MyDrive/WhatsApp Ptt 2026-02-21 at 5.28.26 PM.ogg",
    "/content/drive/MyDrive/WhatsApp Ptt 2026-02-21 at 5.29.54 PM.ogg"
]

# Load Whisper for transcription
print("Loading Whisper...")
whisper_model = whisper.load_model("small")
print("✅ Whisper loaded!")

# ============================================
# REAL WORLD PIPELINE FUNCTION
# ============================================
def predict_realworld_audio(ogg_path):
    print(f"\n{'='*50}")
    print(f"🎙️  File: {os.path.basename(ogg_path)}")
    print(f"{'='*50}")

    # STEP 1: Convert OGG to MP3
    mp3_path = ogg_path.replace('.ogg', '_converted.mp3')
    audio    = AudioSegment.from_ogg(ogg_path)
    audio.export(mp3_path, format="mp3")
    print(f"✅ Converted to MP3")

    # STEP 2: Whisper Transcription
    result       = whisper_model.transcribe(mp3_path, language="ur", task="transcribe")
    transcription = result["text"].strip()
    print(f"📝 Transcription: {transcription}")

    if not transcription:
        print("❌ Empty transcription, skipping...")
        return

    tokens = transcription.split()

    # STEP 3: Audio Features (Wav2Vec2)
    waveform, sr = torchaudio.load(mp3_path)
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(sr, 16000)
        waveform  = resampler(waveform)

    waveform = waveform.squeeze()
    if len(waveform) > 16000 * 10:
        waveform = waveform[:16000 * 10]

    inputs = processor(
        waveform.numpy(),
        sampling_rate=16000,
        return_tensors="pt",
        padding=True
    ).to(DEVICE)

    with torch.no_grad():
        outputs   = wav2vec_model(**inputs)
        embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

    tensor      = torch.tensor(embedding, dtype=torch.float32).unsqueeze(0).to(DEVICE)
    with torch.no_grad():
        audio_out   = audio_classifier(tensor)
    audio_probs = torch.softmax(audio_out, dim=1).detach().cpu().numpy()[0]

    # STEP 4: Text Model (XLM-RoBERTa)
    encoding = tokenizer(
        tokens,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        max_length=128,
        padding="max_length"
    )
    word_ids     = encoding.word_ids(batch_index=0)
    model_inputs = {k: v.to(DEVICE) for k, v in encoding.items()}

    with torch.no_grad():
        text_out   = text_model(**model_inputs)
        logits     = text_out.logits[0]
        probs      = torch.softmax(logits, dim=-1)
        toxic_prob = probs[:, 1].max().item()
        text_probs = [1 - toxic_prob, toxic_prob]

    # STEP 5: Late Fusion
    combined    = 0.6 * np.array(text_probs) + 0.4 * np.array(audio_probs)
    final_pred  = combined.argmax()
    final_label = 'TOXIC' if final_pred == 1 else 'NON TOXIC'
    confidence  = combined[final_pred] * 100

    # STEP 6: Toxic Span Detection
    toxic_spans  = []
    current_span = None
    prev_word_id = None
    predictions  = torch.argmax(text_out.logits, dim=2)[0].cpu().numpy()
    pred_labels  = [text_model.config.id2label[p] for p in predictions]

    for label, word_id in zip(pred_labels, word_ids):
        if word_id is None or word_id == prev_word_id:
            continue
        if word_id >= len(tokens):
            continue
        word = tokens[word_id]
        if label == 'B-Toxic':
            if current_span:
                toxic_spans.append(current_span)
            current_span = [word]
        elif label == 'I-Toxic' and current_span:
            current_span.append(word)
        else:
            if current_span:
                toxic_spans.append(current_span)
                current_span = None
        prev_word_id = word_id

    if current_span:
        toxic_spans.append(current_span)

    # FINAL OUTPUT
    print(f"\n🔍 Prediction  : {final_label}")
    print(f"📊 Confidence  : {confidence:.1f}%")
    print(f"🎵 Audio prob  : toxic={audio_probs[1]*100:.1f}% | non_toxic={audio_probs[0]*100:.1f}%")
    print(f"📝 Text prob   : toxic={text_probs[1]*100:.1f}%  | non_toxic={text_probs[0]*100:.1f}%")

    if toxic_spans:
        print(f"\n⚠️  Toxic Spans:")
        for span in toxic_spans:
            print(f"   → '{' '.join(span)}'")
    else:
        print(f"\n✅ No toxic spans detected")

    # Cleanup converted file
    os.remove(mp3_path)

# ============================================
# RUN ON ALL 3 FILES
# ============================================
for f in whatsapp_files:
    predict_realworld_audio(f)

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/803.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.4/803.2 kB[0m [31m3.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m798.7/803.2 kB[0m [31m12.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.3/188.3 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
Reading package lists...
Building dependency tree...
Reading state information...
ffmpeg is already the newest version

  m = re.match('([su]([0-9]{1,2})p?) \(([0-9]{1,2}) bit\)$', token)
  m2 = re.match('([su]([0-9]{1,2})p?)( \(default\))?$', token)
  elif re.match('(flt)p?( \(default\))?$', token):
  elif re.match('(dbl)p?( \(default\))?$', token):


Loading Whisper...


100%|███████████████████████████████████████| 461M/461M [00:05<00:00, 83.2MiB/s]


✅ Whisper loaded!

🎙️  File: WhatsApp Ptt 2026-02-21 at 5.25.51 PM.ogg
✅ Converted to MP3




📝 Transcription: کتی کے بچے بہت حرامی ہو تم

🔍 Prediction  : TOXIC
📊 Confidence  : 67.4%
🎵 Audio prob  : toxic=19.3% | non_toxic=80.7%
📝 Text prob   : toxic=99.5%  | non_toxic=0.5%

⚠️  Toxic Spans:
   → 'کتی کے بچے'
   → 'حرامی'

🎙️  File: WhatsApp Ptt 2026-02-21 at 5.28.26 PM.ogg
✅ Converted to MP3




📝 Transcription: علی ایک بہت اچھا بچا ہے وہ اپنا کام سیکرتا ہے

🔍 Prediction  : NON TOXIC
📊 Confidence  : 98.2%
🎵 Audio prob  : toxic=2.8% | non_toxic=97.2%
📝 Text prob   : toxic=1.2%  | non_toxic=98.8%

✅ No toxic spans detected

🎙️  File: WhatsApp Ptt 2026-02-21 at 5.29.54 PM.ogg
✅ Converted to MP3




📝 Transcription: یہ ایک انتہائی گھڈیا اور میج کسم کا جاہل انسان اور گدہ ہے

🔍 Prediction  : TOXIC
📊 Confidence  : 61.5%
🎵 Audio prob  : toxic=5.4% | non_toxic=94.6%
📝 Text prob   : toxic=98.8%  | non_toxic=1.2%

⚠️  Toxic Spans:
   → 'گھڈیا'
   → 'میج کسم کا'
   → 'جاہل انسان'
   → 'گدہ'
