In [1]:
!pip install transformers torchaudio librosa datasets -q

In [2]:
import os
import ast
import torch
import torchaudio
import librosa
import numpy as np
import pandas as pd
from transformers import Wav2Vec2Processor, Wav2Vec2Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Startup banner: confirm the imports ran and show which GPU (if any) is visible.
print(
    "✅ All imports done",
    f"✅ GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else '❌ No GPU!'}",
    sep="\n",
)

✅ All imports done
✅ GPU: Tesla T4


In [3]:
# Colab-only step: expose Google Drive under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Read the dataset CSV from Drive into `df`.
dataset_csv = "/content/drive/MyDrive/urdu_toxic_audio_dataset.csv"
df = pd.read_csv(dataset_csv)

# Quick overview: shape, column names and class balance of the `label` column.
print(f"✅ Loaded: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(f"\nLabel distribution:\n{df['label'].value_counts()}")

Mounted at /content/drive
✅ Loaded: (14337, 13)
Columns: ['id', 'text', 'label', 'sub_label', 'toxic_spans', 'tokens', 'toxic_list', 'BIO_tags', 'audio_path', 'transcription', 'new_tokens', 'new_BIO_tags', 'toxic_preserved']

Label distribution:
label
toxic        7751
non_toxic    6586
Name: count, dtype: int64


In [4]:
# General-purpose base checkpoint (not Urdu-specific); it is only used here to
# produce embeddings, never fine-tuned.
MODEL_NAME = "facebook/wav2vec2-base"

# Prefer the GPU, but fall back to the CPU instead of crashing on a runtime
# without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print("Loading Wav2Vec2...")
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
wav2vec_model = Wav2Vec2Model.from_pretrained(MODEL_NAME)
wav2vec_model = wav2vec_model.to(device)
# eval() disables dropout so the extracted embeddings are deterministic.
wav2vec_model.eval()

print("✅ Wav2Vec2 loaded!")

Loading Wav2Vec2...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/211 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/380M [00:00<?, ?B/s]

Wav2Vec2Model LOAD REPORT from: facebook/wav2vec2-base
Key                          | Status     |  | 
-----------------------------+------------+--+-
quantizer.weight_proj.bias   | UNEXPECTED |  | 
project_hid.bias             | UNEXPECTED |  | 
project_q.bias               | UNEXPECTED |  | 
project_q.weight             | UNEXPECTED |  | 
quantizer.codevectors        | UNEXPECTED |  | 
quantizer.weight_proj.weight | UNEXPECTED |  | 
project_hid.weight           | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


✅ Wav2Vec2 loaded!


In [5]:
def extract_audio_features(audio_path, processor, model, max_length=16000*10):
    """
    Extract a fixed-size Wav2Vec2 embedding from one audio file.

    The clip is loaded with torchaudio, down-mixed to mono, resampled to
    16 kHz, truncated to at most ``max_length`` samples (shorter clips are
    passed through at their own length; nothing is padded up to
    ``max_length``), run through ``model`` and mean-pooled over time.

    Args:
        audio_path: Path of the audio file to load.
        processor: Callable feature processor (e.g. ``Wav2Vec2Processor``);
            its result must support ``.to(device)`` and ``**`` unpacking.
        model: The encoder to run. Inputs are moved to the device of this
            model's parameters, so the function works on CPU or GPU.
        max_length: Maximum number of 16 kHz samples kept
            (default: 10 seconds).

    Returns:
        A 1-D NumPy array of length ``hidden_size`` (768 for the checkpoint
        used in this notebook), or ``None`` if loading or encoding failed.
        Failures are reported with ``print`` rather than raised, so a batch
        extraction loop can skip bad files.
    """
    target_sr = 16000
    try:
        # Load audio
        waveform, sr = torchaudio.load(audio_path)

        # Convert to mono if stereo
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # Resample to 16kHz
        if sr != target_sr:
            resampler = torchaudio.transforms.Resample(sr, target_sr)
            waveform = resampler(waveform)

        # Flatten to 1-D and truncate. reshape(-1) is used instead of
        # squeeze() so a one-sample clip stays 1-D rather than becoming 0-d.
        waveform = waveform.reshape(-1)[:max_length]
        if waveform.numel() == 0:
            raise ValueError("audio contains no samples")

        # Use the model that was passed in (not a notebook global) and send
        # the inputs to wherever its weights live.
        device = next(model.parameters()).device
        inputs = processor(
            waveform.numpy(),
            sampling_rate=target_sr,
            return_tensors="pt",
            padding=True
        ).to(device)

        with torch.no_grad():
            outputs = model(**inputs)

        # Mean pool across time → fixed size vector; drop only the batch axis.
        embedding = outputs.last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy()
        return embedding

    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller skip the file.
        print(f"Error: {audio_path} → {e}")
        return None

print("✅ Feature extraction function defined")

✅ Feature extraction function defined


In [6]:
total = len(df)
print(f"Extracting features for {total} rows...\n")

embeddings, valid_indices = [], []

# Walk the audio_path column directly; `idx` is the DataFrame index label of the row.
for idx, audio_path in df['audio_path'].items():
    emb = extract_audio_features(audio_path, processor, wav2vec_model)

    # Clips that could not be processed come back as None and are skipped,
    # so `valid_indices` records which rows actually have an embedding.
    if emb is not None:
        embeddings.append(emb)
        valid_indices.append(idx)

    # Progress line every 500 index values (printed whether or not the row succeeded).
    if idx % 500 == 0:
        print(f"✅ {idx}/{total} done")

print(f"\n✅ Done! Extracted: {len(embeddings)} | Failed: {total - len(embeddings)}")

# Stack the per-clip vectors into one 2-D array: (number of extracted clips, embedding size)
embeddings = np.array(embeddings)
print(f"Embeddings shape: {embeddings.shape}")

# Persist the embeddings together with the row indices they belong to
for out_path, array in (
    ("/content/drive/MyDrive/audio_embeddings.npy", embeddings),
    ("/content/drive/MyDrive/audio_valid_indices.npy", np.array(valid_indices)),
):
    np.save(out_path, array)
print("✅ Embeddings saved to Drive!")

Extracting features for 14337 rows...

✅ 0/14337 done
✅ 500/14337 done
✅ 1000/14337 done
Error: /content/drive/MyDrive/urdu_toxic_audio _og/11369.mp3 → Failed to create AudioDecoder for /content/drive/MyDrive/urdu_toxic_audio _og/11369.mp3: Could not open input file: /content/drive/MyDrive/urdu_toxic_audio _og/11369.mp3 Invalid argument
Error: /content/drive/MyDrive/urdu_toxic_audio _og/11370.mp3 → Failed to create AudioDecoder for /content/drive/MyDrive/urdu_toxic_audio _og/11370.mp3: Could not open input file: /content/drive/MyDrive/urdu_toxic_audio _og/11370.mp3 Invalid argument
Error: /content/drive/MyDrive/urdu_toxic_audio _og/11371.mp3 → Failed to create AudioDecoder for /content/drive/MyDrive/urdu_toxic_audio _og/11371.mp3: Could not open input file: /content/drive/MyDrive/urdu_toxic_audio _og/11371.mp3 Invalid argument
Error: /content/drive/MyDrive/urdu_toxic_audio _og/11373.mp3 → Failed to create AudioDecoder for /content/drive/MyDrive/urdu_toxic_audio _og/11373.mp3: Could not

In [7]:
# Keep only the rows whose audio produced an embedding. `valid_indices` holds
# index *labels* collected while iterating over `df`, so select by label (.loc);
# positional .iloc is only equivalent while df has a default 0..n-1 index.
df_valid = df.loc[valid_indices].reset_index(drop=True)

# Rows and embeddings must stay aligned one-to-one, otherwise every label after
# a mismatch would silently be paired with the wrong clip.
assert len(df_valid) == len(embeddings), (
    f"{len(df_valid)} rows vs {len(embeddings)} embeddings"
)

# Encode labels as integers. LabelEncoder sorts the class names, so with the
# two labels in this dataset: non_toxic=0, toxic=1.
le = LabelEncoder()
labels = le.fit_transform(df_valid['label'])

print(f"Classes: {le.classes_}")
print(f"Label distribution: {np.bincount(labels)}")

# Stratified 80/20 train/test split with a fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    embeddings, labels,
    test_size=0.2,
    random_state=42,
    stratify=labels
)

print(f"\nTrain: {X_train.shape} | Test: {X_test.shape}")

Classes: ['non_toxic' 'toxic']
Label distribution: [6450 7751]

Train: (11360, 768) | Test: (2841, 768)


In [8]:
class AudioToxicClassifier(nn.Module):
    """MLP head that maps one embedding vector to class logits.

    Layout: input_dim -> hidden_dim -> 64 -> num_classes, with ReLU and
    dropout (p=0.3, then p=0.2) after each of the first two linear layers.
    """

    def __init__(self, input_dim=768, hidden_dim=256, num_classes=2):
        super().__init__()
        bottleneck_dim = 64
        stack = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, bottleneck_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(bottleneck_dim, num_classes),
        ]
        # Attribute name and layer order determine the state_dict keys
        # ("classifier.<position>.*"), so both are kept stable.
        self.classifier = nn.Sequential(*stack)

    def forward(self, x):
        """Return the raw (un-normalised) logits for `x`."""
        logits = self.classifier(x)
        return logits


class EmbeddingDataset(Dataset):
    """In-memory dataset pairing precomputed embeddings with integer labels."""

    def __init__(self, embeddings, labels):
        # Convert both inputs to tensors once up front: float32 features, int64 targets.
        self.X, self.y = (
            torch.tensor(embeddings, dtype=torch.float32),
            torch.tensor(labels, dtype=torch.long),
        )

    def __len__(self):
        # One sample per label.
        return self.y.shape[0]

    def __getitem__(self, idx):
        features, target = self.X[idx], self.y[idx]
        return features, target


# Create datasets
train_dataset = EmbeddingDataset(X_train, y_train)
test_dataset  = EmbeddingDataset(X_test, y_test)

# Only the training loader shuffles; the test loader keeps a fixed order.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader  = DataLoader(test_dataset,  batch_size=32, shuffle=False)

# Init model on the GPU when one is available, otherwise on the CPU
# (a literal 'cuda' would raise on a runtime without CUDA).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
audio_classifier = AudioToxicClassifier().to(device)
optimizer = torch.optim.Adam(audio_classifier.parameters(), lr=1e-4)
# CrossEntropyLoss expects raw logits, which is what the classifier returns.
criterion = nn.CrossEntropyLoss()

print("✅ Model ready!")
print(audio_classifier)

✅ Model ready!
AudioToxicClassifier(
  (classifier): Sequential(
    (0): Linear(in_features=768, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=256, out_features=64, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=64, out_features=2, bias=True)
  )
)


In [9]:
EPOCHS = 15

# Send batches to whichever device the classifier's weights are on, instead of
# a literal 'cuda' that has to be kept in sync with the cell that built the model.
device = next(audio_classifier.parameters()).device

for epoch in range(EPOCHS):
    # Training
    audio_classifier.train()
    train_loss = 0
    correct    = 0
    total_     = 0

    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs = audio_classifier(X_batch)
        loss    = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        # loss.item() is the mean loss of this batch; the sum is divided by the
        # number of batches when the epoch summary is printed.
        train_loss += loss.item()
        correct    += (outputs.argmax(1) == y_batch).sum().item()
        total_     += len(y_batch)

    train_acc = correct / total_ * 100

    # Evaluation
    # NOTE(review): "Val Acc" below is measured on test_loader; the cells shown
    # create no separate validation split, so this number should not be used to
    # choose the epoch count or hyperparameters.
    audio_classifier.eval()
    val_correct = 0
    val_total   = 0

    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = audio_classifier(X_batch)
            val_correct += (outputs.argmax(1) == y_batch).sum().item()
            val_total   += len(y_batch)

    val_acc = val_correct / val_total * 100
    print(f"Epoch {epoch+1}/{EPOCHS} | Loss: {train_loss/len(train_loader):.4f} | Train Acc: {train_acc:.1f}% | Val Acc: {val_acc:.1f}%")

Epoch 1/15 | Loss: 0.6683 | Train Acc: 59.8% | Val Acc: 63.6%
Epoch 2/15 | Loss: 0.6075 | Train Acc: 65.0% | Val Acc: 65.5%
Epoch 3/15 | Loss: 0.5779 | Train Acc: 67.9% | Val Acc: 69.2%
Epoch 4/15 | Loss: 0.5649 | Train Acc: 69.2% | Val Acc: 70.4%
Epoch 5/15 | Loss: 0.5546 | Train Acc: 70.4% | Val Acc: 70.1%
Epoch 6/15 | Loss: 0.5510 | Train Acc: 70.3% | Val Acc: 71.3%
Epoch 7/15 | Loss: 0.5443 | Train Acc: 71.3% | Val Acc: 71.2%
Epoch 8/15 | Loss: 0.5425 | Train Acc: 71.7% | Val Acc: 67.9%
Epoch 9/15 | Loss: 0.5361 | Train Acc: 72.0% | Val Acc: 71.4%
Epoch 10/15 | Loss: 0.5329 | Train Acc: 72.2% | Val Acc: 72.7%
Epoch 11/15 | Loss: 0.5319 | Train Acc: 72.4% | Val Acc: 72.6%
Epoch 12/15 | Loss: 0.5291 | Train Acc: 72.4% | Val Acc: 72.6%
Epoch 13/15 | Loss: 0.5254 | Train Acc: 72.9% | Val Acc: 72.1%
Epoch 14/15 | Loss: 0.5264 | Train Acc: 72.7% | Val Acc: 73.2%
Epoch 15/15 | Loss: 0.5220 | Train Acc: 73.1% | Val Acc: 72.2%


In [10]:
from sklearn.metrics import classification_report, f1_score

# Collect predictions for the whole test split, then report per-class metrics.
audio_classifier.eval()
# Follow the classifier's own device rather than assuming 'cuda'.
device = next(audio_classifier.parameters()).device
all_preds = []
all_labels = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        outputs = audio_classifier(X_batch)
        preds   = outputs.argmax(1).cpu().numpy()
        all_preds.extend(preds)
        # Labels are never moved off the CPU, so they convert directly.
        all_labels.extend(y_batch.numpy())

print("📊 Audio Model Performance:")
print(classification_report(all_labels, all_preds, target_names=le.classes_))
# Weighted F1: per-class F1 averaged with class support as the weights.
print(f"F1 Score: {f1_score(all_labels, all_preds, average='weighted'):.4f}")

📊 Audio Model Performance:
              precision    recall  f1-score   support

   non_toxic       0.86      0.46      0.60      1290
       toxic       0.68      0.94      0.79      1551

    accuracy                           0.72      2841
   macro avg       0.77      0.70      0.69      2841
weighted avg       0.76      0.72      0.70      2841

F1 Score: 0.7018


In [11]:
# Persist only the learned weights (state_dict), not the module object itself.
checkpoint_path = "/content/drive/MyDrive/audio_toxic_classifier.pt"
torch.save(audio_classifier.state_dict(), checkpoint_path)
print("✅ Audio classifier saved!")

✅ Audio classifier saved!


In [13]:
# Sanity-check the classifier that is currently in memory on a few test samples.
# (Nothing is loaded from disk here; the in-memory `audio_classifier` is used.)
audio_classifier.eval()
# Follow the classifier's own device rather than assuming 'cuda'.
device = next(audio_classifier.parameters()).device

# Take up to 10 samples from the test set (fewer if the test set is smaller)
n_samples = min(10, len(X_test))
sample_embeddings = torch.tensor(X_test[:n_samples], dtype=torch.float32).to(device)
sample_labels     = y_test[:n_samples]

with torch.no_grad():
    outputs      = audio_classifier(sample_embeddings)
    probs        = torch.softmax(outputs, dim=1).cpu().numpy()
    predictions  = outputs.argmax(1).cpu().numpy()

# Show results
print(f"=== Sanity Check: {n_samples} Test Samples ===\n")
for i in range(n_samples):
    actual    = le.classes_[sample_labels[i]]
    predicted = le.classes_[predictions[i]]
    # Confidence = softmax probability of the predicted class.
    confidence = probs[i][predictions[i]] * 100
    # Named `mark` so it does not overwrite the integer `correct` counter
    # used by the training loop.
    mark      = "✅" if actual == predicted else "❌"

    print(f"{mark} Actual: {actual:12} | Predicted: {predicted:12} | Confidence: {confidence:.1f}%")


=== Sanity Check: 10 Test Samples ===

✅ Actual: toxic        | Predicted: toxic        | Confidence: 81.7%
✅ Actual: toxic        | Predicted: toxic        | Confidence: 68.6%
❌ Actual: toxic        | Predicted: non_toxic    | Confidence: 63.6%
✅ Actual: toxic        | Predicted: toxic        | Confidence: 73.7%
✅ Actual: toxic        | Predicted: toxic        | Confidence: 72.9%
✅ Actual: toxic        | Predicted: toxic        | Confidence: 74.8%
✅ Actual: toxic        | Predicted: toxic        | Confidence: 83.5%
❌ Actual: non_toxic    | Predicted: toxic        | Confidence: 64.3%
✅ Actual: toxic        | Predicted: toxic        | Confidence: 72.7%
✅ Actual: toxic        | Predicted: toxic        | Confidence: 70.9%


In [18]:
audio_file = "/content/drive/MyDrive/11693.mp3"

# Inline audio player so the clip can be listened to next to the prediction.
import IPython.display as ipd
ipd.display(ipd.Audio(audio_file))

# Extract Wav2Vec2 features (same as training)
emb = extract_audio_features(audio_file, processor, wav2vec_model)

if emb is not None:
    audio_classifier.eval()
    # Put the single-sample batch on the classifier's own device instead of
    # assuming 'cuda'.
    device = next(audio_classifier.parameters()).device
    tensor = torch.tensor(emb, dtype=torch.float32).unsqueeze(0).to(device)

    with torch.no_grad():
        output = audio_classifier(tensor)
        probs  = torch.softmax(output, dim=1).cpu().numpy()[0]
        pred   = output.argmax(1).cpu().numpy()[0]

    # Map the predicted class index back to its label name.
    label      = le.classes_[pred]
    confidence = probs[pred] * 100

    print(f"\n{'='*40}")
    print(f"🔍 Prediction : {label.upper()}")
    print(f"📊 Confidence : {confidence:.1f}%")
    print(f"{'='*40}")
    # Look the class positions up by name rather than assuming their order.
    print(f"  toxic     probability: {probs[list(le.classes_).index('toxic')]*100:.1f}%")
    print(f"  non_toxic probability: {probs[list(le.classes_).index('non_toxic')]*100:.1f}%")

else:
    # extract_audio_features returns None when the file could not be processed.
    print("❌ Feature extraction failed")


🔍 Prediction : TOXIC
📊 Confidence : 71.3%
  toxic     probability: 71.3%
  non_toxic probability: 28.7%


In [19]:
audio_file = "/content/drive/MyDrive/14021.mp3"

# Inline audio player so the clip can be listened to next to the prediction.
import IPython.display as ipd
ipd.display(ipd.Audio(audio_file))

# Extract Wav2Vec2 features (same as training)
emb = extract_audio_features(audio_file, processor, wav2vec_model)

if emb is not None:
    audio_classifier.eval()
    # Put the single-sample batch on the classifier's own device instead of
    # assuming 'cuda'.
    device = next(audio_classifier.parameters()).device
    tensor = torch.tensor(emb, dtype=torch.float32).unsqueeze(0).to(device)

    with torch.no_grad():
        output = audio_classifier(tensor)
        probs  = torch.softmax(output, dim=1).cpu().numpy()[0]
        pred   = output.argmax(1).cpu().numpy()[0]

    # Map the predicted class index back to its label name.
    label      = le.classes_[pred]
    confidence = probs[pred] * 100

    print(f"\n{'='*40}")
    print(f"🔍 Prediction : {label.upper()}")
    print(f"📊 Confidence : {confidence:.1f}%")
    print(f"{'='*40}")
    # Look the class positions up by name rather than assuming their order.
    print(f"  toxic     probability: {probs[list(le.classes_).index('toxic')]*100:.1f}%")
    print(f"  non_toxic probability: {probs[list(le.classes_).index('non_toxic')]*100:.1f}%")

else:
    # extract_audio_features returns None when the file could not be processed.
    print("❌ Feature extraction failed")


🔍 Prediction : TOXIC
📊 Confidence : 61.5%
  toxic     probability: 61.5%
  non_toxic probability: 38.5%
