## Environment Setup

In [None]:
import os
import pandas as pd
import librosa # extract audio features
import numpy as np
from transformers import AutoTokenizer # to tokenize text
import tensorflow as tf
from sklearn.metrics import f1_score # evaluation
from sklearn.model_selection import train_test_split
import random

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Mount Google Drive at /content/drive; the dataset path used below lives under that mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Base directory of the dataset on the mounted drive; every audio/text path below is built from it.
root_dir = '/content/drive/MyDrive/IndiaAI/data/DravidianLangTech2025_MultimodalHateSpeech'

# Data Processing

In [None]:
# Initialize the text tokenizer (pretrained "ai4bharat/IndicBART" vocabulary);
# it is shared by all three language sections below.
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/IndicBART")

# Function to extract MFCCs
def extract_mfcc(audio_file, language, dir, sr=44100, n_mfcc=40, max_len=128):
    """Load one audio clip and return a fixed-length MFCC matrix.

    The clip is read from
    ``{root_dir}/{language}/{language}_{dir}/audio/{audio_file}.wav``.

    Args:
        audio_file: File name without the ``.wav`` extension.
        language: Language folder name (e.g. ``'tamil'``).
        dir: Split suffix of the folder (e.g. ``'train'`` or ``'test'``).
        sr: Sampling rate passed to ``librosa.load``.
        n_mfcc: Number of MFCC coefficients to compute.
        max_len: Target size of the second MFCC axis after padding/cropping.

    Returns:
        The MFCC matrix, zero-padded or cropped along its second axis to
        ``max_len`` and then transposed.
    """
    wav_path = f"{root_dir}/{language}/{language}_{dir}/audio/{audio_file}.wav"
    signal, sample_rate = librosa.load(wav_path, sr=sr)
    features = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc)

    n_frames = features.shape[1]
    if n_frames >= max_len:
        # Long enough: keep only the first max_len columns.
        return features[:, :max_len].T

    # Too short: append zero columns up to max_len.
    missing = max_len - n_frames
    padded = np.pad(features, pad_width=((0, 0), (0, missing)), mode='constant')
    return padded.T

# Model Components and Definition

In [None]:
# Positional Encoder (Tensorflow Transformer Tutorial)
def positional_encoding(length, depth):
  """Build a sinusoidal positional-encoding table.

  Args:
    length: Number of positions (rows of the table).
    depth: Encoding width; the first half of the last axis holds sines and
      the second half cosines.

  Returns:
    A float32 tensor with one row per position.
  """
  half_depth = depth / 2
  pos = np.arange(length).reshape(-1, 1)
  scaled_index = np.arange(half_depth).reshape(1, -1) / half_depth
  rates = 1 / (10000 ** scaled_index)
  angles = pos * rates
  table = np.concatenate([np.sin(angles), np.cos(angles)], axis=-1)
  return tf.cast(table, dtype=tf.float32)


class PositionalEncoder(tf.keras.layers.Layer):
  """Returns the positional-encoding table cropped to the input's sequence length.

  The output is the encoding alone, with a leading axis of size 1; the input
  values are only used to read the sequence length, and callers add the
  result to their own features.
  """

  def __init__(self,d_model):
    super(PositionalEncoder, self).__init__()
    self.d_model = d_model
    # Table pre-computed once for 2048 positions.
    self.pos_encoding = positional_encoding(length=2048, depth=d_model)

  def call(self, x):
    seq_len = tf.shape(x)[1]
    return self.pos_encoding[tf.newaxis, :seq_len, :]


In [None]:
# Speech and Text Sampling layers  - Inputs to Transformer modules
class SpeechSampling(tf.keras.layers.Layer):
    """Speech front-end: an LSTM over the input sequence plus positional encoding."""

    def __init__(self, d_model):
        super().__init__()
        self.lstm = tf.keras.layers.LSTM(d_model, return_sequences=True)
        self.positional_encoder = PositionalEncoder(d_model=d_model)

    def call(self, x):
        # Both sub-layers receive the raw input; their outputs are summed.
        sequence_features = self.lstm(x)
        position_features = self.positional_encoder(x)
        return sequence_features + position_features

class TextSampling(tf.keras.layers.Layer):
    """Text front-end: token embedding scaled by sqrt(d_model) plus positional encoding."""

    def __init__(self, d_model, vocab_size):
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model)
        self.positional_encoder = PositionalEncoder(d_model=d_model)

    def call(self, x):
        embedded = self.embedding(x)
        scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        embedded = embedded * scale
        return embedded + self.positional_encoder(embedded)


In [None]:
# Attention layers (Self Attn, Cross Attn and Causal Attn) - Tensorflow NMT Transformer Tutorial
class BaseAttention(tf.keras.layers.Layer):
  """Shared building blocks for the attention layers below.

  Holds a multi-head attention layer (configured from ``**kwargs``), a layer
  normalization and an ``Add`` layer; subclasses implement ``call``.
  """

  def __init__(self, **kwargs):
    super(BaseAttention, self).__init__()
    # All keyword arguments go to MultiHeadAttention, none to the base Layer.
    self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
    self.layernorm = tf.keras.layers.LayerNormalization()
    self.add = tf.keras.layers.Add()


class CrossAttention(BaseAttention):
  """Attention from ``x`` (queries) over ``context`` (keys/values), with residual add and layer norm."""

  def call(self, x, context):
    attended, scores = self.mha(query=x, key=context, value=context, return_attention_scores=True)
    # Keep the most recent attention scores available for inspection.
    self.last_attn_scores = scores
    return self.layernorm(self.add([x, attended]))

class GlobalSelfAttention(BaseAttention):
  """Self-attention over ``x`` followed by residual add and layer norm."""

  def call(self, x):
    attended = self.mha(query=x, value=x, key=x)
    return self.layernorm(self.add([x, attended]))

class CausalSelfAttention(BaseAttention):
  """Self-attention over ``x`` followed by residual add and layer norm.

  NOTE(review): despite the class name, no causal mask is passed to the
  attention call, so this layer computes the same thing as
  GlobalSelfAttention. Confirm whether unmasked attention is intended.
  """

  def call(self, x):
    attended = self.mha(query=x, value=x, key=x)
    return self.layernorm(self.add([x, attended]))


In [None]:
# Feedforward Layer
class Sequential_Sampling(tf.keras.layers.Layer):
  """Position-wise feed-forward stack: Dense(dff, relu) -> Dense(d_model) -> Dropout."""

  def __init__(self, d_model, dff, dropout_rate, name="Sequential_Sampling", **kwargs):
    super().__init__(name=name, **kwargs)
    self.d_model = d_model
    self.dff = dff
    self.dropout_rate = dropout_rate

    self.dense1 = tf.keras.layers.Dense(dff, activation='relu')
    self.dense2 = tf.keras.layers.Dense(d_model)
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x):
    hidden = self.dense1(x)
    projected = self.dense2(hidden)
    return self.dropout(projected)


class FeedForward(tf.keras.layers.Layer):
  """Feed-forward sub-layer with a residual connection and layer normalization."""

  def __init__(self, d_model, dff, dropout_rate=0.1):
    super(FeedForward, self).__init__()
    self.d_model = d_model
    self.dff = dff
    self.dropout_rate = dropout_rate

    self.seq = Sequential_Sampling(d_model, dff, dropout_rate)
    self.add = tf.keras.layers.Add()
    self.layer_norm = tf.keras.layers.LayerNormalization()

  def call(self, x):
    # Residual: input plus the feed-forward transform, then normalize.
    residual = self.add([x, self.seq(x)])
    return self.layer_norm(residual)


In [None]:
# The Encoder
class EncoderLayer(tf.keras.layers.Layer):
  """One encoder block: global self-attention followed by a feed-forward sub-layer.

  Args:
    d_model: Model width (also used as the attention ``key_dim``).
    num_heads: Number of attention heads.
    dff: Hidden width of the feed-forward sub-layer.
    dropout_rate: Dropout rate used by both the attention and the
      feed-forward sub-layer.
  """

  def __init__(self,*, d_model, num_heads, dff, dropout_rate=0.1):
    super().__init__()

    self.self_attention = GlobalSelfAttention(num_heads=num_heads, key_dim=d_model, dropout=dropout_rate)
    # Forward dropout_rate explicitly; previously the feed-forward block
    # silently fell back to its own default regardless of this argument.
    self.ffn = FeedForward(d_model, dff, dropout_rate)

  def call(self, x):
    x = self.self_attention(x)
    x = self.ffn(x)
    return x


class Encoder(tf.keras.layers.Layer):
  """Stack of ``num_layers`` EncoderLayer blocks preceded by dropout on the input."""

  def __init__(self, *, num_layers, d_model, num_heads, dff, dropout_rate=0.1):
    super(Encoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    layer_config = dict(d_model=d_model, num_heads=num_heads, dff=dff, dropout_rate=dropout_rate)
    self.enc_layers = [EncoderLayer(**layer_config) for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x):
    x = self.dropout(x)
    for layer in self.enc_layers:
      x = layer(x)
    return x

In [None]:
# The Decoder
class DecoderLayer(tf.keras.layers.Layer):
  """One decoder block: self-attention, cross-attention over ``context``, then feed-forward.

  Args:
    d_model: Model width (also used as the attention ``key_dim``).
    num_heads: Number of attention heads.
    dff: Hidden width of the feed-forward sub-layer.
    dropout_rate: Dropout rate used by the attention layers and the
      feed-forward sub-layer.
  """

  def __init__(self, *, d_model, num_heads, dff, dropout_rate=0.1):
    super(DecoderLayer, self).__init__()

    self.causal_self_attention = CausalSelfAttention(num_heads=num_heads, key_dim=d_model, dropout=dropout_rate)
    self.cross_attention = CrossAttention(num_heads=num_heads, key_dim=d_model, dropout=dropout_rate)
    # Forward dropout_rate explicitly; previously the feed-forward block
    # silently fell back to its own default regardless of this argument.
    self.ffn = FeedForward(d_model, dff, dropout_rate)

  def call(self, x, context):
    x = self.causal_self_attention(x=x)
    x = self.cross_attention(x=x, context=context)

    # Expose the cross-attention scores of the latest call.
    self.last_attn_scores = self.cross_attention.last_attn_scores

    x = self.ffn(x)
    return x


class Decoder(tf.keras.layers.Layer):
  """Stack of ``num_layers`` DecoderLayer blocks preceded by dropout on the input.

  After each call, ``last_attn_scores`` holds the cross-attention scores of
  the final decoder layer.
  """

  def __init__(self, *, num_layers, d_model, num_heads, dff, dropout_rate=0.1):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.dropout = tf.keras.layers.Dropout(dropout_rate)
    layer_config = dict(d_model=d_model, num_heads=num_heads, dff=dff, dropout_rate=dropout_rate)
    self.dec_layers = [DecoderLayer(**layer_config) for _ in range(num_layers)]

    self.last_attn_scores = None

  def call(self, x, context):
    x = self.dropout(x)
    for layer in self.dec_layers:
      x = layer(x, context)

    self.last_attn_scores = self.dec_layers[-1].last_attn_scores
    return x


In [None]:
# Model Definition
class BaseClassifier_1(tf.keras.models.Model):
    """Speech-as-context branch: speech goes through the encoder, text through the decoder.

    ``input_vocab_size`` and any extra ``**kwargs`` are accepted but not used.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, dropout_rate, **kwargs):
        super().__init__()

        self.speech_pre = SpeechSampling(d_model=d_model)
        self.text_pre = TextSampling(d_model=d_model, vocab_size=target_vocab_size)

        stack_config = dict(num_layers=num_layers, d_model=d_model, num_heads=num_heads, dff=dff, dropout_rate=dropout_rate)
        self.encoder = Encoder(**stack_config)
        self.decoder = Decoder(**stack_config)

    def call(self, inputs):
        speech, text = inputs

        # Pre-process both modalities first, speech before text.
        speech_seq = self.speech_pre(speech)
        text_seq = self.text_pre(text)

        # Text attends to the encoded speech.
        speech_context = self.encoder(speech_seq)
        return self.decoder(text_seq, speech_context)


class BaseClassifier_2(tf.keras.models.Model):
    """Text-as-context branch: text goes through the encoder, speech through the decoder.

    ``input_vocab_size`` and any extra ``**kwargs`` are accepted but not used.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, dropout_rate, **kwargs):
        super().__init__()

        self.speech_pre = SpeechSampling(d_model=d_model)
        self.text_pre = TextSampling(d_model=d_model, vocab_size=target_vocab_size)

        stack_config = dict(num_layers=num_layers, d_model=d_model, num_heads=num_heads, dff=dff, dropout_rate=dropout_rate)
        self.encoder = Encoder(**stack_config)
        self.decoder = Decoder(**stack_config)

    def call(self, inputs):
        speech, text = inputs

        # Pre-process both modalities first, text before speech.
        text_seq = self.text_pre(text)
        speech_seq = self.speech_pre(speech)

        # Speech attends to the encoded text.
        text_context = self.encoder(text_seq)
        return self.decoder(speech_seq, text_context)


class Classifier(tf.keras.models.Model):
    """Two-branch multimodal classifier.

    Each branch (BaseClassifier_1 and BaseClassifier_2) is followed by its own
    LSTM and global average pooling; the pooled vectors are concatenated and
    passed to a softmax layer with ``num_classes`` outputs. Extra ``**kwargs``
    are accepted but not used.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, num_classes, dropout_rate, **kwargs):
        super().__init__()

        branch_config = dict(
            num_layers=num_layers,
            d_model=d_model,
            num_heads=num_heads,
            dff=dff,
            input_vocab_size=input_vocab_size,
            target_vocab_size=target_vocab_size,
            dropout_rate=dropout_rate,
        )
        self.base_classifier_1 = BaseClassifier_1(**branch_config)
        self.base_classifier_2 = BaseClassifier_2(**branch_config)

        self.lstm_1 = tf.keras.layers.LSTM(units=d_model, return_sequences=True, dropout=dropout_rate)
        self.lstm_2 = tf.keras.layers.LSTM(units=d_model, return_sequences=True, dropout=dropout_rate)

        self.global_pool_1 = tf.keras.layers.GlobalAveragePooling1D()
        self.global_pool_2 = tf.keras.layers.GlobalAveragePooling1D()

        self.concat = tf.keras.layers.Concatenate(axis=-1)
        self.final_layer = tf.keras.layers.Dense(num_classes, activation='softmax')

    def call(self, inputs):
        speech, text = inputs

        # Run both branches on the same (speech, text) pair.
        branch_1 = self.base_classifier_1([speech, text])
        branch_2 = self.base_classifier_2([speech, text])

        # Per-branch recurrent layer.
        seq_1 = self.lstm_1(branch_1)
        seq_2 = self.lstm_2(branch_2)

        # Collapse each sequence to a single vector.
        vec_1 = self.global_pool_1(seq_1)
        vec_2 = self.global_pool_2(seq_2)

        # Fuse the two branches and emit class probabilities.
        fused = self.concat([vec_1, vec_2])
        return self.final_layer(fused)



# Multimodal Hate Speech Detection in Tamil

In [None]:
# Load the Tamil training annotations and preview the first rows.
data = pd.read_excel(f'{root_dir}/tamil/tamil_train/text/TA-AT-train.xlsx')
data.head()

Unnamed: 0,Class Label Short,File Name,Transcript
0,C,H_TA_003_C_M_016_005,உருவத்தை வச்ச ஒருத்தன் கிண்டல் பண்றான் பாருங்க...
1,C,H_TA_003_C_M_016_004,காமெடி பண்ண சொன்னா ஒருத்தன உருவ கேலி பண்ணிட்டு...
2,C,H_TA_003_C_M_016_003,இந்த உருவத்தை வைத்து கிண்டல் கேலி பண்ணி சிரிக்...
3,C,H_TA_003_C_M_015_002,புரிஞ்சுக்கணும் மேடையில் ஒரு நாகரிகம்னு ஒன்னு ...
4,C,H_TA_003_C_M_015_001,என்னா மல மல அண்ணாமலை இது உலகத்தோட ஸ்டைலு உட்கா...


In [None]:
# Audio features: one MFCC matrix per clip, looked up by file name.
data['mfccs'] = data['File Name'].apply(
    lambda name: extract_mfcc(audio_file=name, language='tamil', dir='train')
)

# Text features: tokenize each transcript, padded/truncated to 128 tokens.
data['tokenized_text'] = data['Transcript'].apply(
    lambda text: tokenizer(
        f"{text}",
        return_tensors="np",
        padding='max_length',
        truncation=True,
        max_length=128,
    )
)

In [None]:
# Class short code -> integer label id.
class_mapping = {'G': 0, 'P': 1, 'R': 2, 'C': 3, 'N': 4}

# Keep only the first (single) row of token ids from each tokenizer output.
data['input_ids'] = data['tokenized_text'].apply(lambda encoded: encoded['input_ids'][0])
data['labels'] = data['Class Label Short'].map(class_mapping)

In [None]:
# Hold out 10% of the labelled data for validation (stratified 90/10 split).
# The test set is loaded from its own file in a later cell.
train, val = train_test_split(
    data,
    test_size=0.1,
    stratify=data['labels'],
    random_state=42,  # fixed seed so the split is reproducible across runs
)

print("Original Train shape: ", train.shape)
print("Original Val shape: ", val.shape)

Original Train shape:  (458, 7)
Original Val shape:  (51, 7)


In [None]:
# Model inputs per split: [stacked MFCC matrices, stacked token-id rows].
train_inputs, val_inputs = (
    [np.array([*split['mfccs']]), np.array([*split['input_ids']])]
    for split in (train, val)
)
# Integer class labels per split.
train_labels, val_labels = (np.array([*split['labels']]) for split in (train, val))


In [None]:
# Initialize the model (fresh weights for the Tamil experiment)
model = Classifier(
    num_layers=2,
    d_model=128,
    num_heads=4,
    dff=256,
    input_vocab_size=64000,
    target_vocab_size=64000,
    num_classes=5,
    dropout_rate=0.1
)

# Compile the model
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss='sparse_categorical_crossentropy',  # sparse loss: labels are integer class ids, not one-hot vectors
    metrics=['accuracy']
)

In [None]:
# Stop training once val_loss has not improved for 3 consecutive epochs.
# NOTE(review): restore_best_weights is not set — confirm whether evaluating
# with the final-epoch weights (rather than the best val_loss epoch) is intended.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                              patience=3)

# Train the model
history = model.fit(
    train_inputs,
    train_labels,
    validation_data=(val_inputs, val_labels),
    epochs=20,
    callbacks=[callback],
    batch_size=32
)

Epoch 1/20
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 310ms/step - accuracy: 0.5015 - loss: 1.4323 - val_accuracy: 0.5686 - val_loss: 1.2582
Epoch 2/20
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 153ms/step - accuracy: 0.5670 - loss: 1.2814 - val_accuracy: 0.5686 - val_loss: 1.2786
Epoch 3/20
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 159ms/step - accuracy: 0.5597 - loss: 1.2852 - val_accuracy: 0.5686 - val_loss: 1.2559
Epoch 4/20
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 164ms/step - accuracy: 0.5626 - loss: 1.2573 - val_accuracy: 0.5686 - val_loss: 1.2250
Epoch 5/20
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 160ms/step - accuracy: 0.5597 - loss: 1.2516 - val_accuracy: 0.5686 - val_loss: 1.2163
Epoch 6/20
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 154ms/step - accuracy: 0.5688 - loss: 1.2150 - val_accuracy: 0.5686 - val_loss: 1.2144
Epoch 7/20
[1m15/15[0m [

In [None]:
# Make predictions on the validation split
pred_probs = model.predict(val_inputs)
pred_labels = np.argmax(pred_probs, axis=-1)  # most probable class per sample

# Macro-averaged F1 on validation (f1_score is imported in the setup cell at the top)
macro_f1 = f1_score(val_labels, pred_labels, average='macro')
print(f"Macro-Averaged F1-Score: {macro_f1}")


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3s/step
Macro-Averaged F1-Score: 0.28152173913043477


In [None]:
# Load the Tamil test annotations and build the same features as for training.
test = pd.read_excel(f'{root_dir}/tamil/tamil_test/text/TA-AT-test.xlsx')

test['mfccs'] = test['File Name'].apply(
    lambda name: extract_mfcc(audio_file=name, language='tamil', dir='test')
)
test['tokenized_text'] = test['Transcript'].apply(
    lambda text: tokenizer(
        text,
        return_tensors="np",
        padding='max_length',
        truncation=True,
        max_length=128,
    )
)
# The label column name is spelled with a trailing space in this lookup.
test['labels'] = test['Class Label '].map(class_mapping)
test['input_ids'] = test['tokenized_text'].apply(lambda encoded: encoded['input_ids'][0])

# Model inputs: [stacked MFCC matrices, stacked token-id rows], plus integer labels.
test_inputs = [np.array([*test[column]]) for column in ('mfccs', 'input_ids')]
test_labels = np.array([*test['labels']])

In [None]:
# Evaluate the model on test set: returns the compiled loss followed by the accuracy metric.
test_loss, test_accuracy = model.evaluate(test_inputs, test_labels)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1s/step - accuracy: 0.5770 - loss: 1.1866
Test Loss: 1.2130273580551147, Test Accuracy: 0.5686274766921997


In [None]:
# Make predictions on the test set
pred_probs = model.predict(test_inputs)
pred_labels = np.argmax(pred_probs, axis=-1)  # most probable class per sample

# Macro-averaged F1 on test (f1_score is imported in the setup cell at the top)
macro_f1 = f1_score(test_labels, pred_labels, average='macro')
print(f"Macro-Averaged F1-Score: {macro_f1}")


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
Macro-Averaged F1-Score: 0.12715789473684208


# Multimodal Hate Speech Detection in Malayalam

In [None]:
# Load the Malayalam training annotations (rebinds `data`) and preview the first rows.
data = pd.read_excel(f'{root_dir}/malayalam/malayalam_train/text/ML-AT-train.xlsx')
data.head()

Unnamed: 0,Class Label Short,File Name,Transcript
0,C,H_ML_001_C_F_044_001,നമസ്കാരം ഒരു ഒരു പരമ ചെറ്റയുടെ കാര്യമാണ് ഞാൻ പ...
1,C,H_ML_001_C_F_044_002,ആദ്യം തന്നെ അവൻറെ ഐഡിയുടെ പേര് വരെ ഞാൻ ഇതിനകത്...
2,C,H_ML_001_C_F_044_003,അവൻറെ ആ ചെറ്റയുടെ ആ പരമനാറിയുടെ പേര്
3,C,H_ML_001_C_F_044_004,അവന്റെ ദുഷിച്ച മനസ്സ് കൊണ്ടുവന്ന് എൻറെ വീഡിയോയ...
4,C,H_ML_001_C_F_044_005,നിൻറെ ദുഷിപ്പ് എല്ലാം എന്തിനാ എന്റെ നേർക്ക് തീ...


In [None]:
# Audio features: one MFCC matrix per clip, looked up by file name.
data['mfccs'] = data['File Name'].apply(
    lambda name: extract_mfcc(audio_file=name, language='malayalam', dir='train')
)

# Text features: tokenize each transcript, padded/truncated to 128 tokens.
data['tokenized_text'] = data['Transcript'].apply(
    lambda text: tokenizer(
        f"{text}",
        return_tensors="np",
        padding='max_length',
        truncation=True,
        max_length=128,
    )
)

In [None]:
# Keep only the first (single) row of token ids from each tokenizer output.
data['input_ids'] = data['tokenized_text'].apply(lambda x: x['input_ids'][0])

In [None]:
# Perform class mapping: class short code -> integer label id
# (same mapping as in the Tamil section, redefined here).
class_mapping = {'G': 0, 'P': 1, 'R': 2, 'C': 3, 'N': 4}
data['labels'] = data['Class Label Short'].map(class_mapping)

In [None]:
# Hold out 10% of the labelled data for validation (stratified 90/10 split).
# The test set is loaded from its own file in a later cell.
train, val = train_test_split(
    data,
    test_size=0.1,
    stratify=data['labels'],
    random_state=42,  # fixed seed so the split is reproducible across runs
)

print("Original Train shape: ", train.shape)
print("Original Val shape: ", val.shape)

Original Train shape:  (794, 7)
Original Val shape:  (89, 7)


In [None]:
# Model inputs per split: [stacked MFCC matrices, stacked token-id rows].
train_inputs, val_inputs = (
    [np.array([*split['mfccs']]), np.array([*split['input_ids']])]
    for split in (train, val)
)
# Integer class labels per split.
train_labels, val_labels = (np.array([*split['labels']]) for split in (train, val))

In [None]:
# Initialize the model (fresh weights for the Malayalam experiment)
model = Classifier(
    num_layers=2,
    d_model=128,
    num_heads=4,
    dff=256,
    input_vocab_size=64000,
    target_vocab_size=64000,
    num_classes=5,
    dropout_rate=0.1
)

# Compile the model
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss='sparse_categorical_crossentropy',  # sparse loss: labels are integer class ids, not one-hot vectors
    metrics=['accuracy']
)

In [None]:
# Stop training once val_loss has not improved for 3 consecutive epochs.
# NOTE(review): restore_best_weights is not set — confirm whether evaluating
# with the final-epoch weights (rather than the best val_loss epoch) is intended.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                              patience=3)

In [None]:
# Train the model for up to 20 epochs, validating on the held-out split after each epoch.
history = model.fit(
    train_inputs,
    train_labels,
    validation_data=(val_inputs, val_labels),
    epochs=20,
    callbacks=[callback],
    batch_size=32
)

Epoch 1/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 260ms/step - accuracy: 0.4534 - loss: 1.4274 - val_accuracy: 0.5056 - val_loss: 1.2377
Epoch 2/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 129ms/step - accuracy: 0.6047 - loss: 1.1161 - val_accuracy: 0.6854 - val_loss: 0.9536
Epoch 3/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 129ms/step - accuracy: 0.6725 - loss: 0.9296 - val_accuracy: 0.6966 - val_loss: 0.8463
Epoch 4/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 129ms/step - accuracy: 0.7350 - loss: 0.7813 - val_accuracy: 0.7079 - val_loss: 0.8066
Epoch 5/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 130ms/step - accuracy: 0.7387 - loss: 0.7215 - val_accuracy: 0.7079 - val_loss: 0.8271
Epoch 6/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 129ms/step - accuracy: 0.7633 - loss: 0.6508 - val_accuracy: 0.7528 - val_loss: 0.7393
Epoch 7/20
[1m25/25[0m [

In [None]:
# Evaluate the model on the validation split: returns the compiled loss followed by the accuracy metric.
val_loss, val_accuracy = model.evaluate(val_inputs, val_labels)
print(f"Val Loss: {val_loss}, Val Accuracy: {val_accuracy}")

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step - accuracy: 0.7397 - loss: 0.6414
Val Loss: 0.6260916590690613, Val Accuracy: 0.7528089880943298


In [None]:
# Make predictions on the validation split
pred_probs = model.predict(val_inputs)
pred_labels = np.argmax(pred_probs, axis=-1)  # most probable class per sample

# Macro-averaged F1 on validation (f1_score is imported in the setup cell at the top)
macro_f1 = f1_score(val_labels, pred_labels, average='macro')
print(f"Macro-Averaged F1-Score: {macro_f1}")

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2s/step
Macro-Averaged F1-Score: 0.6757426569021183


In [None]:
# Load the Malayalam test annotations and build the same features as for training.
test = pd.read_excel(f'{root_dir}/malayalam/malayalam_test/text/ML-AT-test.xlsx')

test['mfccs'] = test['File Name'].apply(
    lambda name: extract_mfcc(audio_file=name, language='malayalam', dir='test')
)

test['tokenized_text'] = test['Transcript'].apply(
    lambda text: tokenizer(
        f"{text}",
        return_tensors="np",
        padding='max_length',
        truncation=True,
        max_length=128,
    )
)

# Keep only the first (single) row of token ids from each tokenizer output.
test['input_ids'] = test['tokenized_text'].apply(lambda encoded: encoded['input_ids'][0])

In [None]:
# Map the test class codes to integer label ids, then list the columns as a sanity check.
test['labels'] = test['Class Label'].map(class_mapping)
test.columns

Index(['File Name', 'Transcript', 'Class Label', 'mfccs', 'tokenized_text',
       'input_ids', 'labels'],
      dtype='object')

In [None]:
# Model inputs: [stacked MFCC matrices, stacked token-id rows], plus integer labels.
test_data_inputs = [np.array([*test[column]]) for column in ('mfccs', 'input_ids')]
test_labels = np.array([*test['labels']])

In [None]:
# Predict class probabilities for the test set and take the most probable class per sample.
pred_probs = model.predict(test_data_inputs)
pred_labels = np.argmax(pred_probs, axis=-1)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 512ms/step


In [None]:
# Make predictions on the test set
pred_probs = model.predict(test_data_inputs)
pred_labels = np.argmax(pred_probs, axis=-1)  # most probable class per sample

# Macro-averaged F1 on test (f1_score is imported in the setup cell at the top)
macro_f1 = f1_score(test_labels, pred_labels, average='macro')
print(f"Macro-Averaged F1-Score: {macro_f1}")

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
Macro-Averaged F1-Score: 0.4321491228070175


# Multimodal Hate Speech Detection in Telugu


In [None]:
# Load the Telugu training annotations (rebinds `data`) and preview the first rows.
data = pd.read_excel(f'{root_dir}/telugu/telugu_train/text/TE-AT-train.xlsx')
data.head()

Unnamed: 0,Class Label Short,File_Name,Transcript
0,R,H_TE_001_R_F_015_001,ఈ కాలంలో మీరు ఒకసారి ఒబ్సర్వ్ చేయండి మన స్టేటు...
1,R,H_TE_001_R_F_015_002,హిందూ థర్మాన్ని ఎవరేమన్నా కూడా వాడికొచ్చే ఒకేఒ...
2,R,H_TE_001_R_F_015_003,ఒక ముస్లింనిగాని. ఒక్క నిమిషం భార్గవి కల్యాణిగ...
3,R,H_TE_001_R_F_015_004,ఈ సెక్యులర్ ఇండియా డెమోక్రాటిక్ ఇండియా అని మాట...
4,R,H_TE_001_R_F_015_005,"హిందువులున్న ఈ భారతదేశంలో ,ఈ భారతదేశంలో , సనాత..."


In [None]:
# The Telugu sheet names the column 'File_Name'; align it with the other languages.
data = data.rename(columns={'File_Name': 'File Name'})

# Audio features: one MFCC matrix per clip, looked up by file name.
data['mfccs'] = data['File Name'].apply(
    lambda name: extract_mfcc(audio_file=name, language='telugu', dir='train')
)

# Text features: tokenize each transcript, padded/truncated to 128 tokens.
data['tokenized_text'] = data['Transcript'].apply(
    lambda text: tokenizer(
        f"{text}",
        return_tensors="np",
        padding='max_length',
        truncation=True,
        max_length=128,
    )
)

In [None]:
# Map class codes to integer label ids. `class_mapping` is not defined in this
# section; it comes from the earlier Tamil/Malayalam cells, which must have run first.
data['labels'] = data['Class Label Short'].map(class_mapping)

In [None]:
# Hold out 10% of the labelled data for validation (stratified 90/10 split).
# The test set is loaded from its own file in a later cell.
train, val = train_test_split(
    data,
    test_size=0.1,
    stratify=data['labels'],
    random_state=42,  # fixed seed so the split is reproducible across runs
)
print("Original Train shape: ", train.shape)
print("Original Val shape: ", val.shape)

Original Train shape:  (495, 6)
Original Val shape:  (56, 6)


In [None]:
# Add the token-id column to each split. `assign` builds a new frame instead of
# writing into the objects returned by train_test_split, which avoids pandas'
# SettingWithCopyWarning (hidden here by the global warnings filter) and keeps
# the cell idempotent.
train = train.assign(input_ids=train['tokenized_text'].apply(lambda x: x['input_ids'][0]))
val = val.assign(input_ids=val['tokenized_text'].apply(lambda x: x['input_ids'][0]))

In [None]:
# Model inputs per split: [stacked MFCC matrices, stacked token-id rows].
train_inputs, val_inputs = (
    [np.array([*split['mfccs']]), np.array([*split['input_ids']])]
    for split in (train, val)
)

# Integer class labels per split.
train_labels, val_labels = (np.array([*split['labels']]) for split in (train, val))


In [None]:
# Initialize the model (fresh weights for the Telugu experiment)
model = Classifier(
    num_layers=2,
    d_model=128,
    num_heads=4,
    dff=256,
    input_vocab_size=64000,
    target_vocab_size=64000,
    num_classes=5,
    dropout_rate=0.1
)

# Compile the model
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss='sparse_categorical_crossentropy',  # sparse loss: labels are integer class ids, not one-hot vectors
    metrics=['accuracy']
)

In [None]:
# Stop training once val_loss has not improved for 3 consecutive epochs.
# NOTE(review): restore_best_weights is not set — confirm whether evaluating
# with the final-epoch weights (rather than the best val_loss epoch) is intended.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                              patience=3)
# Train the model
history = model.fit(
    train_inputs,
    train_labels,
    validation_data=(val_inputs, val_labels),
    epochs=20,
    callbacks=[callback],
    batch_size=32
)

Epoch 1/20
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 606ms/step - accuracy: 0.3564 - loss: 1.5516 - val_accuracy: 0.4643 - val_loss: 1.4478
Epoch 2/20
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 301ms/step - accuracy: 0.4666 - loss: 1.3395 - val_accuracy: 0.4464 - val_loss: 1.3927
Epoch 3/20
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 302ms/step - accuracy: 0.4982 - loss: 1.2867 - val_accuracy: 0.4464 - val_loss: 1.3973
Epoch 4/20
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 285ms/step - accuracy: 0.5313 - loss: 1.2216 - val_accuracy: 0.4821 - val_loss: 1.4230
Epoch 5/20
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 300ms/step - accuracy: 0.5308 - loss: 1.1781 - val_accuracy: 0.5357 - val_loss: 1.3000
Epoch 6/20
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 304ms/step - accuracy: 0.5951 - loss: 1.0808 - val_accuracy: 0.5000 - val_loss: 1.3021
Epoch 7/20
[1m16/16[0m 

In [None]:
# Evaluate on the validation split: returns the compiled loss followed by the accuracy metric.
val_loss, val_accuracy = model.evaluate(val_inputs, val_labels)
print(f"Val Loss: {val_loss}, Val Accuracy: {val_accuracy}")

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 172ms/step - accuracy: 0.6354 - loss: 1.1317
Val Loss: 1.1943482160568237, Val Accuracy: 0.625


In [None]:
# Make predictions on the validation split
pred_probs = model.predict(val_inputs)
pred_labels = np.argmax(pred_probs, axis=-1)  # most probable class per sample

# Macro-averaged F1 on validation (f1_score is imported in the setup cell at the top)
macro_f1 = f1_score(val_labels, pred_labels, average='macro')
print(f"Macro-Averaged F1-Score: {macro_f1}")

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 130ms/step
Macro-Averaged F1-Score: 0.5146301067353699


In [None]:
# Load the Telugu test annotations; `test` is bound directly to the DataFrame.
test = pd.read_excel(f'{root_dir}/telugu/telugu_test/text/TE-AT-test.xlsx')

# The Telugu sheet names the column 'File_Name'; align it with the other languages.
test = test.rename(columns={'File_Name': 'File Name'})
test['mfccs'] = test['File Name'].apply(
    lambda name: extract_mfcc(audio_file=name, language='telugu', dir='test')
)

test['tokenized_text'] = test['Transcript'].apply(
    lambda text: tokenizer(
        text,
        return_tensors="np",
        padding='max_length',
        truncation=True,
        max_length=128,
    )
)
# The label column name is spelled with a trailing space in this lookup.
test['labels'] = test['Class Label '].map(class_mapping)
test['input_ids'] = test['tokenized_text'].apply(lambda encoded: encoded['input_ids'][0])

In [None]:
# Preview the prepared test frame, including the derived feature and label columns.
test.head()

Unnamed: 0,File Name,Transcript,Class Label,mfccs,tokenized_text,labels,input_ids
0,TE_TE_001,ఎస్సీలుగా పుట్టాలని ఎవరు కోరుకుంటారు,R,"[[-538.6989, 0.28734857, 0.24082115, 0.1740721...","[input_ids, token_type_ids, attention_mask]",2,"[2, 41, 61952, 59863, 59550, 59863, 60571, 597..."
1,TE_TE_002,ఎవరు మాత్రం SC కులంలో పుట్టాలని కోరుకుంటారు అం...,R,"[[-548.9849, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...","[input_ids, token_type_ids, attention_mask]",2,"[2, 41, 61952, 60708, 59830, 59758, 41, 60465,..."
2,TE_TE_003,హిందువల కోసం కట్టిన పుష్కరఘాట్లో పబ్లిక్ గ మతమ...,R,"[[-513.0835, 63.628677, 49.77363, 35.49523, 25...","[input_ids, token_type_ids, attention_mask]",2,"[2, 41, 61206, 59831, 60175, 60477, 59758, 607..."
3,TE_TE_004,కొన్ని వందల వేల సంవత్సరాల నుంచి క్రైస్తవులు ప్...,R,"[[-380.41415, 120.18034, 22.325361, 19.686886,...","[input_ids, token_type_ids, attention_mask]",2,"[2, 41, 60059, 60716, 60085, 59550, 60085, 598..."
4,TE_TE_005,ఇందుమూలముగా సో మంచి సంపూర్ణమైన క్యాస్టు సిస్టమ...,R,"[[-459.5935, 90.28745, 38.36905, 10.045397, 9....","[input_ids, token_type_ids, attention_mask]",2,"[2, 41, 62834, 60175, 60477, 59758, 60465, 606..."


In [None]:
# Model inputs: [stacked MFCC matrices, stacked token-id rows], plus integer labels.
test_data_inputs = [np.array([*test[column]]) for column in ('mfccs', 'input_ids')]
test_labels = np.array([*test['labels']])

In [None]:
# Evaluate the model on the test set: returns the compiled loss followed by the accuracy metric.
test_loss, test_accuracy = model.evaluate(test_data_inputs, test_labels)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 122ms/step - accuracy: 0.1779 - loss: 2.9323
Test Loss: 2.706005573272705, Test Accuracy: 0.2199999988079071


In [None]:
# Make predictions on the test set
pred_probs = model.predict(test_data_inputs)
pred_labels = np.argmax(pred_probs, axis=-1)  # most probable class per sample

# Macro-averaged F1 on test (f1_score is imported in the setup cell at the top)
macro_f1 = f1_score(test_labels, pred_labels, average='macro')
print(f"Macro-Averaged F1-Score: {macro_f1}")

[1m1/2[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m6s[0m 7s/step



[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 5s/step
Macro-Averaged F1-Score: 0.1828167256084876
