# Fusion Based Hate Speech Detection

This project demonstrates a full NLP pipeline for hate speech detection on social media text


**Goal**: Classify tweets into hate speech or not, and evaluate model performance.


# Importing necessary packages

In [3]:
import re
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.svm import SVC
import tensorflow_hub as hub
from collections import Counter
import matplotlib.pyplot as plt
import gensim.downloader as api
from sklearn.metrics import confusion_matrix
from tensorflow.keras.regularizers import l2
from tensorflow.keras.activations import gelu
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from transformers import BertTokenizer, TFBertModel
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, Dropout




# Loaded English model from spacy

In [4]:
import spacy
nlp = spacy.load("en_core_web_lg")

# Loading Dataset

In [6]:
train=pd.read_csv('train_en.tsv',sep='\t',on_bad_lines='skip')
test=pd.read_csv('test_en.tsv',sep='\t',on_bad_lines='skip')

In [7]:
df=pd.concat([train,test])
df.dropna(inplace=True)
df.head()

Unnamed: 0,id,text,HS,TR,AG
0,201,"Hurray, saving us $$$ in so many ways @potus @...",1,0,0
1,202,Why would young fighting age men be the vast m...,1,0,0
2,203,@KamalaHarris Illegals Dump their Kids at the ...,1,0,0
3,204,NY Times: 'Nearly All White' States Pose 'an A...,0,0,0
4,205,Orban in Brussels: European leaders are ignori...,0,0,0


# Preprocessing

In [8]:
def preprocess(text):
    # Convert text to string to handle potential float values
    text = str(text)
    text = re.sub(r'http\S+', '', text)
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # Emoticons
        "\U0001F300-\U0001F5FF"  # Symbols & pictographs
        "\U0001F680-\U0001F6FF"  # Transport & map symbols
        "\U0001F700-\U0001F77F"  # Alchemical symbols
        "\U0001F780-\U0001F7FF"  # Geometric shapes
        "\U0001F800-\U0001F8FF"  # Supplemental arrows
        "\U0001F900-\U0001F9FF"  # Supplemental symbols & pictographs
        "\U0001FA00-\U0001FA6F"  # Chess pieces, symbols
        "\U0001FA70-\U0001FAFF"  # Symbols for activities
        "\U00002702-\U000027B0"  # Dingbats
        "\U000024C2-\U0001F251"  # Enclosed characters
        "]+",
        flags=re.UNICODE
    )
    text = emoji_pattern.sub(r"",text)
    text = re.sub(r'[^a-zA-Z\']', ' ', text)
    text = re.sub(r'[^\w\s\']',' ', text)
    text = re.sub(r"&[^\s;]+;",' ',text)
    text = re.sub(' +', ' ', text)
    return text.strip().lower()

In [9]:
df['text'] = df['text'].map(preprocess)
df.head()

Unnamed: 0,id,text,HS,TR,AG
0,201,hurray saving us in so many ways potus realdon...,1,0,0
1,202,why would young fighting age men be the vast m...,1,0,0
2,203,kamalaharris illegals dump their kids at the b...,1,0,0
3,204,ny times 'nearly all white' states pose 'an ar...,0,0,0
4,205,orban in brussels european leaders are ignorin...,0,0,0


In [10]:
nlp = spacy.load("en_core_web_lg")
all_tokens = []
for doc in nlp.pipe(df['text'], disable=["ner", "parser"]):
    all_tokens.extend([token.lemma_.lower() for token in doc if token.is_alpha])

word_freq = Counter(all_tokens)
rare_words = set([word for word, freq in word_freq.items() if freq < 3])

def preprocessed(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        lemma = token.lemma_.lower()
        if token.is_alpha and lemma not in rare_words:
            filtered_tokens.append(lemma)
    return ' '.join(filtered_tokens)

In [11]:
df['text'] = df['text'].map(preprocessed)
df.head()

Unnamed: 0,id,text,HS,TR,AG
0,201,save way potus realdonaldtrump lockthemup buil...,1,0,0
1,202,young fighting age man vast majority one escap...,1,0,0
2,203,kamalaharris illegal dump kid border like road...,1,0,0
3,204,ny nearly white state pose problem immigrant,0,0,0
4,205,orban brussels european leader ignore people w...,0,0,0


# Pretrained word2vec model

In [12]:
wv = api.load('word2vec-google-news-300')

# Logistic Regression

In [14]:
#Logistic Regression
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['text'])
sequences = tokenizer.texts_to_sequences(df['text'])

# Function to average word vectors for each tweet
def get_average_embedding(sequence, wv, dim=300):
    vectors = [wv[tokenizer.index_word[idx]] for idx in sequence if idx in tokenizer.index_word and tokenizer.index_word[idx] in wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(dim)

# Prepare features
X = np.array([get_average_embedding(seq, wv) for seq in sequences])
y = df['HS'].values

# Train-test split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression model
lr_model = LogisticRegression(
    penalty='l2',             
    C=0.1,                    
    solver='liblinear',       
    max_iter=2000,            
    class_weight='balanced', 
    random_state=42           
)
lr_model.fit(x_train, y_train)

In [15]:
# Predict and evaluate
y_pred_LR = lr_model.predict(x_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred_LR)}')
print(classification_report(y_test, y_pred_LR))

Accuracy: 0.6963151207115629
              precision    recall  f1-score   support

           0       0.74      0.72      0.73      1326
           1       0.65      0.67      0.66      1035

    accuracy                           0.70      2361
   macro avg       0.69      0.69      0.69      2361
weighted avg       0.70      0.70      0.70      2361



# LSTM

In [24]:
# LSTM Model with Stacked LSTM Layers
from tensorflow.keras.initializers import Constant

vocab_size = len(tokenizer.word_index) + 1
max_len = 100
x_padded = pad_sequences(sequences, maxlen=max_len, padding='post')
y = df['HS'].values
x_train, x_test, y_train, y_test = train_test_split(x_padded, y, test_size=0.2, random_state=42)

# Create embedding matrix
embedding_matrix = np.zeros((vocab_size, 300))
for word, index in tokenizer.word_index.items():
    if word in wv:
        embedding_matrix[index] = wv[word]

# Build the model
model = Sequential()
from tensorflow.keras.initializers import Constant

model.add(Embedding(input_dim=vocab_size,
                    output_dim=300,
                    embeddings_initializer=Constant(embedding_matrix),
                    trainable=False))
model.add(LSTM(128, return_sequences=True, dropout=0.3, recurrent_dropout=0.3))
model.add(LSTM(64, return_sequences=True, dropout=0.3, recurrent_dropout=0.3))
model.add(GlobalMaxPooling1D())
model.add(Dense(128, activation='gelu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
# Compile
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train
model.fit(x_train, y_train, epochs=5, batch_size=64)

Epoch 1/5
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 155ms/step - accuracy: 0.6194 - loss: 0.6528
Epoch 2/5
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 154ms/step - accuracy: 0.6809 - loss: 0.5956
Epoch 3/5
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 171ms/step - accuracy: 0.6918 - loss: 0.5861
Epoch 4/5
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 163ms/step - accuracy: 0.6990 - loss: 0.5741
Epoch 5/5
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 155ms/step - accuracy: 0.7066 - loss: 0.5633


<keras.src.callbacks.history.History at 0x2298f7eaab0>

In [25]:
# Predict
y_pred_LSTM = model.predict(x_test)
y_pred_LSTM = (y_pred_LSTM >= 0.5)

# Evaluate
print(f'Accuracy: {accuracy_score(y_test, y_pred_LSTM)}')
print(classification_report(y_test, y_pred_LSTM))

[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 58ms/step
Accuracy: 0.7035154595510377
              precision    recall  f1-score   support

           0       0.70      0.81      0.75      1326
           1       0.70      0.56      0.63      1035

    accuracy                           0.70      2361
   macro avg       0.70      0.69      0.69      2361
weighted avg       0.70      0.70      0.70      2361



# CNN

In [33]:
# CNN Model
model = Sequential()
model.add(Embedding(input_dim=vocab_size,
                    output_dim=300,
                    embeddings_initializer=Constant(embedding_matrix),
                    trainable=False))
# First Conv + MaxPool
model.add(Conv1D(filters=128, kernel_size=5, activation='relu', padding='same'))
model.add(MaxPooling1D(pool_size=2))

model.add(Dropout(0.5))

# Second Conv + MaxPool
model.add(Conv1D(filters=64, kernel_size=5, activation='relu', padding='same'))
model.add(MaxPooling1D(pool_size=2))

# Global Pooling + Dense Layers
model.add(GlobalMaxPooling1D())  # Required to flatten before dense
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# Compile
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(x_train, y_train, epochs=10)


Epoch 1/10
[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 21ms/step - accuracy: 0.6168 - loss: 0.6501
Epoch 2/10
[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 19ms/step - accuracy: 0.6975 - loss: 0.5748
Epoch 3/10
[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 19ms/step - accuracy: 0.7392 - loss: 0.5307
Epoch 4/10
[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 20ms/step - accuracy: 0.7636 - loss: 0.4781
Epoch 5/10
[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 20ms/step - accuracy: 0.8027 - loss: 0.4329
Epoch 6/10
[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 19ms/step - accuracy: 0.8319 - loss: 0.3748
Epoch 7/10
[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 19ms/step - accuracy: 0.8563 - loss: 0.3405
Epoch 8/10
[1m296/296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 21ms/step - accuracy: 0.8740 - loss: 0.3047
Epoch 9/10
[1m296/296[0m [32

<keras.src.callbacks.history.History at 0x229b42fc410>

In [76]:
# Predict
y_pred_CNN = model.predict(x_test)
y_pred_CNN = (y_pred_CNN >= 0.5)

# Evaluate
print(f'Accuracy: {accuracy_score(y_test, y_pred_CNN)}')
print(classification_report(y_test, y_pred_CNN))

              precision    recall  f1-score   support

           0       0.85      0.94      0.90        50
           1       0.93      0.84      0.88        50

    accuracy                           0.89       100
   macro avg       0.89      0.89      0.89       100
weighted avg       0.89      0.89      0.89       100



# BERT Embedding + SVM Classifier

# Load tokenizer and BERT model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')
bert_model.trainable = False

In [36]:
def get_bert_embeddings(texts, batch_size=32):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors='tf', padding=True, truncation=True, max_length=128)
        outputs = bert_model(**inputs)
        pooled_output = outputs.pooler_output
        embeddings.append(pooled_output)
    return tf.concat(embeddings, axis=0)

texts = df['text'].tolist()
labels = df['HS'].values

x_train_texts, x_test_texts, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2,stratify=labels, random_state=42
)

x_train_embeddings_bert = get_bert_embeddings(x_train_texts)
x_test_embeddings_bert = get_bert_embeddings(x_test_texts)

In [37]:
svm = SVC(kernel='rbf', class_weight='balanced',C=10,gamma=0.01,probability=True)
svm.fit(x_train_embeddings_bert, y_train)

In [72]:
# Predict and evaluate
y_pred_bert = model.predict(x_test_embeddings_bert)
y_pred_bert = (y_pred_bert >= 0.5).astype(int)

print(f'Accuracy: {accuracy_score(y_test, y_pred_bert)}')
print(classification_report(y_test, y_pred_bert))

              precision    recall  f1-score   support

           0       0.88      0.98      0.92        50
           1       0.98      0.86      0.91        50

    accuracy                           0.92       100
   macro avg       0.93      0.92      0.92       100
weighted avg       0.93      0.92      0.92       100



# ELMo + ANN

In [116]:
import warnings
warnings.filterwarnings('ignore')

# Load the ELMo model
elmo = hub.load("https://tfhub.dev/google/elmo/3")

# Function to get ELMo embeddings
def get_elmo_embeddings(texts, batch_size=8):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        batch_embeddings = elmo.signatures["default"](tf.constant(batch_texts))["default"]
        embeddings.append(batch_embeddings)
    return tf.concat(embeddings, axis=0)

texts = df['text'].tolist()
labels = df['HS'].values

# Train-test split
x_train_texts, x_test_texts, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Extract ELMo embeddings
x_train_embeddings = get_elmo_embeddings(x_train_texts)
x_test_embeddings = get_elmo_embeddings(x_test_texts)

In [41]:
# Build the neural network
model = Sequential()
model.add(Dense(256, activation='gelu', input_shape=(1024,)))
model.add(Dropout(0.7))
model.add(Dense(128, activation='gelu'))
model.add(Dropout(0.3))
model.add(Dense(64, activation='gelu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(x_train_embeddings, y_train, epochs=30, batch_size=8)

Epoch 1/30
[1m1181/1181[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.5971 - loss: 0.6737
Epoch 2/30
[1m1181/1181[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.6611 - loss: 0.6106
Epoch 3/30
[1m1181/1181[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.6898 - loss: 0.5796
Epoch 4/30
[1m1181/1181[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.6875 - loss: 0.5822
Epoch 5/30
[1m1181/1181[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.6973 - loss: 0.5549
Epoch 6/30
[1m1181/1181[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.7174 - loss: 0.5504
Epoch 7/30
[1m1181/1181[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.7147 - loss: 0.5437
Epoch 8/30
[1m1181/1181[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.7225 - loss: 0.5411
Epoch 9/30
[1m1181/1181

<keras.src.callbacks.history.History at 0x229bc198410>

In [74]:
# Predict and evaluate
y_pred_elmo = model.predict(x_test_embeddings)
y_pred_elmo = (y_pred_elmo >= 0.5).astype(int)

print(f'Accuracy: {accuracy_score(y_test, y_pred_elmo)}')
print(classification_report(y_test, y_pred_elmo))

              precision    recall  f1-score   support

           0       0.76      0.88      0.81        50
           1       0.86      0.72      0.78        50

    accuracy                           0.80       100
   macro avg       0.81      0.80      0.80       100
weighted avg       0.81      0.80      0.80       100



# Fusion of Best Performed Models

In [94]:
#Max Fusion
y_pred_bert = y_pred_bert.flatten()
y_pred_elmo = y_pred_elmo.flatten()
y_pred_CNN = y_pred_CNN.flatten()


y_pred_max = np.maximum(np.asarray(y_pred_bert, dtype=np.int32),
                        np.asarray(y_pred_elmo, dtype=np.int32),
                        np.asarray(y_pred_CNN, dtype=np.int32))

# Evaluate
print(f'Accuracy: {accuracy_score(y_test, y_pred_max)}')
print(classification_report(y_test, y_pred_max))

              precision    recall  f1-score   support

           0       0.82      0.92      0.87        50
           1       0.91      0.80      0.85        50

    accuracy                           0.86       100
   macro avg       0.87      0.86      0.86       100
weighted avg       0.87      0.86      0.86       100



In [114]:
#Mean Fusion
y_pred_mean = np.mean([y_pred_bert, y_pred_elmo, y_pred_CNN], axis=0)

y_pred_final = (y_pred_mean >= 0.5).astype(int)


print(f'Accuracy: {accuracy_score(y_test, y_pred_final)}')
print(classification_report(y_test, y_pred_final))


              precision    recall  f1-score   support

           0       0.98      0.94      0.96        50
           1       0.94      0.98      0.96        50

    accuracy                           0.96       100
   macro avg       0.96      0.96      0.96       100
weighted avg       0.96      0.96      0.96       100

