In [11]:
import pandas as pd

# Load dataset
file_path = "/content/hate_speech.csv"
df = pd.read_csv(file_path)

# Rename columns if necessary
df.rename(columns={'class': 'label', 'tweet': 'text'}, inplace=True)

# Display first few rows
print(df.head())


   label                                               text
0      2  !!! RT @mayasolovely: As a woman you shouldn't...
1      1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2      1  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3      1  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4      1  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [12]:
import re

def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# Apply text cleaning
df['text'] = df['text'].astype(str).apply(clean_text)


In [13]:
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Convert labels to categorical (0: Non-hate, 1: Offensive, 2: Hate Speech)
y = to_categorical(df['label'])

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(df['text'], y, test_size=0.2, random_state=42)


In [14]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define tokenizer parameters
MAX_NB_WORDS = 50000  # Max unique words
MAX_SEQUENCE_LENGTH = 100  # Max words per text

# Tokenization
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(X_train)

# Convert text to sequences and pad them
X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=MAX_SEQUENCE_LENGTH)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=MAX_SEQUENCE_LENGTH)


In [18]:
import os

# Update path to GloVe embeddings file
glove_path = "glove.6B.100d.txt"

# Load GloVe embeddings
embeddings_index = {}
with open(glove_path, encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        embeddings_index[word] = np.asarray(values[1:], dtype='float32')


In [20]:
import numpy as np



# Define embedding parameters
EMBEDDING_DIM = 100  # Size of GloVe vectors



# Create embedding matrix
word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    if i < MAX_NB_WORDS:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector


In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D

# Define LSTM model
model = Sequential([
    Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False),
    SpatialDropout1D(0.2),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(y_train.shape[1], activation='softmax')  # Output layer for multi-class classification
])

# Compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Show model summary
model.summary()




In [24]:
history = model.fit(X_train_seq, y_train, epochs=5, batch_size=64, validation_data=(X_test_seq, y_test))


Epoch 1/5
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 348ms/step - accuracy: 0.7748 - loss: 0.9123 - val_accuracy: 0.7730 - val_loss: 0.8105
Epoch 2/5
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 338ms/step - accuracy: 0.7754 - loss: 0.7857 - val_accuracy: 0.7730 - val_loss: 0.7360
Epoch 3/5
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 320ms/step - accuracy: 0.7762 - loss: 0.7213 - val_accuracy: 0.7730 - val_loss: 0.7005
Epoch 4/5
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 315ms/step - accuracy: 0.7714 - loss: 0.6981 - val_accuracy: 0.7730 - val_loss: 0.6833
Epoch 5/5
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 339ms/step - accuracy: 0.7717 - loss: 0.6832 - val_accuracy: 0.7730 - val_loss: 0.6746


In [25]:
# Evaluate model performance
loss, accuracy = model.evaluate(X_test_seq, y_test)
print(f"Test Accuracy: {accuracy:.4f}")


[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 51ms/step - accuracy: 0.7743 - loss: 0.6742
Test Accuracy: 0.7730


In [28]:
def predict_text(text):
    cleaned_text = clean_text(text)  # Clean input text
    sequence = tokenizer.texts_to_sequences([cleaned_text])
    padded_sequence = pad_sequences(sequence, maxlen=MAX_SEQUENCE_LENGTH)

    prediction = model.predict(padded_sequence)
    label = np.argmax(prediction)  # Get class index

    # Map numeric label back to category name
    label_mapping = {0: "Non-Hate Speech", 1: "Offensive Language", 2: "Hate Speech"}
    return label_mapping[label]

# Example prediction
print(predict_text("I hate this person!"))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
Offensive Language
