# End-to-End Hate Speech Detection with Python

## Introduction
Hate speech is a serious issue on social media platforms. In this notebook, we will build an end-to-end hate speech detection system using Python and deep learning.

## Requirements

In [7]:
import pandas as pd

# Load dataset
data_url = 'twitter.csv'
df = pd.read_csv(data_url)

# Display the first few rows
df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


## Data Preprocessing

In [12]:
import re
from sklearn.model_selection import train_test_split

def preprocess_text(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text)  # Remove @mentions
    text = re.sub(r'#', '', text)  # Remove hashtags
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    return text

# Apply preprocessing to the 'tweet' column
df['cleaned_text'] = df['tweet'].apply(preprocess_text)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_text'], df['class'], test_size=0.2, random_state=42)

## Building the Model

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenization
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

max_length = max([len(x) for x in X_train_seq])
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

# Building the model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=64, input_length=max_length))
model.add(LSTM(100))
model.add(Dense(3, activation='softmax'))  # Change to 3 classes

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])



In [17]:
model.fit(X_train_pad, y_train, epochs=5, batch_size=64, validation_data=(X_test_pad, y_test))

Epoch 1/5
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 25ms/step - accuracy: 0.7605 - loss: 0.6978 - val_accuracy: 0.7730 - val_loss: 0.6665
Epoch 2/5
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 24ms/step - accuracy: 0.7751 - loss: 0.6627 - val_accuracy: 0.7730 - val_loss: 0.6656
Epoch 3/5
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 24ms/step - accuracy: 0.7768 - loss: 0.6607 - val_accuracy: 0.7730 - val_loss: 0.6671
Epoch 4/5
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 24ms/step - accuracy: 0.7776 - loss: 0.6531 - val_accuracy: 0.7730 - val_loss: 0.6653
Epoch 5/5
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 25ms/step - accuracy: 0.7727 - loss: 0.6691 - val_accuracy: 0.7730 - val_loss: 0.6668


<keras.src.callbacks.history.History at 0x273a546bf50>

In [19]:
def predict_hate_speech(text):
    cleaned_text = preprocess_text(text)
    seq = tokenizer.texts_to_sequences([cleaned_text])
    padded = pad_sequences(seq, maxlen=max_length, padding='post')
    prediction = model.predict(padded)
    classes = ['Hate Speech', 'Offensive Language', 'Neither']
    return classes[prediction.argmax()]

In [21]:
predict_hate_speech("Let those who stole my smiles, those condemned to her eternal prison, stay.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 127ms/step


'Offensive Language'