In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/hate-speech-and-offensive-language-dataset/labeled_data.csv


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd # Import the pandas library

In [3]:
from imblearn.over_sampling import SMOTE
import re # Import the regex library

In [4]:
# Load the dataset (replace 'your_dataset.csv' with the actual file name)
data = pd.read_csv('/kaggle/input/hate-speech-and-offensive-language-dataset/labeled_data.csv') 
# Load data into the 'data' variable

In [10]:
data.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,rt mayasolovely as a woman you shouldn t...
1,1,3,0,3,0,1,rt mleew17 boy dats cold tyga dwn ba...
2,2,3,0,3,0,1,rt urkindofbrand dawg rt 80sbaby...
3,3,3,0,2,1,1,rt c_g_anderson viva_based she lo...
4,4,6,0,6,0,1,rt shenikaroberts the shit you...


In [5]:
# Text Preprocessing: Lowercase conversion, remove special characters, etc.
def preprocess_text(text):
    text = text.lower() # convert to lowercase
    text = re.sub(r'\W', ' ', text) # remove special characters
    return text

In [6]:
data['tweet'] = data['tweet'].apply(preprocess_text)

# Prepare the data for classification
X = data['tweet']
y = data['class']

In [7]:
X.size

24783

In [8]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Vectorize the tweet data for SVM and Random Forest using TF-IDF with bigrams
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [11]:
# Address Class Imbalance using SMOTE (Apply SMOTE after TF-IDF)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)

In [13]:
X_train_resampled.size

702793

In [14]:
# Define the SVM model
svm_model = SVC()
svm_model.fit(X_train_tfidf, y_train)
# Evaluate both models on the test set
svm_predictions = svm_model.predict(X_test_tfidf)
# Classification report for SVM
svm_report = classification_report(y_test, svm_predictions)

In [15]:
print(svm_report)

              precision    recall  f1-score   support

           0       0.60      0.10      0.17       290
           1       0.90      0.97      0.94      3832
           2       0.85      0.80      0.82       835

    accuracy                           0.89      4957
   macro avg       0.79      0.62      0.64      4957
weighted avg       0.88      0.89      0.87      4957



In [16]:
# Define the Random Forest model
rf_model = RandomForestClassifier()
rf_model.fit(X_train_tfidf, y_train)
rf_predictions = rf_model.predict(X_test_tfidf)
# Classification report for Random Forest
rf_report = classification_report(y_test, rf_predictions)

In [17]:
print(rf_report)

              precision    recall  f1-score   support

           0       0.45      0.10      0.16       290
           1       0.90      0.97      0.93      3832
           2       0.85      0.77      0.81       835

    accuracy                           0.88      4957
   macro avg       0.73      0.61      0.63      4957
weighted avg       0.86      0.88      0.87      4957



In [18]:
# Now let's prepare the data for the LSTM model
max_words = 10000
max_len = 100

# Tokenizing and padding for LSTM
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

In [19]:
# Define a deeper LSTM model
lstm_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5), # Adding dropout for regularization
    tf.keras.layers.Dense(3, activation='softmax')
])

lstm_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the LSTM model
lstm_model.fit(X_train_pad, y_train, epochs=5, batch_size=64, validation_data=(X_test_pad, y_test))

# Evaluate the LSTM model
lstm_loss, lstm_accuracy = lstm_model.evaluate(X_test_pad, y_test)


Epoch 1/5




[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 263ms/step - accuracy: 0.8195 - loss: 0.5284 - val_accuracy: 0.9022 - val_loss: 0.2957
Epoch 2/5
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 260ms/step - accuracy: 0.9191 - loss: 0.2499 - val_accuracy: 0.9038 - val_loss: 0.2769
Epoch 3/5
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 264ms/step - accuracy: 0.9400 - loss: 0.1751 - val_accuracy: 0.8967 - val_loss: 0.3082
Epoch 4/5
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 250ms/step - accuracy: 0.9589 - loss: 0.1181 - val_accuracy: 0.8931 - val_loss: 0.4087
Epoch 5/5
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 252ms/step - accuracy: 0.9732 - loss: 0.0785 - val_accuracy: 0.8648 - val_loss: 0.4951
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 62ms/step - accuracy: 0.8594 - loss: 0.5248


In [20]:
# Generate classification reports for SVM, Random Forest, and LSTM
print("SVM Report:\n", svm_report)
print("Random Forest Report:\n", rf_report)
print("LSTM Accuracy: ", lstm_accuracy)

SVM Report:
               precision    recall  f1-score   support

           0       0.60      0.10      0.17       290
           1       0.90      0.97      0.94      3832
           2       0.85      0.80      0.82       835

    accuracy                           0.89      4957
   macro avg       0.79      0.62      0.64      4957
weighted avg       0.88      0.89      0.87      4957

Random Forest Report:
               precision    recall  f1-score   support

           0       0.45      0.10      0.16       290
           1       0.90      0.97      0.93      3832
           2       0.85      0.77      0.81       835

    accuracy                           0.88      4957
   macro avg       0.73      0.61      0.63      4957
weighted avg       0.86      0.88      0.87      4957

LSTM Accuracy:  0.8648375868797302
