<a href="https://colab.research.google.com/github/iskanor1/Comments-were-classified-as-toxic-and-non-toxic-using-a-binary-LSTM/blob/main/Comments_were_classified_as_toxic_and_non_toxic_using_a_binary_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pickle

In [3]:

url = "https://raw.githubusercontent.com/t-davidson/hate-speech-and-offensive-language/master/data/labeled_data.csv"
df = pd.read_csv(url)

In [4]:
df.rename(columns={'tweet': 'comment_text'}, inplace=True)
df['toxic'] = df['class'].apply(lambda x: 1 if x in [0, 1] else 0)

MAX_NUM_WORDS = 20000
MAX_SEQUENCE_LENGTH = 150
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(df['comment_text'].astype(str))
sequences = tokenizer.texts_to_sequences(df['comment_text'].astype(str))
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(df['toxic'])

X_train, X_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.2, random_state=42
)

In [None]:
model = Sequential([
    Embedding(MAX_NUM_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH),
    Bidirectional(LSTM(64, return_sequences=True, recurrent_dropout=0.3)),
    Dropout(0.5),
    BatchNormalization(),
    Bidirectional(LSTM(32, recurrent_dropout=0.3)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=1e-3),
    metrics=['accuracy']
)

# Early stopping
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=4,
    batch_size=64,
    callbacks=[early_stop],
    verbose=1
)

y_pred = (model.predict(X_test) > 0.5).astype("int32")
print(classification_report(y_test, y_pred))

model.save("toxic_comment_model.h5")
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

def predict_comment(comment):
    seq = tokenizer.texts_to_sequences([comment])
    padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
    pred = model.predict(padded)[0][0]
    label = 1 if pred > 0.5 else 0
    print(f"\nComment: {comment}")
    print(f"Toxicity Prediction: {'TOXIC' if label==1 else 'NON-TOXIC'} (Score: {pred:.4f})\n")



Epoch 1/4
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m209s[0m 786ms/step - accuracy: 0.8605 - loss: 0.3660 - val_accuracy: 0.9586 - val_loss: 0.1294
Epoch 2/4
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 782ms/step - accuracy: 0.9674 - loss: 0.0939 - val_accuracy: 0.9395 - val_loss: 0.1428
Epoch 3/4
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 775ms/step - accuracy: 0.9840 - loss: 0.0516 - val_accuracy: 0.9511 - val_loss: 0.1558
Epoch 4/4
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 741ms/step - accuracy: 0.9894 - loss: 0.0375

In [None]:
while True:
    user_input = input("Enter a comment (or 'exit' to quit): ")
    if user_input.lower() == 'exit':
        break
    predict_comment(user_input)

Enter a comment (or 'exit' to quit): fuck you mother fucker
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step

Comment: fuck you mother fucker
Toxicity Prediction: TOXIC (Score: 0.9987)

Enter a comment (or 'exit' to quit): i love you 
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step

Comment: i love you 
Toxicity Prediction: NON-TOXIC (Score: 0.1746)

Enter a comment (or 'exit' to quit): can you marry me?
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 134ms/step

Comment: can you marry me?
Toxicity Prediction: NON-TOXIC (Score: 0.0764)

Enter a comment (or 'exit' to quit): i will burn world for you
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step

Comment: i will burn world for you
Toxicity Prediction: NON-TOXIC (Score: 0.0491)

Enter a comment (or 'exit' to quit): kosamk
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step

Comment: kosamk
Toxicity Prediction: NON-TOXIC (Score: 0.231