In [2]:
!pip install numpy pandas tensorflow keras nltk scikit-learn




In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [4]:
import re
import pandas as pd
import numpy as np

In [5]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer

In [6]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

In [7]:
from sklearn.metrics import classification_report, roc_auc_score

In [8]:
# Download the NLTK resources used by the text-cleaning step
# (word_tokenize and the English stopword list).
nltk.download('punkt')      # Punkt tokenizer models
nltk.download('punkt_tab')  # Punkt data in the format newer NLTK releases look up for word_tokenize
nltk.download('stopwords')  # stopword corpora

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

**Load training data**

In [11]:
# Read the raw training data from 'train.csv' into a DataFrame.
df = pd.read_csv('train.csv')

**Create a new binary label column 'label_sum' indicating toxic or non-toxic**

In [12]:
# Build the binary target: 1 when the row-wise sum of the six label columns
# is positive, otherwise 0.
toxicity_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
label_totals = df[toxicity_columns].sum(axis=1)
df['label_sum'] = label_totals.gt(0).astype('int64')

**Clean the text: replace non-alphabetic characters with spaces, lowercase, tokenize, and remove English stopwords**

In [13]:
# Lazily filled cache so the NLTK stopword list is read once, not once per token.
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them on first use only."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def clean_text(text, stop_words=None, tokenizer=None):
    """Normalize a raw comment into a space-joined string of lowercase tokens.

    Steps: replace every non-alphabetic character with a space, lowercase,
    tokenize, and drop stopwords.

    Args:
        text: The raw comment. ``None`` and float NaN (missing values) are
            treated as an empty comment; any other non-string is passed
            through ``str()`` first.
        stop_words: Optional container of lowercase words to drop. Defaults to
            NLTK's English stopword list (cached after the first call).
        tokenizer: Optional callable mapping a string to an iterable of
            tokens. Defaults to ``word_tokenize``.

    Returns:
        The remaining tokens joined by single spaces ('' if nothing is left).
    """
    # Missing values would make re.sub raise TypeError; treat them as empty text.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = re.sub(r"[^a-zA-Z]", " ", str(text))  # Remove non-alphabetic characters
    text = text.lower()  # Convert to lowercase

    if stop_words is None:
        # Set membership is O(1); the original re-read and scanned the list per token.
        stop_words = _english_stopwords()
    if tokenizer is None:
        tokenizer = word_tokenize

    kept_tokens = [token for token in tokenizer(text) if token not in stop_words]
    return ' '.join(kept_tokens)

**Apply text cleaning to both training and testing data**

In [14]:
# Clean every training comment, then persist only the cleaned text and the
# binary label so later cells can reload them without redoing the cleaning.
df['clean_text'] = df['comment_text'].map(clean_text)
train_export = df[['clean_text', 'label_sum']]
train_export.to_csv('cleaned_train_data.csv', index=False)

In [15]:
# Load the raw test comments, clean them with the same function used for the
# training data, and write only the cleaned text column to disk.
df_test = pd.read_csv('test.csv')
df_test['clean_text'] = df_test['comment_text'].map(clean_text)
df_test.to_csv('cleaned_test_data.csv', columns=['clean_text'], index=False)

**Reload cleaned training and test data**

In [16]:
# keep_default_na=False: a comment whose cleaned text is empty (or is a word
# such as "null" or "nan") must come back as that exact string. With the
# default NA parsing read_csv would turn those cells into NaN.
train_df = pd.read_csv('cleaned_train_data.csv', keep_default_na=False)
test_df = pd.read_csv('cleaned_test_data.csv', keep_default_na=False)

**Prepare training and testing data**

In [17]:
# fillna('') must come before astype(str): otherwise a missing cleaned comment
# (NaN after the CSV round trip) becomes the literal text 'nan', which the
# tokenizer would then treat as a real word.
X_train = train_df['clean_text'].fillna('').astype(str).values
y_train = train_df['label_sum'].values
X_test = test_df['clean_text'].fillna('').astype(str).values


**Tokenize the text data**

In [18]:
# Fit the tokenizer on the training texts only, then use that same fitted
# tokenizer to encode both the training and the test texts as integer sequences.
max_vocab_words = 20000
tokenizer = Tokenizer(num_words=max_vocab_words)
tokenizer.fit_on_texts(X_train)
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)


**Pad the sequences to the same length**

In [19]:
# Bring every encoded comment to one common length (maxlen=max_length) so the
# train and test inputs share the same shape.
max_length = 500
X_train_padded, X_test_padded = (
    pad_sequences(sequences, maxlen=max_length)
    for sequences in (X_train_seq, X_test_seq)
)

**Split the training data for validation**

In [20]:
# Hold out 20% of the padded training data for validation; the fixed
# random_state keeps the split the same across runs.
split_parts = train_test_split(
    X_train_padded,
    y_train,
    test_size=0.2,
    random_state=42,
)
X_train_split, X_val_split, y_train_split, y_val_split = split_parts

**Build a Convolutional Neural Network (CNN) model**

In [21]:
# Arguments for the Embedding layer of the model built below.
vocab_size, embedding_dim = 20000, 128

In [23]:
# Build the CNN layer by layer: embedding -> 1D convolution -> global max
# pooling -> dense hidden layer with dropout -> single sigmoid output unit.
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

In [24]:
# Configure training: Adam optimizer, binary cross-entropy loss for the single
# sigmoid output, and accuracy reported as a metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [25]:
# Print the model's layer-by-layer summary.
model.summary()

**Train the model**

In [28]:
# Train for 4 epochs with batches of 32, evaluating on the held-out
# validation split after each epoch.
validation_pair = (X_val_split, y_val_split)
model.fit(
    X_train_split,
    y_train_split,
    batch_size=32,
    epochs=4,
    validation_data=validation_pair,
)

Epoch 1/4
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m847s[0m 212ms/step - accuracy: 0.9984 - loss: 0.0049 - val_accuracy: 0.9519 - val_loss: 0.3810
Epoch 2/4
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m859s[0m 212ms/step - accuracy: 0.9983 - loss: 0.0044 - val_accuracy: 0.9522 - val_loss: 0.4846
Epoch 3/4
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m844s[0m 212ms/step - accuracy: 0.9987 - loss: 0.0040 - val_accuracy: 0.9516 - val_loss: 0.5566
Epoch 4/4
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m898s[0m 221ms/step - accuracy: 0.9985 - loss: 0.0043 - val_accuracy: 0.9523 - val_loss: 0.4928


<keras.src.callbacks.history.History at 0x7b4544420550>

In [29]:
# Save the trained model to 'toxicity_cnn_model.keras'.
model.save('toxicity_cnn_model.keras')

**Make predictions on the test data**

In [30]:
# Score the padded test comments, then turn the model outputs into hard
# 0/1 labels using a 0.5 cut-off.
y_pred = model.predict(X_test_padded)
y_pred_classes = np.greater(y_pred, 0.5).astype("int32")

[1m4787/4787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m252s[0m 53ms/step


In [31]:
# Attach the predicted labels to the test frame and export them next to the
# cleaned text.
test_df['predicted_label'] = y_pred_classes
prediction_columns = ['clean_text', 'predicted_label']
test_df.to_csv('test_predictions.csv', columns=prediction_columns, index=False)

**Evaluate model performance**

In [32]:
# Score the validation split and derive hard 0/1 labels with the same 0.5
# cut-off used for the test predictions.
y_val_pred = model.predict(X_val_split)
y_val_pred_classes = np.greater(y_val_pred, 0.5).astype("int32")

[1m998/998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 53ms/step


In [33]:
# Per-class precision / recall / F1 on the validation split.
validation_report = classification_report(y_val_split, y_val_pred_classes)
print(validation_report)

              precision    recall  f1-score   support

           0       0.97      0.98      0.97     28671
           1       0.79      0.73      0.76      3244

    accuracy                           0.95     31915
   macro avg       0.88      0.85      0.86     31915
weighted avg       0.95      0.95      0.95     31915



**Calculate and print ROC-AUC score**

In [34]:
# ROC-AUC is computed from the raw model outputs (y_val_pred), not from the
# thresholded 0/1 labels.
roc_auc = roc_auc_score(y_true=y_val_split, y_score=y_val_pred)
print(f'Validation ROC-AUC Score: {roc_auc}')

Validation ROC-AUC Score: 0.9461541156074778
