In [None]:
#source: https://www.kaggle.com/sathianpong/multi-label-classification-baseline/notebook?select=train_preprocessed.csv

In [21]:
import os

# Location of the preprocessed training CSV. Set the TOXIC_TRAIN_CSV
# environment variable to run the notebook on another machine; when it is
# unset the original hard-coded location is used.
train_path = os.environ.get(
    "TOXIC_TRAIN_CSV", "/home/jason/toxic_model/train_preprocessed.csv"
)

import warnings

import numpy as np
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell

# Echo the value of every top-level expression in a cell, not only the last
# one (this is why cells below show several outputs).
InteractiveShell.ast_node_interactivity = "all"
# Silence all Python warnings for the rest of the session.
# NOTE(review): this also hides pandas/TensorFlow deprecation and
# copy-vs-view warnings; consider narrowing the filter.
warnings.filterwarnings("ignore")

In [22]:
# Load the preprocessed training set and drop the columns this notebook does
# not use.
df = pd.read_csv(train_path)
df = df.drop(["id", "set", "toxicity"], axis=1)

# Show the size and first rows of the full frame before sub-sampling.
df.shape
df.head()
# Keep only the first 500 rows for quick experimentation. `.copy()` makes the
# subset an independent frame, so the later `df["comment_text"] = ...`
# assignments do not write into a slice of the full frame (which triggers
# pandas' SettingWithCopyWarning).
df = df.head(500).copy()

(159571, 7)

Unnamed: 0,comment_text,identity_hate,insult,obscene,severe_toxic,threat,toxic
0,explanation why the edits made under my userna...,0.0,0.0,0.0,0.0,0.0,0.0
1,d aww he matches this background colour i m s...,0.0,0.0,0.0,0.0,0.0,0.0
2,hey man i m really not trying to edit war it...,0.0,0.0,0.0,0.0,0.0,0.0
3,more i can t make any real suggestions on im...,0.0,0.0,0.0,0.0,0.0,0.0
4,you sir are my hero any chance you remember...,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Every column other than the comment text is treated as a target label.
labels = df.columns.tolist()
labels.remove("comment_text")
labels

['identity_hate', 'insult', 'obscene', 'severe_toxic', 'threat', 'toxic']

In [24]:
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# Stop-word list: NLTK's English stop words plus a few extra number words and
# connectives.
stop_words = set(stopwords.words("english"))
stop_words.update(
    [
        "zero",
        "one",
        "two",
        "three",
        "four",
        "five",
        "six",
        "seven",
        "eight",
        "nine",
        "ten",
        "may",
        "also",
        "across",
        "among",
        "beside",
        "however",
        "yet",
        "within",
    ]
)
# Match a whole stop word followed by either a non-word character or the end
# of the string. Requiring a trailing non-word character alone would leave a
# stop word untouched when it is the last token of a comment.
# Words are regex-escaped and ordered longest-first (ties alphabetical) so the
# compiled pattern is deterministic and prefers the longest alternative,
# instead of depending on set iteration order.
re_stop_words = re.compile(
    r"\b(?:"
    + "|".join(
        re.escape(word)
        for word in sorted(stop_words, key=lambda w: (-len(w), w))
    )
    + r")(?:\W|$)",
    re.I,
)


def removeStopWords(sentence):
    """Replace every match of the module-level `re_stop_words` pattern in
    `sentence` with a single space and return the resulting string."""
    # Only reading the module-level pattern, so no `global` statement is needed.
    return re_stop_words.sub(" ", sentence)


# Run stop-word removal on every comment; the column is overwritten, so
# re-running this cell operates on already-filtered text.
df["comment_text"] = df["comment_text"].apply(removeStopWords)

# English Snowball stemmer used by `stemming` below.
stemmer = SnowballStemmer("english")


def stemming(sentence):
    """Return `sentence` with every whitespace-separated token stemmed.

    Each token is passed through the module-level `stemmer`; the stems are
    re-joined with single spaces and surrounding whitespace is trimmed.
    """
    stems = [stemmer.stem(token) for token in sentence.split()]
    return " ".join(stems).strip()


# Stem every comment; the column is overwritten with the stemmed text.
df["comment_text"] = df["comment_text"].apply(stemming)

In [25]:
import tensorflow as tf
from tensorflow.keras import layers

# Upper bound on the encoder's vocabulary size.
MAX_VOCAB = 500

# Model inputs (the processed comment strings) and the label matrix
# (one column per entry in `labels`).
sequences = df["comment_text"].values
targets = df[labels].values

# Text-vectorization layer; its vocabulary is learned from the comments.
# NOTE(review): the vocabulary is fitted on all rows, including those later
# held out by the train/test split.
encoder = layers.experimental.preprocessing.TextVectorization(
    standardize="lower_and_strip_punctuation",
    max_tokens=MAX_VOCAB,
)
encoder.adapt(sequences)

In [27]:
# Assemble the network layer by layer:
# raw strings -> token ids -> embeddings -> LSTM -> dense -> one sigmoid per label.
model = tf.keras.Sequential()
model.add(encoder)
model.add(
    tf.keras.layers.Embedding(
        # The embedding table is sized to the vocabulary the encoder learned.
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        mask_zero=True,
    )
)
model.add(tf.keras.layers.LSTM(64))
model.add(tf.keras.layers.Dense(64, activation="relu"))
# One independent sigmoid output per label column.
model.add(tf.keras.layers.Dense(targets.shape[1], activation="sigmoid"))

In [28]:
# Multi-label setup: each output unit is an independent sigmoid, so the loss is
# binary cross-entropy on probabilities (from_logits=False).
# The reported metrics must use the same per-label binary framing; a
# categorical cross-entropy metric assumes one-hot, mutually exclusive classes
# and is not meaningful here.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    optimizer=tf.keras.optimizers.Adam(1e-4),
    metrics=[
        # Fraction of (sample, label) pairs predicted correctly at 0.5.
        tf.keras.metrics.BinaryAccuracy(name="binary_accuracy"),
        # ROC AUC computed per label and then averaged.
        tf.keras.metrics.AUC(multi_label=True, name="auc"),
    ],
)

In [49]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    sequences,
    targets,
    random_state=42,
    test_size=0.1,
)
# Display the shapes of the training inputs and training labels.
X_train.shape
y_train.shape

(450,)

(450, 6)

In [30]:
# Train for at most 10 epochs in batches of 32, with early stopping
# (patience of 5 epochs). Assigning to `_` keeps the returned value out of the
# cell output.
# NOTE(review): the held-out split is used as validation data here and is also
# the set the model is evaluated on afterwards.
_ = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_test, y_test),
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=5)],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [63]:
# Model outputs for the held-out comments; the final layer is a sigmoid with
# one unit per label, so `pred` is indexed below as pred[:, label_index].
pred = model.predict(X_test)

In [64]:
from sklearn.metrics import classification_report

# Decision threshold applied to each predicted score.
THRESH = 0.5
# Print one classification report per label column.
for i, label in enumerate(labels):
    y_true = y_test[:, i]
    # Scores strictly above the threshold count as a positive prediction.
    y_pred = (pred[:, i] > THRESH).astype(int)
    print(f"======={label}")
    print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        50

    accuracy                           1.00        50
   macro avg       1.00      1.00      1.00        50
weighted avg       1.00      1.00      1.00        50

              precision    recall  f1-score   support

         0.0       0.94      1.00      0.97        47
         1.0       0.00      0.00      0.00         3

    accuracy                           0.94        50
   macro avg       0.47      0.50      0.48        50
weighted avg       0.88      0.94      0.91        50

              precision    recall  f1-score   support

         0.0       0.96      1.00      0.98        48
         1.0       0.00      0.00      0.00         2

    accuracy                           0.96        50
   macro avg       0.48      0.50      0.49        50
weighted avg       0.92      0.96      0.94        50

              precision    recall  f1-score   support

         0.0      