In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import zipfile

from imblearn.over_sampling import SMOTE
from keras import models
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import CategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.python.framework.errors_impl import InvalidArgumentError
from tensorflow.python.keras import backend as K

In [None]:
# Vocabulary cap. Passed to CountVectorizer below and read by get_encoder() as the
# TextVectorization max_tokens value.
MAX_VOCAB = 500

# NOTE(review): cntizer, tfizer and smote are constructed here but are not referenced
# anywhere else in the visible notebook — presumably leftovers from an earlier
# bag-of-words / oversampling experiment; confirm before removing.
cntizer = CountVectorizer(analyzer="word", max_features=MAX_VOCAB, max_df=0.7, min_df=0.1)
tfizer = TfidfTransformer()
smote = SMOTE()

# NLTK's English stop words plus a few number words and connectives that carry no
# signal for this task.
stop_words = set(stopwords.words("english"))
stop_words.update(
    [
        "zero",
        "one",
        "two",
        "three",
        "four",
        "five",
        "six",
        "seven",
        "eight",
        "nine",
        "ten",
        "may",
        "also",
        "across",
        "among",
        "beside",
        "however",
        "yet",
        "within",
    ]
)

# Case-insensitive pattern matching one stop word plus the separator that follows it.
# * Alternatives are sorted longest-first (then alphabetically) so the pattern is
#   deterministic and a contraction such as "don't" is tried before its prefix "don";
#   iterating the raw set gave a run-dependent alternation order.
# * `(?:\W|$)` also accepts end-of-string, so a stop word that is the last token of a
#   comment is removed too (a bare `\W` required a trailing character).
# * re.escape keeps any future entry containing regex metacharacters literal.
re_stop_words = re.compile(
    r"\b("
    + "|".join(
        re.escape(word)
        for word in sorted(stop_words, key=lambda w: (-len(w), w))
    )
    + r")(?:\W|$)",
    re.I,
)

# Shared stemmer instance used by stemming().
stemmer = SnowballStemmer("english")


def removeStopWords(sentence):
    """Return *sentence* with every match of the module-level ``re_stop_words``
    pattern replaced by a single space."""
    # Read-only access to the module-level pattern; no `global` declaration is needed.
    cleaned = re_stop_words.sub(" ", sentence)
    return cleaned

def stemming(sentence):
    """Stem each whitespace-separated token of *sentence* with the module-level
    ``stemmer`` and return the stems joined by single spaces.

    Leading and trailing whitespace is stripped from the result.
    """
    stems = [stemmer.stem(token) for token in sentence.split()]
    # strip() mirrors the original trailing cleanup (relevant if an edge stem is empty).
    return " ".join(stems).strip()

def get_labels(df):
    """Return the names of the target columns of *df*.

    These are all columns except the bookkeeping columns ``id``, ``set`` and
    ``toxicity`` and the text column ``comment_text``. *df* itself is not modified.
    """
    bookkeeping_columns = ["id", "set", "toxicity"]
    label_names = df.drop(columns=bookkeeping_columns).columns.tolist()
    label_names.remove("comment_text")
    return label_names

def pre_process(df):
    """Turn the raw comment frame into model inputs.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``comment_text`` plus the columns expected by ``get_labels``.

    Returns
    -------
    sequences : numpy.ndarray
        Object array of cleaned comments (stop words removed, then stemmed).
    targets : numpy.ndarray
        Values of the label columns, one row per comment.
    labels : list
        Label column names, in the column order of ``targets``.

    Notes
    -----
    *df* is left untouched. Writing the cleaned text back into the caller's frame
    made the calling cell non-idempotent: a second run would stem text that had
    already been stemmed.
    """
    labels = get_labels(df)

    cleaned_text = (
        df["comment_text"]
        .fillna("")  # missing comments would otherwise reach re.sub as float NaN
        .astype(str)
        .apply(removeStopWords)
        .apply(stemming)
    )

    sequences = cleaned_text.values
    targets = df[labels].values

    return sequences, targets, labels

In [None]:
def calculating_class_weights(y_true):
    """Compute 'balanced' class weights separately for every label column.

    Parameters
    ----------
    y_true : numpy.ndarray
        2-D array indexed as ``y_true[:, i]``; one column per label, values 0/1.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_labels, 2)``; row ``i`` holds the weights for class 0
        and class 1 of label ``i``.
    """
    number_dim = np.shape(y_true)[1]
    # scikit-learn requires these arguments by keyword in current releases
    # (positional use raises TypeError) and documents `classes` as an ndarray.
    classes = np.array([0.0, 1.0])
    weights = np.empty([number_dim, 2])
    for i in range(number_dim):
        weights[i] = compute_class_weight(
            class_weight="balanced", classes=classes, y=y_true[:, i]
        )
    return weights

def get_weighted_loss(weights):
    """Build a loss function that scales per-label binary cross-entropy by class weight.

    *weights* is indexed as ``weights[:, 0]`` (weight applied where the target is 0)
    and ``weights[:, 1]`` (weight applied where the target is 1), e.g. the array
    returned by ``calculating_class_weights``.
    """

    def weighted_loss(y_true, y_pred):
        # The exponent selects the weight: w0**(1-y) * w1**y equals w1 when y == 1
        # and w0 when y == 0.
        negative_scale = weights[:, 0] ** (1 - y_true)
        positive_scale = weights[:, 1] ** (y_true)
        per_label_loss = K.binary_crossentropy(y_true, y_pred)
        return K.mean(negative_scale * positive_scale * per_label_loss, axis=-1)

    return weighted_loss

def load_model(compile=True):
    """Load the model stored under ``'toxic_model'`` via ``models.load_model``.

    ``compile`` is forwarded unchanged to the underlying Keras loader.
    """
    restored = models.load_model('toxic_model', compile=compile)
    return restored

def classify(sample, model, labels, threshold=0.5):
    """Classify a single text sample against every label.

    Parameters
    ----------
    sample : str
        Text to classify; it is wrapped in a one-element list for ``model.predict``.
    model :
        Object exposing ``predict``; the first row of its output is read as one
        score per label.
    labels : sequence
        Label names, in the same order as the model outputs.
    threshold : float, optional
        A label is reported ``True`` when its score is strictly greater than this
        value. Defaults to 0.5, the previously hard-coded cut-off.

    Returns
    -------
    dict
        ``{label: bool}``. Empty if prediction raised ``InvalidArgumentError``
        (the error message is printed in that case).
    """
    predictions = {}
    try:
        prediction = model.predict([sample])
    except InvalidArgumentError as e:
        # Best-effort: report the problem and hand back an empty result.
        print(e.message)
        return predictions

    scores = prediction[0]
    for index, label in enumerate(labels):
        # bool() converts a possible numpy bool into a plain Python bool.
        predictions[label] = bool(scores[index] > threshold)

    return predictions

def get_encoder(sequences):
    """Create a ``TextVectorization`` layer limited to ``MAX_VOCAB`` tokens, adapt
    it to *sequences* and return it."""
    vectorizer = TextVectorization(
        standardize="lower_and_strip_punctuation",
        max_tokens=MAX_VOCAB,
    )
    vectorizer.adapt(sequences)
    return vectorizer


def get_model(sequences, targets, loss_function):
    """Build and compile the text-in, multi-label-out classifier.

    Parameters
    ----------
    sequences :
        Text samples used to adapt the vectorization layer (see ``get_encoder``).
    targets : numpy.ndarray
        Label matrix; ``targets.shape[1]`` sets the number of sigmoid outputs.
    loss_function :
        Loss passed to ``model.compile``.

    Returns
    -------
    A compiled ``Sequential`` model: encoder -> Embedding(64) -> LSTM(64) ->
    Dense(64, relu) -> Dense(n_labels, sigmoid).
    """
    encoder = get_encoder(sequences)
    vocabulary_size = len(encoder.get_vocabulary())

    model = Sequential(
        [
            encoder,
            Embedding(input_dim=vocabulary_size, output_dim=64, mask_zero=True),
            LSTM(64),
            Dense(64, activation="relu"),
            # One independent sigmoid unit per label (multi-label, not multi-class).
            Dense(targets.shape[1], activation="sigmoid"),
        ]
    )

    model.compile(
        loss=loss_function,
        optimizer=Adam(1e-4),
        # Categorical cross-entropy assumes one-hot targets and is not meaningful
        # for independent sigmoid outputs; report per-label binary accuracy instead.
        metrics=["binary_accuracy"],
    )

    return model


def train(sequences, targets, labels, model, model_path="toxic_model"):
    """Fit *model*, print a per-label classification report and save the model.

    Parameters
    ----------
    sequences, targets :
        Inputs and label matrix; split 80/20 with a fixed seed (``random_state=42``).
    labels : sequence
        Label names matching the columns of *targets*; used for the report headers.
    model :
        Compiled Keras model to train (e.g. from ``get_model``).
    model_path : str, optional
        Where the trained model is saved. Defaults to ``"toxic_model"``, the
        previously hard-coded location. Pass distinct paths when training several
        models, otherwise each call overwrites the previous save.

    Notes
    -----
    The hold-out split doubles as the early-stopping validation set, so the
    printed report is not a fully independent estimate.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        sequences, targets, test_size=0.2, random_state=42
    )

    model.fit(
        X_train,
        y_train,
        epochs=20,
        batch_size=16,
        validation_data=(X_test, y_test),
        # Without restore_best_weights the model keeps the weights from the final
        # epoch, i.e. up to `patience` epochs after the best validation loss.
        callbacks=[EarlyStopping(patience=5, restore_best_weights=True)],
    )

    pred = model.predict(X_test)

    THRESH = 0.5
    for i, label in enumerate(labels):
        y_true = y_test[:, i]
        y_pred = (pred[:, i] > THRESH).astype(int)
        print(f"======={label}")
        print(classification_report(y_true, y_pred))

    model.save(model_path)



In [None]:
# Set READ_CSV to False only when `df` is already loaded in the running kernel;
# on a fresh kernel the frame must be read here.
READ_CSV = True
DATA_PATH = '/kaggle/input/cleaned-toxic-comments/train_preprocessed.csv'
# Set to an integer (e.g. 1000) to keep only the first rows for a quick smoke run.
SAMPLE_ROWS = None

if READ_CSV:
    df = pd.read_csv(DATA_PATH)
if SAMPLE_ROWS is not None:
    df = df.head(SAMPLE_ROWS)
print(df.shape)

In [None]:
# True: rebuild the inputs from `df`. False: reload the arrays cached by the save cell.
GENERATE_INPUTS = True
if not GENERATE_INPUTS:
    labels = get_labels(df)
    # SECURITY: allow_pickle=True unpickles arbitrary objects on load. Only load
    # cache files that this notebook wrote itself.
    sequences = np.load('sequences.p', allow_pickle=True)
    targets = np.load('targets.p', allow_pickle=True)
else:
    sequences, targets, labels = pre_process(df)

# Quick sanity check of the two arrays.
print(sequences.shape)
print(targets.shape)

In [None]:
# Cache the prepared arrays so later runs can skip pre-processing.
SAVE_INPUTS = True
if SAVE_INPUTS:
    # Writing through an open handle keeps the exact file names; passing a path
    # string to np.save would append ".npy".
    for cache_path, array in (('sequences.p', sequences), ('targets.p', targets)):
        with open(cache_path, 'wb') as handle:
            np.save(handle, array)

In [None]:
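# NOTE: superseded experiment, kept for reference only. It calls
# train(sequences, targets, labels, loss_function), an older signature; the current
# train() expects an already-built model as its fourth argument, so this cell will
# not run as written.
# USE_WEIGHTED_LOSS = False
# if USE_WEIGHTED_LOSS:
#     class_weights = calculating_class_weights(targets)
#     print(class_weights)
#     loss_function = get_weighted_loss(class_weights)
# else:
#     loss_function = BinaryCrossentropy(from_logits=False)

# train(sequences, targets, labels, loss_function)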

In [None]:
# Build two identically shaped models that differ only in their loss:
# class-weighted binary cross-entropy vs. plain binary cross-entropy.
class_weights = calculating_class_weights(targets)
weighted_loss_fn = get_weighted_loss(class_weights)
plain_loss_fn = BinaryCrossentropy(from_logits=False)

weighted_model = get_model(sequences, targets, weighted_loss_fn)
unweighted_model = get_model(sequences, targets, plain_loss_fn)

# NOTE(review): train() saves to the same 'toxic_model' location on every call, so
# the second call replaces the model saved by the first.
for candidate in (weighted_model, unweighted_model):
    train(sequences, targets, labels, candidate)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    sequences, targets, test_size=0.2, random_state=42
)

weighted_pred = weighted_model.predict(X_test)
unweighted_model = unweighted_model.predict(X_test)

In [None]:
# np.mean(weighted_pred, unweighted_model)
pred = np.mean( np.array([ weighted_pred, unweighted_model ]), axis=0 )
THRESH = 0.5
for i in range(len(labels)):
    y_true = y_test[:, i]
    y_pred = (pred[:, i] > THRESH).astype(int)
    disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_true, y_pred), display_labels=[0, 1])
    print(labels[i])
    print(classification_report(y_true, y_pred))
    disp.plot()
    plt.show()

In [None]:
# When True, the saved 'toxic_model/' directory is packed into 'toxic_model.zip'
# at the end of this cell.
ZIP_MODEL = True

def zipdir(path, ziph):
    """Add every file found under *path* to the open zip handle *ziph*.

    Archive names are computed relative to the parent of *path*, so the name of
    the top-level folder is kept inside the archive. Only files are written;
    directories that contain no files produce no entry.
    """
    archive_base = os.path.join(path, '..')
    for folder, _subdirs, filenames in os.walk(path):
        for filename in filenames:
            full_path = os.path.join(folder, filename)
            ziph.write(full_path, os.path.relpath(full_path, archive_base))

if ZIP_MODEL:
    # The context manager closes (and finalises) the archive even if zipdir raises,
    # which the explicit close() call did not guarantee.
    with zipfile.ZipFile('toxic_model.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
        zipdir('toxic_model/', zipf)


In [None]:
# Reload the model stored under 'toxic_model' without restoring its compile state
# (compile=False), then compile it with Keras defaults.
# NOTE(review): presumably compile=False avoids having to deserialise a custom loss
# such as weighted_loss — confirm.
model = load_model(compile=False)
model.compile()

In [None]:



# Rebuild the hold-out split with the same seed and test fraction that train() uses.
X_train, X_test, y_train, y_test = train_test_split(
    sequences,
    targets,
    random_state=42,
    test_size=0.2,
)

pred = model.predict(X_test)

# Per-label classification report and confusion matrix for the reloaded model.
THRESH = 0.5
for column, label_name in enumerate(labels):
    y_true = y_test[:, column]
    y_pred = (pred[:, column] > THRESH).astype(int)
    matrix = confusion_matrix(y_true, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=[0, 1])
    print(label_name)
    print(classification_report(y_true, y_pred))
    disp.plot()
    plt.show()