In [89]:
import pandas as pd
import numpy as np
import pickle
import nltk
import random
from sklearn.model_selection import train_test_split
from pre_processing.textProcessing import TextPreProcessor
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import utils
from tensorflow.keras.layers import TextVectorization
from keras.utils import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import precision_score, recall_score, auc, accuracy_score

def evaluate_classifier(y_true, y_pred):
    accr=accuracy_score(y_true, y_pred)
    precision=precision_score(y_true,y_pred)
    recall=recall_score(y_true,y_pred)
    print(f"Accuracy: {accr}, Precision: {precision}, Recall: {recall}")
    return (accr, precision, recall)

%matplotlib inline

# Data

In [105]:
train = pd.read_csv('../data/train.csv')
labels = train[['toxic','severe_toxic','obscene','threat','insult','identity_hate']]

In [106]:
random.seed(923)

X_train, X_test, y_train, y_test = train_test_split(train.comment_text,
                                                    labels, test_size = 0.25,random_state = 23)

X_train.reset_index(drop = True,inplace = True)
X_test.reset_index(drop = True,inplace = True)
y_train.reset_index(drop = True,inplace = True)
y_test.reset_index(drop = True,inplace = True)

## Pre Processing

In [107]:
VOCAB_SIZE = 5000
MAX_SEQUENCE_LENGTH = 150
# Create the tokenizer
t = Tokenizer()
# Fit the tokenizer on the documents
t.fit_on_texts(X_train)

"""
The word index for keras Tokenizer is ordered based on frequency. Therefore we can do the following according to
https://github.com/keras-team/keras/issues/8092
"""
t.oov_token = '_unknown_'
t.word_index = {e:i for e,i in t.word_index.items() if i <= VOCAB_SIZE} # <= because tokenizer is 1 indexed
t.word_index[t.oov_token] = VOCAB_SIZE + 1

"""
Apply the tokenizer
"""
encoded_docs = t.texts_to_sequences(X_train)

"""
Padd the sequences
"""
padded_docs = pad_sequences(encoded_docs, maxlen = MAX_SEQUENCE_LENGTH, padding = 'post')

print("Original text:")
print(X_train[0])
print("Vectorized:")
print(encoded_docs[0])
print("Padded:")
print(padded_docs[0])

Original text:
That's correct. A smaller diameter is held less securely by the sphincter, and is less efficient at allowing gas through.
Vectorized:
[203, 365, 5, 2894, 5001, 8, 1603, 460, 5001, 31, 1, 5001, 4, 8, 460, 5001, 34, 2946, 2853, 334]
Padded:
[ 203  365    5 2894 5001    8 1603  460 5001   31    1 5001    4    8
  460 5001   34 2946 2853  334    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0 

## Create Model

In [108]:
def create_model(vocab_size, num_labels, sequence_length):
    model = tf.keras.Sequential([
        layers.Input(shape=(sequence_length,)),
        layers.Embedding(input_dim=vocab_size, output_dim=64, input_length=sequence_length, mask_zero=True),
        layers.Conv1D(filters = 32, kernel_size = 5,padding = 'same'),
        layers.MaxPooling1D(pool_size = 5,strides = 1,padding = 'same'),
        layers.Dropout(0.4),
        layers.Bidirectional(layers.LSTM(64,return_sequences = False,dropout = 0.2,recurrent_dropout = 0.2)),
        layers.Dense(50,activation = 'relu'),
        layers.Dense(num_labels,activation = 'sigmoid')
    ])
    return model

In [109]:
model = create_model(len(t.word_index)+1, num_labels=6, sequence_length=MAX_SEQUENCE_LENGTH)
model.compile(
    loss=losses.binary_crossentropy,
    optimizer='adam',
    metrics=['accuracy']
)

In [110]:
"""
Defining early-stopping callback
"""
from keras.callbacks import EarlyStopping
# Define an early-stopping callback
cb = [EarlyStopping(monitor='val_loss',patience = 2)]

In [111]:
"""
Training the model
"""
batch_size = 32
num_epochs = 20

model.fit(padded_docs, y_train,validation_split=0.2,\
          batch_size=batch_size, epochs=num_epochs,verbose=1,callbacks =cb)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20


<keras.callbacks.History at 0x7f85bc7bea58>

## Scoring the model



In [112]:
encoded_doc_test = t.texts_to_sequences(X_test)
padded_doc_test = pad_sequences(encoded_doc_test, maxlen = MAX_SEQUENCE_LENGTH, padding = 'post')

In [113]:
pred = model.predict(padded_doc_test)



In [114]:
## Performances:
for i in range(6):
    y_pred_bin = [x>0.5 for x in pred[:,i]]
    print(f"\n{y_test.columns[i]}:")
    print(f"Benchmark: {1-np.mean(y_test.iloc[:,i])}")
    evaluate_classifier(y_pred_bin, y_test.iloc[:,i]);


toxic:
Benchmark: 0.9032161030757275
Accuracy: 0.9606447246384078, Precision: 0.674954674954675, Recall: 0.8921602191030469

severe_toxic:
Benchmark: 0.9897475747624896
Accuracy: 0.9897475747624896, Precision: 0.05378973105134474, Recall: 0.5

obscene:
Benchmark: 0.9477853257463715
Accuracy: 0.9808738375153536, Precision: 0.7542006721075372, Recall: 0.8622392974753018

threat:
Benchmark: 0.997217556964881
Accuracy: 0.997217556964881, Precision: 0.0, Recall: 0.0

insult:
Benchmark: 0.9508435063795654


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.9730027824430351, Precision: 0.6435492095869454, Recall: 0.7695121951219512

identity_hate:
Benchmark: 0.9909257263179004
Accuracy: 0.9909257263179004, Precision: 0.0, Recall: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
