In [0]:
from google.colab import drive

In [6]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from keras.models import Sequential
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense
from keras.layers import Activation
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
import time

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
drive.mount('/content/gdrive')

In [8]:
# load dataset
root_path = 'gdrive/My Drive/NLP/dataset/ToxicAnalysis/'
file = root_path + 'train.csv'
df = pd.read_csv(file)
# check for null values
is_null = df.columns[df.isnull().any()]
print(df[is_null].isnull().sum())
print(df.describe())
# extract text and labels
comments = df['comment_text'].values
labels = df.loc[:,['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values
size = int(len(comments) * 0.3)
comments = comments[:size]
labels = labels[:size,:]
comments.shape, labels.shape

Series([], dtype: float64)
               toxic   severe_toxic  ...         insult  identity_hate
count  159571.000000  159571.000000  ...  159571.000000  159571.000000
mean        0.095844       0.009996  ...       0.049364       0.008805
std         0.294379       0.099477  ...       0.216627       0.093420
min         0.000000       0.000000  ...       0.000000       0.000000
25%         0.000000       0.000000  ...       0.000000       0.000000
50%         0.000000       0.000000  ...       0.000000       0.000000
75%         0.000000       0.000000  ...       0.000000       0.000000
max         1.000000       1.000000  ...       1.000000       1.000000

[8 rows x 6 columns]


((47871,), (47871, 6))

In [0]:
# clean and preprocess text
def clean_doc(doc, remove_stop_words=True):
  cleaned_doc = list()
  stop_words = set(stopwords.words('english'))
  stemmer = PorterStemmer()
  for sentence in doc:
    token = text_to_word_sequence(sentence)
    table = str.maketrans('', '', string.punctuation)
    token = [word.translate(table) for word in token]
    token = [word for word in token if word.isalpha()]
    if remove_stop_words:
      token = [word for word in token if not word in stop_words]
    token = [word for word in token if len(word) > 1]
    token = [stemmer.stem(word) for word in token]
    token = ' '.join(token)
    cleaned_doc.append(token)
  return cleaned_doc   

In [0]:
""" 
this function splits data to return either all non-toxic samples
or samples identified in one or more toxic categories
"""
def get_toxic_data(x, y, for_toxic=True):
  label = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
  new_x = list()
  new_y = list()
  for idx in range(len(x)):
    if for_toxic and 1 in y[idx,:]:
      new_x.append(x[idx,:])
      new_y.append(y[idx,:])
    if not for_toxic and not 1 in y[idx,:]:
      new_x.append(x[idx,:])
      new_y.append(y[idx,:])
  return np.array(new_x), np.array(new_y)

# get sample count for each label
def get_count(y):
  label = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
  for idx in range(len(label)):
    count = [i for i in y[:,idx] if i == 1]
    print(f'{label[idx]} comments: {len(count)}')
  x, toxic = get_toxic_data(y, y)
  print(f'overall toxic comments: {len(toxic)}')
  print(f'overall non-toxic comments: {len(y) - len(toxic)}')

In [11]:
# call function to perform cleaning on text
train = clean_doc(comments)
train = np.array(train)
target = labels
train.shape, target.shape

((47871,), (47871, 6))

In [12]:
# tokenize text and generate word vocabulary/dictionary
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train)
encoded = tokenizer.texts_to_sequences(train)
vocab = tokenizer.word_index
max_length = max([len(s.split()) for s in train])
feature_vec = pad_sequences(encoded, maxlen=max_length, padding='post')
x_train, x_test, y_train, y_test = train_test_split(feature_vec, target, test_size=0.2, random_state=7)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((38296, 1250), (38296, 6), (9575, 1250), (9575, 6))

In [13]:
# this section provides information of number of comments for each individual target
print("Full Labels")
get_count(target)
print("\nTrain Labels")
get_count(y_train)
print("\nTest Label")
get_count(y_test)

Full Labels
toxic comments: 4692
severe_toxic comments: 502
obscene comments: 2537
threat comments: 162
insult comments: 2357
identity_hate comments: 426
overall toxic comments: 4950
overall non-toxic comments: 42921

Train Labels
toxic comments: 3726
severe_toxic comments: 392
obscene comments: 2001
threat comments: 129
insult comments: 1879
identity_hate comments: 342
overall toxic comments: 3928
overall non-toxic comments: 34368

Test Label
toxic comments: 966
severe_toxic comments: 110
obscene comments: 536
threat comments: 33
insult comments: 478
identity_hate comments: 84
overall toxic comments: 1022
overall non-toxic comments: 8553


In [0]:
# CNN model params
vocab_size = len(vocab) + 1
output_dim = 30
n_target = 6
num_epochs = 100
batch_size=256
activation = 'sigmoid'
loss = 'binary_crossentropy'
optimizer = 'adam'

# CNN model
model = Sequential()
model.add(Embedding(vocab_size, output_dim, input_length=max_length))
model.add(Conv1D(filters=128, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=128, kernel_size=7, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(300, activation='relu'))
model.add(Dense(n_target, activation=activation))
print(model.summary())

In [15]:
# train model on dataset
model.compile(loss=loss, optimizer=optimizer, metrics=['categorical_accuracy'])
model.fit(x_train, y_train, epochs=num_epochs, batch_size=batch_size, verbose=0)
loss, acc = model.evaluate(x_test, y_test, verbose=0)
print(f'Loss: {loss}  -  Acc: {acc}')



Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where








Loss: 0.2656205026998834  -  Acc: 0.9601044386422977


In [0]:
# this function generates an evaluation report for the model
def evaluate(x_test, y_test, model):
  label = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
  before = time.time()
  y_pred = model.predict(x_test)
  after = time.time()
  y_pred = np.round(y_pred)
  report = classification_report(y_test, y_pred, target_names=label)
  accuracy = accuracy_score(y_test, y_pred)
  num_pred = accuracy_score(y_test, y_pred, normalize=False)

  print(report)
  print(f'Accuracy: {accuracy * 100}%')
  print(f'Correctly Predicted: {num_pred}/{len(y_pred)}')
  print(f'Inference Time: {np.round(after - before, 2)} seconds')

In [17]:
print('Report for all samples with one or more toxic labels')
x_toxic, y_toxic = get_toxic_data(x_test, y_test, for_toxic=True)
evaluate(x_toxic, y_toxic, model)
print('\n')
print('Report for all non-toxic smaples')
x_non_toxic, y_non_toxic = get_toxic_data(x_test, y_test, for_toxic=False)
evaluate(x_non_toxic, y_non_toxic, model)
print('\n')
print('Report for all samples')
evaluate(x_test, y_test, model)

Report for all samples with one or more toxic labels
               precision    recall  f1-score   support

        toxic       0.97      0.68      0.80       966
 severe_toxic       0.45      0.35      0.39       110
      obscene       0.84      0.65      0.73       536
       threat       0.11      0.03      0.05        33
       insult       0.64      0.59      0.61       478
identity_hate       0.38      0.31      0.34        84

    micro avg       0.80      0.61      0.69      2207
    macro avg       0.57      0.43      0.49      2207
 weighted avg       0.80      0.61      0.69      2207
  samples avg       0.56      0.55      0.52      2207

Accuracy: 20.84148727984344%
Correctly Predicted: 213/1022
Inference Time: 0.16 seconds


Report for all non-toxic smaples
               precision    recall  f1-score   support

        toxic       0.00      0.00      0.00         0
 severe_toxic       0.00      0.00      0.00         0
      obscene       0.00      0.00      0.00      