# 0. Import Dependencies

In [None]:
import numpy as np
import pandas as pd
import keras
from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout, Lambda
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
from tqdm.notebook import tqdm

# 1. Data Preprocessing

In [None]:
# Load the training dataset

data_path = 'new_train.csv'
data_raw = pd.read_csv(data_path)

data_raw = data_raw.sample(frac=1, replace=False)

print("Number of rows in data =",data_raw.shape[0])
print("Number of columns in data =",data_raw.shape[1])
print("\n")
print("**Sample data:**")
data_raw.drop(columns='index', inplace=True)
data_raw.head()

In [None]:
# Generate a summary column "category".  The column contains "1" if the comment is labeled at least once.
# Otherwise, the column will take on a value of "0".

data_raw["category"] = data_raw.iloc[:,2:].sum(axis=1)
data_raw["category"] = data_raw["category"]/data_raw["category"]
data_raw.fillna(0, inplace=True)
data_raw.toxicity = data_raw.category.astype(int)
print("Total number of labeled comments is %d." %data_raw.category.sum())

In [None]:
data_raw.head()

In [5]:
data = data_raw

In [6]:
# Data Pre-processing

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re
import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

def cleanHtml(sentence):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', str(sentence))
    return cleantext

def cleanPunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n"," ")
    return cleaned

def keepAlpha(sentence):
    alpha_sent = ""
    for word in sentence.split():
        alpha_word = re.sub('[^a-z A-Z]+', ' ', word)
        alpha_sent += alpha_word
        alpha_sent += " "
    alpha_sent = alpha_sent.strip()
    return alpha_sent

data['comment_text'] = data['comment_text'].str.lower()
data['comment_text'] = data['comment_text'].apply(cleanHtml)
data['comment_text'] = data['comment_text'].apply(cleanPunc)
data['comment_text'] = data['comment_text'].apply(keepAlpha)

In [None]:
# Retain relevant columns from the preprocessed dataset.
data = data[['id', 'comment_text', 'category']]

# Replace values in the column 'category' by {0: non-toxic, 1: toxic}.
data.loc[data.category == 0, 'category'] = 'non-toxic'
data.loc[data.category == 1, 'category'] = 'toxic'

# Replace index in-place by the 'id' column.
data.set_index('id', inplace=True)

In [None]:
data.head()

In [None]:
data.category.value_counts()

In [None]:
possible_labels = data.category.unique()
possible_labels

In [11]:
label_dict = {}

for index, possible_labels in enumerate(possible_labels):
    label_dict[possible_labels] = index


label_dict

In [None]:
data['label'] = data.category.replace(label_dict)
data.head()

In [None]:
data['non-toxic'] = 1 - data.label
data['toxic'] =  data.label
data.tail(20)

# 2. Train Split Data

In [16]:
max_feature = 20000 # maximum number of different words to embed
maxlen = 100 # maximum length in each comment_text.  Set to 200 for comparison with GLoVe and BERT.

X = data['comment_text'].values  # There should be NO na's.
list_classes = ['toxic', 'non-toxic']
y = data['label'].values

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 42,
    stratify = data.label.values
)

# 3. Embedding Function

In [19]:
# Creating embedding using tensorflow hub

import tensorflow_hub as hub
import tensorflow.compat.v1 as tf
#To make tf 2.0 compatible with tf1.0 code, we disable the tf2.0 functionalities
tf.disable_eager_execution()

In [20]:
elmo = hub.Module('https://tfhub.dev/google/elmo/2', trainable=True)

In [21]:
def ELMoEmbedding(x):
    # return elmo(tf.squeeze(tf.cast(x, tf.string)), signature="default", as_dict=True)["default"]
        return elmo(tf.reshape(tf.cast(x, tf.string), [-1]), signature="default", as_dict=True)["default"]

# 4. Build Model and Defining Metrics

In [None]:
def build_model(): 
    input_text = Input(shape=(1,), dtype="string")
    embedding = Lambda(ELMoEmbedding, output_shape=(1024, ))(input_text)
    dense = Dense(10, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001))(embedding)
    pred = Dense(1, activation='sigmoid')(dense)
    model = Model(inputs=[input_text], outputs=pred)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
model = build_model()

In [None]:
model.summary()

In [28]:
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [None]:
model.summary()

# 5. Train Model

In [31]:
gpu_options = tf.GPUOptions(allow_growth=True)
# session = tf.InteractiveSession(config=tf.ConfigProto(gpu_options=gpu_options))

In [None]:
from tensorflow.compat.v1.keras import backend as K

from keras.callbacks import ModelCheckpoint

batch = 16
epoch = 10

with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as session:
    K.set_session(session)
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())

    checkpoint = ModelCheckpoint(
        "ELMo _Weights_{epoch:02d}_{val_loss:.2f}.hdf5",
        monitor='val_loss',
        verbose=1,
        save_best_only=False,
        mode='auto',
        save_weights_only=True,
        period=1)

    history = model.fit(
        X_train,
        y_train,
        epochs=epoch,
        batch_size=batch,
        validation_data=(X_val, y_val),
        verbose=1,
        callbacks=[checkpoint])
    
    model.save_weights('ELMo_Weights.h5')

# 6. Pull and Save Training History

In [None]:
history = pd.DataFrame(history.history)

In [None]:
history['epoch'] = [i + 1 for i in range(epoch)]

In [None]:
history

In [None]:
history.plot(x='epoch', y=['loss', 'val_loss'], figsize=(10,5), grid=True);
# history.plot(x='epoch', y=['val_f1_m'], figsize=(10,5), grid=True);

In [None]:
history.to_csv('ELMo.csv', index=False)

# 7. Load and Evaluate Model

In [None]:
# Load the test dataset

data_path = 'new_test.csv'
data_raw = pd.read_csv(data_path)

print("Number of rows in data =",data_raw.shape[0])
print("Number of columns in data =",data_raw.shape[1])
print("\n")
print("**Sample data:**")
data_raw.head()

In [None]:
# Generate a summary column "category".  The column contains "1" if the comment is labeled at least once.
# Otherwise, the column will take on a value of "0".

data_raw["category"] = data_raw.iloc[:,2:8].sum(axis=1)
data_raw["category"] = data_raw["category"]/data_raw["category"]
data_raw.fillna(0, inplace=True)
data_raw.toxicity = data_raw.category.astype(int)
print("Total number of labeled comments is %d." %data_raw.category.sum())

In [None]:
test_data = data_raw

In [None]:
test_data['comment_text'] = test_data['comment_text'].str.lower()
test_data['comment_text'] = test_data['comment_text'].apply(cleanHtml)
test_data['comment_text'] = test_data['comment_text'].apply(cleanPunc)
test_data['comment_text'] = test_data['comment_text'].apply(keepAlpha)

In [None]:
# Retain relevant columns from the preprocessed dataset.
test_data = test_data[['id', 'comment_text', 'category']]

# Replace values in the column 'category' by {0: non-toxic, 1: toxic}.
test_data.loc[test_data.category == 0, 'category'] = 'non-toxic'
test_data.loc[test_data.category == 1, 'category'] = 'toxic'

# Replace index in-place by the 'id' column.
test_data.set_index('id', inplace=True)

In [None]:
test_data['label'] = test_data.category.replace(label_dict)

In [None]:
test_data['non-toxic'] = 1 - test_data.label
test_data['toxic'] =  test_data.label
test_data.tail(20)

In [None]:
X_test = test_data['comment_text'].values  # There should be NO na's.
list_classes = ['toxic', 'non-toxic']
y_test = test_data['label'].values

In [None]:
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [None]:
def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    # preds_flat = np.argmax(preds, axis=1).flatten()
    # labels_flat = np.argmax(labels, axis=1).flatten()

    preds_flat = preds
    labels_flat = labels
    
    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [None]:
# new_model = Model(inputs=[input_text], outputs=pred)

new_model = build_model()

new_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', f1_m])

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [None]:
from tensorflow.compat.v1.keras import backend as K

session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

K.set_session(session)
session.run(tf.global_variables_initializer())
session.run(tf.tables_initializer())

new_model.load_weights('/content/ELMo _Weights_05_0.51.hdf5')

In [None]:
preds = new_model.predict(X_val, batch_size=16, verbose=1)

In [None]:
# prediction_results = pd.DataFrame(preds)
# prediction_results.to_csv('ELMo_Preds_Results.csv', index=False)

In [None]:
import copy

p = copy.copy(preds)
p = np.squeeze(p)
pp = np.where(p >= 0.5, 0, 1)
# pp = np.where(p >= 0.5, 1, 0)
results = pd.DataFrame({'pred':pp, 'true':y_val})
nt = results[results.true == 0]
t = results[results.true == 1]

In [None]:
nt = results[results.true == 0]
t = results[results.true == 1]

In [None]:
sum(nt['pred'] == nt['true'])

235

In [None]:
sum(t['pred'] == t['true'])

717

In [None]:
from sklearn.metrics import f1_score

f1_score(y_val, pp, average='weighted')

0.15723509872392136