In this notebook, we will find the optimal learning rate for the pooled RCNN model.

# 1. Loading Data

In [1]:
from IPython.display import clear_output

def hint(message):
    """
    Erase the previous notebook output, then print `message`.

    Used throughout the notebook as a status line: each call first clears
    the current cell's output area and then prints the new text.

    Parameters
    ----------
    message : object
        Value handed to `print` after the output area has been cleared.
    """
    clear_output()
    print(message)

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Fixed seed: without it every "Restart & Run All" draws a different
# train/validation split, so none of the numbers below can be reproduced.
split_seed = 42

hint('loading data...')
train = pd.read_csv('data/train.csv')
# hold out 20% of the rows as the validation set
train, valid = train_test_split(train, test_size=0.2, random_state=split_seed)

# target columns read from the CSV
labels = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate'
]

# label matrices: one row per comment, one column per entry of `labels`
Ytr = train[labels].values
Yva = valid[labels].values

hint('Label distribution between training and validation set:')
# column means of every label in each split (the positive rate when the
# labels are 0/1), to check that the two splits look alike
class_proportion = pd.DataFrame({
    'label': labels,
    'train': train[labels].mean().values,
    'validation': valid[labels].mean().values,
})
print(class_proportion)

Label distribution between training and validation set:
           label     train  validation
0          toxic  0.096564    0.092966
1   severe_toxic  0.010270    0.008899
2        obscene  0.053166    0.052076
3         threat  0.003032    0.002851
4         insult  0.049798    0.047627
5  identity_hate  0.008750    0.009024


# 2. Pre-processing

## 2.1 Cleaning

Gather resources

In [3]:
from nltk.stem.wordnet import WordNetLemmatizer
import nltk

# Ask NLTK's downloader for the 'wordnet' package before the lemmatizer is built.
nltk.download('wordnet')

# Single lemmatizer instance shared by the cleaning code.
lmtzr = WordNetLemmatizer()
# Words dropped from every comment during cleaning. Kept as a tuple, in the
# same order as before; each row of the string below is one group of words.
eng_stopwords = tuple("""
    what which who whom
    this that these those
    am is are was were
    be been being
    have has had having
    do does did doing
    a an the
    and but if or
    because as until while
    of at by for with
    about against between
    into through during before after
    above below to from
    up down in out on
    over under again further
    then once here
    there when where why
    how all any both each
    few more most other some
    such nor only
    own same so than too very
    can will just don should now
""".split())
# Contraction -> expansion lookup used during cleaning. Keys are lowercase.
# Every key appears exactly once: a repeated key in a dict literal silently
# discards the earlier value.
appos = {
    "aren't" : "are not",
    "can't" : "cannot",
    "couldn't" : "could not",
    "didn't" : "did not",
    "doesn't" : "does not",
    "don't" : "do not",
    "hadn't" : "had not",
    "hasn't" : "has not",
    "haven't" : "have not",
    "he'd" : "he would",
    "he'll" : "he will",
    "he's" : "he is",
    # "i'd" is ambiguous (would / had); "would" matches every other "'d" entry
    "i'd" : "I would",
    "i'll" : "I will",
    "i'm" : "I am",
    "isn't" : "is not",
    "it's" : "it is",
    "it'll" : "it will",
    "i've" : "I have",
    "let's" : "let us",
    "mightn't" : "might not",
    "mustn't" : "must not",
    "shan't" : "shall not",
    "she'd" : "she would",
    "she'll" : "she will",
    "she's" : "she is",
    "shouldn't" : "should not",
    "that's" : "that is",
    "there's" : "there is",
    "they'd" : "they would",
    "they'll" : "they will",
    "they're" : "they are",
    "they've" : "they have",
    "we'd" : "we would",
    "we're" : "we are",
    "weren't" : "were not",
    "we've" : "we have",
    "what'll" : "what will",
    "what're" : "what are",
    "what's" : "what is",
    "what've" : "what have",
    "where's" : "where is",
    "who'd" : "who would",
    "who'll" : "who will",
    "who're" : "who are",
    "who's" : "who is",
    "who've" : "who have",
    "won't" : "will not",
    "wouldn't" : "would not",
    "you'd" : "you would",
    "you'll" : "you will",
    "you're" : "you are",
    "you've" : "you have",
    "'re": " are",
    "wasn't": "was not",
    # was " will", which dropped the subject
    "we'll": "we will"
}

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ChuanLi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Define cleaning method

In [4]:
from nltk.tokenize import TweetTokenizer
import re

# Kept so the name stays defined for other cells; preprocess() below splits
# on whitespace and does not use it.
tkzr = TweetTokenizer(preserve_case=False)

def preprocess(comment):
    """
    Clean one raw comment and return it as a space-joined string of tokens.

    Steps, in order:
      1. URLs, IP addresses and wiki user links are replaced by the
         placeholder words 'speclinkaddress', 'specipaddress', 'specusername'.
      2. The text is lower-cased so it can match the lowercase keys of
         `appos` and the lowercase entries of `eng_stopwords`.
      3. Every character that is not a letter, digit, apostrophe or
         whitespace is deleted.
      4. Contractions found in `appos` are expanded, and the result is split
         again so each expanded word is handled on its own.
      5. Words listed in `eng_stopwords` are dropped.
      6. Each remaining word is passed through `lmtzr.lemmatize(word, 'v')`.

    Parameters
    ----------
    comment : str
        Raw comment text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' when nothing is left).
    """
    # credit to the author of this post:
    # https://www.kaggle.com/jagangupta/stop-the-s-toxic-comments-eda

    # replace urls with a placeholder word
    comment = re.sub(r'https?://en.wikipedia.org/[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]', ' speclinkaddress ', comment)
    comment = re.sub(r'https?://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]', ' speclinkaddress ', comment)
    # the dot is escaped so that runs such as "wwwwww" are not taken for links
    comment = re.sub(r'www\.[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]', ' speclinkaddress ', comment)

    # replace IP addresses with a placeholder word
    comment = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ' specipaddress ', comment)

    # replace wiki user links with a placeholder word
    # (done before lower-casing: the patterns look for a capital "User")
    comment = re.sub(r"\[\[User.*\]", ' specusername ', comment)
    comment = re.sub(r"\[\[User.*\|", ' specusername ', comment)

    # lower-case so capitalised words ("Don't", "The") hit the lowercase
    # contraction and stop-word tables
    comment = comment.lower()

    # remove special characters; whitespace (newlines, tabs) is kept so that
    # words on neighbouring lines are not glued together
    comment = re.sub(r"[^A-Za-z\d'\s]", '', comment)

    # tokenization
    tokens = comment.split()

    # apostrophe replacement; re-split afterwards because an expansion such
    # as "do not" is two words, each of which must go through the next steps
    tokens = " ".join(appos.get(token, token) for token in tokens).split()

    # remove stopwords
    tokens = [token for token in tokens if token not in eng_stopwords]

    # lemmatization (tokens are treated as verbs)
    tokens = [lmtzr.lemmatize(token, 'v') for token in tokens]

    return " ".join(tokens)

Cleaning

In [5]:
# Run the cleaning function over the raw comment text of both splits.
hint('Cleaning train set...')
Xtr_text = train['comment_text'].apply(preprocess)
# this is the validation split, so the status message says so
hint('Cleaning validation set...')
Xva_text = valid['comment_text'].apply(preprocess)
hint('Done')

Done


## 2.2 Making Sequences

In [6]:
from keras.preprocessing import text as ktxt, sequence

# Value passed to the tokenizer as `num_words`; also reused later as the
# number of rows of the embedding matrix.
vocab_max = 100000

# The vocabulary is learned from the training text only ...
hint('Fitting the tokenizer...')
tokenizer = ktxt.Tokenizer(num_words=vocab_max)
tokenizer.fit_on_texts(Xtr_text)

# ... and then applied to both splits, turning each comment into a list of
# integer word indices.
hint('Tokenizing...')
Xtr = tokenizer.texts_to_sequences(Xtr_text)
Xva = tokenizer.texts_to_sequences(Xva_text)

# Bring every sequence to the same length. Xtr / Xva are rebound here from
# lists of index lists to the padded result.
hint('padding the sequences...')
max_comment_length = 200  # padded/cropped comment length
Xtr = sequence.pad_sequences(Xtr, maxlen=max_comment_length)
Xva = sequence.pad_sequences(Xva, maxlen=max_comment_length)

hint('Done')

Done


## 2.3 Prepare Embedding Matrix

In [7]:
import csv

emb_file = 'preembedding/glove.6B.300d.txt'

hint("Loading pre-embedding file...")
# One row per word: column 0 is the word (used as the index), the remaining
# columns are its vector.
#  - `sep` is passed by keyword; newer pandas rejects it as a positional
#    argument.
#  - keep_default_na=False stops pandas from turning words such as "null" or
#    "nan" into missing index labels.
emb = pd.read_table(
    emb_file,
    sep=" ",
    index_col=0,
    header=None,
    quoting=csv.QUOTE_NONE,
    keep_default_na=False,
)

hint("Preparing embedding matrix...")
embedding_dimension = 300
# Start every row from random noise drawn with the per-column mean and
# standard deviation of the pre-trained vectors, so words that are missing
# from the file still get a vector on the same scale.
embedding_matrix = np.random.normal(
    emb.mean(axis=0),
    emb.std(axis=0),
    (vocab_max, embedding_dimension)
)

hint("Done")

Done


In [8]:
hint("Constructing embedding matrix")
# Overwrite the random row of every in-vocabulary word that has a
# pre-trained vector; rows of words missing from `emb` keep their random
# initialisation.
for word, i in tokenizer.word_index.items():
    if i < vocab_max and word in emb.index:
        # `.values` replaces Series.as_matrix(), which no longer exists in
        # current pandas releases
        embedding_matrix[i] = emb.loc[word].values

hint("Done")

Done


In [9]:
# Drop the reference to the loaded embedding table; the vectors that are
# needed have already been copied into `embedding_matrix`.
del emb

# 3. Model and Training

## 3.1 Model Definition

In [12]:
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Embedding, CuDNNGRU, CuDNNLSTM, BatchNormalization
from keras.layers import SpatialDropout1D, Dropout, TimeDistributed, Bidirectional
from keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.optimizers import Adam
from attention_decoder import AttentionDecoder

def get_model(input_dim, embedding_matrix, lr):
    """
    Build and compile one single-output classifier.

    Layer stack: embedding -> spatial dropout -> Conv1D -> max pooling ->
    spatial dropout -> bidirectional GRU -> concatenated global average and
    global max pooling -> dropout -> one sigmoid unit.

    Parameters
    ----------
    input_dim : int
        Length of each input sequence.
    embedding_matrix : 2-D array
        Initial weights of the embedding layer. Its shape supplies the
        vocabulary size (rows) and the embedding width (columns).
    lr : float
        Learning rate handed to the Adam optimizer.

    Returns
    -------
    A Keras `Model` compiled with binary cross-entropy loss and the
    'accuracy' metric.
    """
    # Size the embedding layer from the matrix that initialises it instead of
    # from notebook-level globals, so the layer and its weights cannot
    # disagree.
    vocab_size, emb_dim = embedding_matrix.shape

    sequence_input = Input(shape=(input_dim, ))
    x = Embedding(vocab_size, emb_dim, weights=[embedding_matrix])(sequence_input)
    x = SpatialDropout1D(0.5)(x)
    x = Conv1D(filters=256, kernel_size=3, padding='same', activation='relu')(x)
    x = MaxPooling1D(pool_size=2)(x)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
    # summarise the recurrent output sequence in two ways and use both
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    x = concatenate([avg_pool, max_pool])
    x = Dropout(0.2)(x)
    preds = Dense(1, activation="sigmoid")(x)
    model = Model(sequence_input, preds)
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(lr=lr),
        metrics=['accuracy']
    )
    return model

# Learning rate under test in this notebook.
learning_rate = 0.0004

# One independent model per label; the count follows `labels` instead of a
# hard-coded 6, so the two cannot drift apart.
models = [ get_model(max_comment_length, embedding_matrix, learning_rate) for _ in labels ]
print("Models ready")

Models ready


In [13]:
from sklearn.utils import class_weight

# One 'balanced' weight array per label column of Ytr. Each array is ordered
# like np.unique(column), i.e. by ascending class value.
# `classes` and `y` are passed by keyword because current scikit-learn
# releases no longer accept them positionally.
class_weights = [
    class_weight.compute_class_weight(
        'balanced',
        classes=np.unique(Ytr[:, i]),
        y=Ytr[:, i]
    )
    for i in range(Ytr.shape[1])
]
print("Class weight matrix: ")
class_weights

Class weight matrix: 


[array([0.55344276, 5.17790217]),
 array([ 0.50518818, 48.68649886]),
 array([0.52807585, 9.40444968]),
 array([  0.5015204 , 164.93023256]),
 array([ 0.52620384, 10.04058518]),
 array([ 0.50441366, 57.14234557])]

## 3.2 Training

In [None]:
from keras.callbacks import EarlyStopping

epochs = 10
batch_size = 64
# Two stopping rules: one watches the validation loss (no `patience` given,
# so the library default applies), the other watches the training loss with
# a patience of one epoch.
val_loss_estop = EarlyStopping(monitor='val_loss')
loss_estop = EarlyStopping(monitor='loss', patience=1)
callback_list = [loss_estop, val_loss_estop]

# Train models[i] on column i of the label matrix and keep the object
# returned by each fit() call.
history = []
for i, label in enumerate(labels):
    print('Fitting class \"%s\"' % label)
    history.append(models[i].fit(
        Xtr, Ytr[:, i],
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(Xva, Yva[:, i]),
        callbacks=callback_list,
        # Keras documents `class_weight` as a dict {class index: weight}; a
        # bare array is not a supported value. The array is ordered by
        # ascending class value, so position k is the weight of class k
        # (assumes the label values are exactly 0 and 1 - verify).
        class_weight=dict(enumerate(class_weights[i]))
    ))
    print(' ')

Fitting class "toxic"
Train on 127656 samples, validate on 31915 samples
Epoch 1/10
Epoch 2/10

## 3.3 Prediction

In [None]:
# Gather the validation predictions of every model in one matrix:
# one row per validation sample, column i filled by models[i].
n_valid = Yva.shape[0]
Yva_ = np.zeros((n_valid, len(labels)))
for i, model in enumerate(models):
    scores = model.predict(Xva)
    # keep the first (index 0) output column of the prediction
    Yva_[:, i] = scores[:, 0]

# 4. Analysis

In [None]:
from sklearn import metrics

# Report the area under the ROC curve separately for every label, comparing
# column i of the true labels with column i of the predictions.
for i, label in enumerate(labels):
    y_true = Yva[:, i]
    y_score = Yva_[:, i]
    fpr, tpr, th = metrics.roc_curve(y_true, y_score, pos_label=1)
    auc_value = metrics.auc(fpr, tpr)
    print("AUC for label %s: %.4f" % (label, auc_value))