In this notebook, we will find the optimal learning rate for the pooled RCNN model.

# 1. Loading Data

In [1]:
from IPython.display import clear_output

def hint(message):
    """
    Replace the current cell output with a single status message.

    Parameters
    ----------
    message : str
        Text printed after the previous output of the cell is cleared.
    """
    # wait=True defers the clear until the new message arrives, so the output
    # area is swapped in one step instead of briefly going blank.
    clear_output(wait=True)
    print(message)

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Read the train and test CSV files from the local data/ directory.
hint("loading data...")
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Names of the label columns selected from the training frame; the order here
# is also the column order of Y.
labels = [
    'toxic', 
    'severe_toxic', 
    'obscene', 
    'threat', 
    'insult', 
    'identity_hate'
]

# Label matrix: one row per training row, one column per entry in `labels`.
Y = train[labels].values

hint("Done")

Done


# 2. Pre-processing

## 2.1 Cleaning

Gather resources

In [3]:
from nltk.stem.wordnet import WordNetLemmatizer
import nltk

# Fetch the WordNet corpus through nltk's downloader.
nltk.download('wordnet')

# Shared lemmatizer instance; preprocess() calls lmtzr.lemmatize() on every token.
lmtzr = WordNetLemmatizer()
# Stopwords dropped by preprocess(). Stored as a frozenset so that the
# per-token membership test is a hash lookup rather than a linear scan.
# 'not' and 'no' are not part of this collection.
eng_stopwords = frozenset((
    'what', 'which', 'who', 'whom',
    'this', 'that', 'these', 'those',
    'am', 'is', 'are', 'was', 'were',
    'be', 'been', 'being',
    'have', 'has', 'had', 'having',
    'do', 'does', 'did', 'doing',
    'a', 'an', 'the',
    'and', 'but', 'if', 'or',
    'because', 'as', 'until', 'while',
    'of', 'at', 'by', 'for', 'with',
    'about', 'against', 'between',
    'into', 'through', 'during', 'before', 'after',
    'above', 'below', 'to', 'from',
    'up', 'down', 'in', 'out', 'on',
    'over', 'under', 'again', 'further',
    'then', 'once', 'here',
    'there', 'when', 'where', 'why',
    'how', 'all', 'any', 'both', 'each',
    'few', 'more', 'most', 'other', 'some',
    'such', 'nor', 'only',
    'own', 'same', 'so', 'than', 'too', 'very',
    'can', 'will', 'just', 'don', 'should', 'now'
))
# Contraction -> expansion table used by preprocess(). Keys are lowercase.
# Each key appears exactly once: a repeated key in a dict literal silently
# overrides the earlier entry. "i'd" expands to "I would", matching the other
# 'd entries (he'd, she'd, they'd, we'd, who'd, you'd).
appos = {
    "aren't" : "are not",
    "can't" : "cannot",
    "couldn't" : "could not",
    "didn't" : "did not",
    "doesn't" : "does not",
    "don't" : "do not",
    "hadn't" : "had not",
    "hasn't" : "has not",
    "haven't" : "have not",
    "he'd" : "he would",
    "he'll" : "he will",
    "he's" : "he is",
    "i'd" : "I would",
    "i'll" : "I will",
    "i'm" : "I am",
    "isn't" : "is not",
    "it's" : "it is",
    "it'll" : "it will",
    "i've" : "I have",
    "let's" : "let us",
    "mightn't" : "might not",
    "mustn't" : "must not",
    "shan't" : "shall not",
    "she'd" : "she would",
    "she'll" : "she will",
    "she's" : "she is",
    "shouldn't" : "should not",
    "that's" : "that is",
    "there's" : "there is",
    "they'd" : "they would",
    "they'll" : "they will",
    "they're" : "they are",
    "they've" : "they have",
    "we'd" : "we would",
    "we're" : "we are",
    "weren't" : "were not",
    "we've" : "we have",
    "what'll" : "what will",
    "what're" : "what are",
    "what's" : "what is",
    "what've" : "what have",
    "where's" : "where is",
    "who'd" : "who would",
    "who'll" : "who will",
    "who're" : "who are",
    "who's" : "who is",
    "who've" : "who have",
    "won't" : "will not",
    "wouldn't" : "would not",
    "you'd" : "you would",
    "you'll" : "you will",
    "you're" : "you are",
    "you've" : "you have",
    "'re": " are",
    "wasn't": "was not",
    # was " will", which dropped the subject of the contraction
    "we'll": "we will"
}

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ChuanLi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Define cleaning method

In [4]:
from nltk.tokenize import TweetTokenizer
import re

# NOTE(review): tkzr is not referenced by any cell visible in this notebook;
# preprocess() tokenizes with str.split() instead — confirm before removing.
tkzr = TweetTokenizer(preserve_case=False)

def preprocess(comment):
    """
    Clean one raw comment and return it as a single space-joined string.

    Steps, in order:
      1. replace URLs, IPv4 addresses and ``[[User...`` wiki links with
         placeholder tokens;
      2. turn every whitespace run (newlines, tabs, ...) into one space;
      3. delete every character that is not an ASCII letter, a digit,
         an apostrophe or a space, then lowercase the text;
      4. expand contractions found in ``appos``;
      5. drop tokens found in ``eng_stopwords``;
      6. lemmatize each remaining token as a verb with ``lmtzr``.

    Parameters
    ----------
    comment : str
        Raw comment text. A non-string value (for example the float NaN that
        pandas uses for a missing cell) is treated as an empty comment.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    # credit to the author of this post:
    # https://www.kaggle.com/jagangupta/stop-the-s-toxic-comments-eda

    # a missing cell is not a string and would make re.sub raise TypeError
    if not isinstance(comment, str):
        return ""

    # replace urls with a placeholder (dots escaped so that "www." / the
    # wikipedia host only match a literal dot)
    comment = re.sub(r'https?://en\.wikipedia\.org/[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]', ' speclinkaddress ', comment)
    comment = re.sub(r'https?://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]', ' speclinkaddress ', comment)
    comment = re.sub(r'www\.[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]', ' speclinkaddress ', comment)

    # replace IP addresses with a placeholder
    comment = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ' specipaddress ', comment)

    # replace user links with a placeholder; non-greedy so the match stops at
    # the first closing delimiter instead of swallowing the rest of the line
    comment = re.sub(r"\[\[User.*?\]", ' specusername ', comment)
    comment = re.sub(r"\[\[User.*?\|", ' specusername ', comment)

    # newlines/tabs become spaces first; otherwise the filter below deletes
    # them and glues the neighbouring words together
    comment = re.sub(r'\s+', ' ', comment)

    # remove special characters
    comment = re.sub(r'[^A-Za-z\d\' ]', '', comment)

    # the contraction and stopword tables are lowercase, so lowercase the
    # text before looking tokens up in them
    comment = comment.lower()

    # tokenization
    tokens = comment.split()

    # apostrophe (contraction) replacement
    tokens = [appos.get(token, token) for token in tokens]

    # remove stopwords
    tokens = [token for token in tokens if token not in eng_stopwords]

    # lemmatization (each token treated as a verb)
    tokens = [lmtzr.lemmatize(token, 'v') for token in tokens]

    return " ".join(tokens)

Cleaning

In [5]:
# Run the cleaning function over every comment of both frames; X holds the
# cleaned training text and X_ the cleaned test text.
hint("Cleaning train set...")
X = train['comment_text'].apply(preprocess)

hint("Cleaning test set...")
X_ = test['comment_text'].apply(preprocess)

hint("Done")

Done


## 2.2 Making Sequences

In [6]:
from keras.preprocessing import text as ktxt, sequence

# Upper bound handed to the tokenizer as num_words; get_model() also uses it
# as the input size of the Embedding layer.
vocab_max = 120000

hint("Fitting the tokenizer...")
# NOTE(review): Keras documents char_level=True as "every character is treated
# as a token"; a 120000-entry vocabulary and the word-level cleaning done
# earlier look mismatched with that — confirm this is intended.
tokenizer = ktxt.Tokenizer(num_words=vocab_max, char_level=True)
# The tokenizer is fitted on the cleaned training text only.
tokenizer.fit_on_texts(X)

hint("Tokenizing...")
# X and X_ are rebound here from cleaned text to integer sequences, so the
# cleaning cell has to be re-run before this cell can be run again.
X = tokenizer.texts_to_sequences(X)
X_ = tokenizer.texts_to_sequences(X_)

hint("Padding the sequences...")
max_comment_length = 512  # padded/cropped comment length
X = sequence.pad_sequences(X, maxlen=max_comment_length)
X_ = sequence.pad_sequences(X_, maxlen=max_comment_length)

hint("Done")

Done


# 3. Model and Training

## 3.1 Model Definition

In [7]:
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Embedding, CuDNNGRU, CuDNNLSTM, Flatten
from keras.layers import SpatialDropout1D, Dropout, TimeDistributed, Bidirectional
from keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.optimizers import Adam
from attention_decoder import AttentionDecoder
from capsulelayers import CapsuleLayer

def get_model():
    """
    Build the (uncompiled) pooled RCNN classifier.

    An embedding feeds four Conv1D -> MaxPooling1D -> SpatialDropout1D stages,
    followed by a bidirectional CuDNNGRU that returns its full sequence. That
    sequence is reduced by global average pooling and global max pooling; the
    two pooled vectors are concatenated and passed through a 256-unit dense
    layer to six sigmoid outputs.

    Reads the module-level ``max_comment_length`` and ``vocab_max``.

    Returns
    -------
    Model
        Model from the integer sequence input to the six sigmoid outputs.
    """
    # (number of filters, dropout rate applied after pooling) per conv stage
    conv_stages = ((64, 0.1), (128, 0.1), (256, 0.1), (512, 0.2))

    sequence_input = Input(shape=(max_comment_length, ))
    features = Embedding(vocab_max, 300)(sequence_input)
    features = SpatialDropout1D(0.2)(features)

    for n_filters, drop_rate in conv_stages:
        features = Conv1D(filters=n_filters, kernel_size=3, padding='same', activation='relu')(features)
        features = MaxPooling1D(pool_size=2)(features)
        features = SpatialDropout1D(drop_rate)(features)

    recurrent = Bidirectional(CuDNNGRU(64, return_sequences=True))(features)

    # Disabled alternative head (left commented out):
    #     x = Dropout(0.2)(x)
    #     x = CapsuleLayer(num_capsule=10, dim_capsule=16, routings=5)(x)
    #     x = Flatten()(x)

    pooled = concatenate([
        GlobalAveragePooling1D()(recurrent),
        GlobalMaxPooling1D()(recurrent),
    ])

    hidden = Dropout(0.3)(pooled)
    hidden = Dense(256, activation='relu')(hidden)
    hidden = Dropout(0.3)(hidden)
    preds = Dense(6, activation="sigmoid")(hidden)
    return Model(sequence_input, preds)

# Build the network once; the training and prediction cells below reuse this instance.
model = get_model() 
# Print the layer table of the model.
model.summary()

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 512)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 512, 300)     36000000    input_1[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDro (None, 512, 300)     0           embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 512, 64)      57664       spatial_dropout1d_1[0][0]        
__________________________

## 3.2 Training

In [8]:
from sklearn.utils.class_weight import compute_class_weight

def get_class_weight(label_index):
    """
    Return the training weight derived for one label column of ``Y``.

    The 'balanced' class weights of the column are computed with
    scikit-learn, the weight at index 1 is taken, and the result is
    ``2 * sqrt(weight)``.

    Parameters
    ----------
    label_index : int
        Column index into the module-level label matrix ``Y``.

    Returns
    -------
    float
        ``2 * balanced ** 0.5`` where ``balanced`` is the weight at index 1.
    """
    column = Y[:, label_index]
    # Keyword arguments: newer scikit-learn releases accept `classes` and `y`
    # by keyword only, and older releases accept the same names.
    balanced = compute_class_weight(
        class_weight='balanced',
        classes=np.unique(column),
        y=column
    )[1]
    # np.unique returns sorted values, so index 1 belongs to the second
    # smallest value present in the column (1 for a 0/1 column).
    return 2*balanced**0.5

# One weight per label column, in the same order as `labels`.
class_weights = [ get_class_weight(i) for i in range(len(labels)) ]

# Training hyper-parameters; lr is the learning rate handed to Adam below.
epochs = 12
batch_size = 64
lr = 0.0005

model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(lr=lr), 
    metrics=['accuracy']
)

# NOTE(review): class_weight receives a list of per-label floats, while Keras
# documents this argument as a dict mapping class indices to weights —
# confirm that these weights are actually applied during training.
# NOTE(review): no validation data is passed to fit(), so `history` only
# holds training metrics.
history = model.fit(
    X, Y, 
    epochs=epochs, 
    batch_size=batch_size,
    class_weight=class_weights
)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


# 4. Submission

In [9]:
hint("Making prediction...")
# Run the trained model on the padded test sequences; Y_ is later written out
# with one column per entry in `labels`.
Y_ = model.predict(X_)
hint("Done")

Done


In [10]:
import time

hint("Uploading...")

# The output file name carries a timestamp (taken when this cell runs).
timestamp = time.strftime("%Y%m%d-%H%M%S")
file_name = 'submission_' + timestamp + '.csv'

# Left part: the test ids; right part: one prediction column per label.
sumbit_id = pd.DataFrame({'id': test['id']})
sumbit_labels = pd.DataFrame(Y_, columns=labels)

# Place the two parts side by side and write them to CSV without the index.
submission = pd.concat([sumbit_id, sumbit_labels], axis=1)
submission.to_csv(file_name, index=False)

hint("Done")

Done
