This notebook produces a submission file for the Kaggle competition. Exploratory analysis steps are skipped.

# 1. Loading Data

In [1]:
from IPython.display import clear_output

def hint(message):
    """
    Replace the current cell output with a single status message.

    Parameters
    ----------
    message : object
        Value handed to ``print`` after the previous output is cleared.
    """
    # wait=True postpones the clear until the next output arrives, so the
    # old text is swapped for the new one instead of the area going blank
    # first (which makes the cell flicker and jump).
    clear_output(wait=True)
    print(message)

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Announce the step, then read the two competition CSV files (train first).
hint("loading data...")
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

# Target column names; their order fixes the column order of Y.
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Label columns of the training frame as a plain NumPy array.
Y = train[labels].values

hint("Done")

Done


# 2. Pre-processing

## 2.1 Cleaning

Gather resources

In [10]:
from nltk.stem.wordnet import WordNetLemmatizer
import nltk

# Fetch the WordNet corpus through NLTK's downloader (the recorded output of
# this cell reports the package as already up-to-date on the author's machine).
nltk.download('wordnet')

# Shared lemmatizer instance; preprocess() calls lmtzr.lemmatize(token, 'v').
lmtzr = WordNetLemmatizer()
# Words dropped from every comment by preprocess().
# Stored as a frozenset because the only use is a membership test per token:
# a set lookup is constant-time, whereas the previous tuple was scanned
# linearly for every token of every comment.
eng_stopwords = frozenset((
    'what', 'which', 'who', 'whom',
    'this', 'that', 'these', 'those',
    'am', 'is', 'are', 'was', 'were',
    'be', 'been', 'being',
    'have', 'has', 'had', 'having',
    'do', 'does', 'did', 'doing',
    'a', 'an', 'the',
    'and', 'but', 'if', 'or',
    'because', 'as', 'until', 'while',
    'of', 'at', 'by', 'for', 'with',
    'about', 'against', 'between',
    'into', 'through', 'during', 'before', 'after',
    'above', 'below', 'to', 'from',
    'up', 'down', 'in', 'out', 'on', 'off',
    'over', 'under', 'again', 'further',
    'then', 'once', 'here',
    'there', 'when', 'where', 'why',
    'how', 'all', 'any', 'both', 'each',
    'few', 'more', 'most', 'other', 'some',
    'such', 'no', 'nor', 'not', 'only',
    'own', 'same', 'so', 'than', 'too', 'very',
    'can', 'will', 'just', 'don', 'should', 'now'
))
# Contraction -> expansion table, looked up by exact (lower-cased) token in
# preprocess().
# Fixes relative to the earlier table:
#   * "we'll" expanded to " will" (the pronoun was lost); now "we will".
#   * "i'd" was listed twice ("I would", then "I had"), so the second entry
#     silently won; a single "I would" is kept, matching every other "'d" entry.
#   * "didn't" was listed twice with the same value; the duplicate is removed.
appos = {
    "aren't": "are not",
    "can't": "cannot",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "i'd": "I would",
    "i'll": "I will",
    "i'm": "I am",
    "isn't": "is not",
    "it's": "it is",
    "it'll": "it will",
    "i've": "I have",
    "let's": "let us",
    "mightn't": "might not",
    "mustn't": "must not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "we'd": "we would",
    "we're": "we are",
    "weren't": "were not",
    "we've": "we have",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where's": "where is",
    "who'd": "who would",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
    "'re": " are",
    "wasn't": "was not",
    "we'll": "we will",
}

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ChuanLi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Define cleaning method

In [11]:
from nltk.tokenize import TweetTokenizer
import re

# Tokenizer shared by every preprocess() call; preserve_case=False makes it
# return lower-cased tokens.
tkzr = TweetTokenizer(preserve_case=False)

def preprocess(comment):
    """
    Normalise one raw comment into a space-separated string of tokens.

    Steps, in order:
      1. turn newlines/tabs into spaces;
      2. replace IPv4-looking numbers and ``[[User...`` wiki links with the
         placeholder words ``specipaddress`` / ``specusername``;
      3. tokenize (lower-cased);
      4. expand contractions found in ``appos`` into their individual words;
      5. drop tokens listed in ``eng_stopwords``;
      6. lemmatize the remaining tokens with part-of-speech ``'v'``.

    Parameters
    ----------
    comment : str
        Raw comment text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # credit to the author of this post:
    # https://www.kaggle.com/jagangupta/stop-the-s-toxic-comments-eda

    # Replace any run of newlines/tabs with one space. The old pattern
    # '\n\t' only matched a newline immediately followed by a tab, and
    # replacing with '' would have glued the neighbouring words together.
    comment = re.sub(r'[\n\t]+', ' ', comment)

    # Replace IP addresses with a placeholder word.
    comment = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ' specipaddress ', comment)

    # Replace user links with a placeholder word. The match is non-greedy so
    # that text between two links on the same line is no longer swallowed.
    # First pass: links closed by ']' / ']]'; second pass: whatever is left
    # that ends at a '|'.
    comment = re.sub(r'\[\[User.*?\]\]?', ' specusername ', comment)
    comment = re.sub(r'\[\[User.*?\|', ' specusername ', comment)

    # Tokenization (lower-cased by tkzr).
    tokens = tkzr.tokenize(comment)

    # Apostrophe replacement. Each expansion is split into separate,
    # lower-cased words so that the stop-word filter and the lemmatizer below
    # see "are" and "not" rather than the single string "are not".
    expanded = []
    for token in tokens:
        if token in appos:
            expanded.extend(appos[token].lower().split())
        else:
            expanded.append(token)

    # Remove stop words.
    kept = [token for token in expanded if token not in eng_stopwords]

    # Lemmatization (not stemming): WordNet lemmatizer with pos 'v'.
    lemmas = [lmtzr.lemmatize(token, 'v') for token in kept]

    return " ".join(lemmas)

Cleaning

In [15]:
# Apply preprocess() to every comment: training frame first, then the test
# frame. The function is passed directly; no wrapper lambda is needed.
hint("Cleaning train set...")
X = train['comment_text'].apply(preprocess)

hint("Cleaning test set...")
X_ = test['comment_text'].apply(preprocess)

hint("Done")

Done


## 2.2 Making Sequences

In [16]:
from keras.preprocessing import text as ktxt, sequence

# Sizes reused by later cells (embedding matrix and model definition).
vocab_max = 100000
max_comment_length = 200  # padded/cropped comment length

# The word index is fitted on the cleaned training text only.
hint("Fitting the tokenizer...")
tokenizer = ktxt.Tokenizer(num_words=vocab_max)
tokenizer.fit_on_texts(X)

# X and X_ are rebound from cleaned text to integer sequences and then to
# padded arrays, so this cell expects the output of the cleaning cell and
# should not be re-run on its own result.
hint("Tokenizing...")
X, X_ = (tokenizer.texts_to_sequences(texts) for texts in (X, X_))

hint("Padding the sequences...")
X, X_ = (
    sequence.pad_sequences(sequences, maxlen=max_comment_length)
    for sequences in (X, X_)
)

hint("Done")

Done


## 2.3 Prepare Embedding Matrix

In [41]:
import csv

emb_file = 'preembedding/glove.6B.300d.txt'

hint("Loading pre-embedding file...")
# Space-separated text file with no header: column 0 becomes the index (the
# word), the remaining columns are the vector components. QUOTE_NONE makes
# quote characters in the file be read literally.
# `sep` is passed by keyword: newer pandas releases no longer accept it as a
# positional argument.
emb = pd.read_table(emb_file, sep=" ", index_col=0, header=None, quoting=csv.QUOTE_NONE)

hint("Preparing embedding matrix...")
embedding_dimension = 300
# Fail with a clear message if the file does not hold vectors of the expected
# width, instead of a broadcasting error from np.random.normal below.
assert emb.shape[1] == embedding_dimension, (
    "embedding file has %d columns per word, expected %d"
    % (emb.shape[1], embedding_dimension)
)

# Start every row from random values drawn per dimension with the mean and
# standard deviation of the loaded vectors; rows for known words are
# overwritten with their real vectors in the next cell.
embedding_matrix = np.random.normal(
    emb.mean(axis=0),
    emb.std(axis=0),
    (vocab_max, embedding_dimension)
)

hint("Done")

Done


In [42]:
from tqdm import tqdm

hint("Constructing embedding matrix")
# Copy the pre-trained vector of every tokenizer word that (a) falls inside
# the vocab_max rows of the matrix and (b) exists in the loaded embedding
# table; all other rows keep their random initialisation.
for word, i in tqdm(tokenizer.word_index.items()):
    if i < vocab_max and word in emb.index:
        # .values instead of .as_matrix(): as_matrix() was removed from
        # pandas in 1.0, while .values works on old and new versions alike.
        embedding_matrix[i] = emb.loc[word].values

hint("Done")

Done


In [43]:
# Drop the reference to the loaded embedding table now that embedding_matrix
# has been filled — presumably so its memory can be reclaimed before training.
emb = None

# 3. Model

## 3.1 Model Definition

In [48]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, GRU, SpatialDropout1D 
from keras.layers import Bidirectional, GlobalMaxPooling1D
from keras.layers.convolutional import Conv1D, MaxPooling1D

# Layer stack, listed from input to output. The embedding layer is
# initialised from embedding_matrix; the final Dense layer has one sigmoid
# unit per entry in `labels`.
model = Sequential([
    Embedding(
        vocab_max,
        embedding_dimension,
        weights=[embedding_matrix],
        input_length=max_comment_length
    ),
    SpatialDropout1D(0.5),
    Conv1D(filters=256, kernel_size=3, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2),
    SpatialDropout1D(0.2),
    Bidirectional(GRU(units=64, return_sequences=True)),
    GlobalMaxPooling1D(),
    Dense(len(labels), activation='sigmoid'),
])

# Binary cross-entropy over the sigmoid outputs, optimised with Adam.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 200, 300)          30000000  
_________________________________________________________________
spatial_dropout1d_3 (Spatial (None, 200, 300)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 200, 256)          230656    
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 100, 256)          0         
_________________________________________________________________
spatial_dropout1d_4 (Spatial (None, 100, 256)          0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 100, 128)          123264    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
__________

## 3.2 Training

In [49]:
# Training hyper-parameters handed to model.fit() further down in this cell.
epochs = 2
batch_size = 64

def get_class_weight(x, k=100, scale=3.32):
    """
    Compute ``scale * ln(k / x + 1)``.

    For positive ``x`` the result is positive and decreases as ``x`` grows,
    so smaller inputs receive larger weights.

    Parameters
    ----------
    x : number
        Value to turn into a weight; must be non-zero.
        NOTE(review): presumably a per-label frequency — confirm the unit.
    k : number, optional
        Numerator of the ratio ``k / x``. Defaults to 100, the value that
        used to be hard-coded.
    scale : number, optional
        Multiplier applied to the natural logarithm. Defaults to 3.32, the
        value that used to be hard-coded.

    Returns
    -------
    float
        The computed weight.
    """
    return scale * np.log(k / x + 1)
    

# One weight per key 0..5, each derived from a hand-picked input value.
# NOTE(review): the keys presumably follow the order of `labels`, and Y holds
# one column per label (multi-label). Confirm that Keras applies class_weight
# per label column the way this cell intends.
weight_inputs = (98, 10, 53, 2, 49, 8)
class_weights = {
    position: get_class_weight(value)
    for position, value in enumerate(weight_inputs)
}

history = model.fit(
    X,
    Y,
    batch_size=batch_size,
    epochs=epochs,
    class_weight=class_weights,
    verbose=2,
)

Epoch 1/2
 - 393s - loss: 0.1449 - acc: 0.9792
Epoch 2/2
 - 393s - loss: 0.1092 - acc: 0.9831


# 4. Submission

In [50]:
# Score the padded test sequences with the trained model. Given the final
# Dense layer of the model, each row of Y_ holds one sigmoid output per label.
hint("Making prediction...")
Y_ = model.predict(X_)
hint("Done")

Done


In [51]:
import time

# The status text says "Uploading", but this cell only writes the CSV to the
# working directory.
hint("Uploading...")

# Timestamp (second resolution) keeps the file names of separate runs apart.
timestamp = time.strftime("%Y%m%d-%H%M%S")
file_name = 'submission_' + timestamp + '.csv'

# Id column from the test frame, side by side with one prediction column per
# label, in the order given by `labels`.
submission = pd.concat(
    [
        pd.DataFrame({'id': test['id']}),
        pd.DataFrame(Y_, columns=labels),
    ],
    axis=1,
)
submission.to_csv(file_name, index=False)

hint("Done")

Done
