Used https://github.com/wongchunghang/toxic-comment-challenge-lstm/blob/master/toxic_comment_9872_model.ipynb for preprocessing

In [33]:
import numpy as np
import pandas as pd
import string
import re

import gensim
from collections import Counter
import pickle


from sklearn.model_selection import train_test_split

from keras.models import Model
from keras.layers import Input, Dense, Dropout, Conv1D, Embedding, SpatialDropout1D, concatenate
from keras.layers import LSTM, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers import MaxPool1D, Concatenate, Flatten
from keras.preprocessing import text, sequence

from keras.callbacks import Callback
from keras import optimizers
from keras.layers import Lambda

import warnings
warnings.filterwarnings('ignore')

import os

from keras import backend as K

from unidecode import unidecode

import time

from IPython.display import Image
from keras.utils.vis_utils import plot_model

In [2]:
# Load data
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')


### Preprocessing
1. Remove non-ascii characters
2. Correct misspelling

In [3]:
special_character_removal=re.compile(r'[^a-z\?\!\#\@\%\* ]',re.IGNORECASE)
def clean_text(x):
    x_ascii = unidecode(x)
    x_clean = special_character_removal.sub('',x_ascii)
    return x_clean

train['clean_text'] = train['comment_text'].apply(lambda x: clean_text(str(x)))
test['clean_text'] = test['comment_text'].apply(lambda x: clean_text(str(x)))

In [66]:
train['Word Count'] =  train['clean_text'].apply(lambda x: len(x.split (' ')))

In [67]:
print ('Min:', train['Word Count'].min())
print ('Median:', train['Word Count'].median())
print ('Max:', train['Word Count'].max())


Min: 1
Median: 36.0
Max: 2273


In [4]:
X_train = train['clean_text']
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
X_test = test['clean_text']

In [68]:
# For best score (Public: 9869, Private: 9865), change to max_features = 283759, maxlen = 900
max_features = 40000 #top_words
maxlen = 500 #text_len

tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))
X_train_sequence = tokenizer.texts_to_sequences(X_train)
X_test_sequence = tokenizer.texts_to_sequences(X_test)

x_train = sequence.pad_sequences(X_train_sequence, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test_sequence, maxlen=maxlen)
print(len(tokenizer.word_index))

457142


In [76]:
tokenizer.word_counts[list(tokenizer.word_index.keys())[50000]]

7

Use fasttext embedding

In [6]:
# Load the FastText Web Crawl vectors
EMBEDDING_FILE_FASTTEXT="../data/fasttesxt/crawl-300d-2M.vec"
def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')
embeddings_index_ft = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(EMBEDDING_FILE_FASTTEXT,encoding='utf-8'))

In [7]:
spell_model = gensim.models.KeyedVectors.load_word2vec_format(EMBEDDING_FILE_FASTTEXT)

In [8]:
# This code is  based on: Spellchecker using Word2vec by CPMP
# https://www.kaggle.com/cpmpml/spell-checker-using-word2vec

words = spell_model.index2word

w_rank = {}
for i,word in enumerate(words):
    w_rank[word] = i

WORDS = w_rank

# Use fast text as vocabulary
def words(text): return re.findall(r'\w+', text.lower())

def P(word): 
    "Probability of `word`."
    # use inverse of rank as proxy
    # returns 0 if the word isn't in the dictionary
    return - WORDS.get(word, 0)

def correction(word): 
    "Most probable spelling correction for word."
    return max(candidates(word), key=P)

def candidates(word): 
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

def known(words): 
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)

def edits1(word):
    "All edits that are one edit away from `word`."
    letters    = 'abcdefghijklmnopqrstuvwxyz'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word): 
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

def singlify(word):
    return "".join([letter for i,letter in enumerate(word) if i == 0 or letter != word[i-1]])

In [69]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words,301))

something_ft = embeddings_index_ft.get("something")

something = np.zeros((301,))
something[:300,] = something_ft
something[300,] = 0

In [10]:
#remove html tag like stylebackgroundcolor, verticalaligntop

In [70]:
def all_caps(word):
    return len(word) > 1 and word.isupper()

def embed_word(embedding_matrix,i,word):
    embedding_vector_ft = embeddings_index_ft.get(word)
    if embedding_vector_ft is not None: 
        if all_caps(word):
            last_value = np.array([1])
        else:
            last_value = np.array([0])
        embedding_matrix[i,:300] = embedding_vector_ft
        embedding_matrix[i,300] = last_value

            
# Fasttext vector is used by itself if there is no glove vector but not the other way around.
for word, i in word_index.items():
    
    if i >= max_features: continue
        
    if embeddings_index_ft.get(word) is not None:
        embed_word(embedding_matrix,i,word)
    else:
        # change to > 20 for better score.
        if len(word) > 20:
            embedding_matrix[i] = something
        else:
            word2 = correction(word)
            if embeddings_index_ft.get(word2) is not None:
                embed_word(embedding_matrix,i,word2)
            else:
                word2 = correction(singlify(word))
                if embeddings_index_ft.get(word2) is not None:
                    embed_word(embedding_matrix,i,word2)
                else:
                    embedding_matrix[i] = something

In [71]:
K.clear_session()
def get_model(clipvalue=0.5,num_filters=40,dropout=0.5,embed_size=301):
    inp = Input(shape=(maxlen, ))
    
    # Layer 1: concatenated fasttext and glove twitter embeddings.
    embedding = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(inp)
    
    # Layer 2: SpatialDropout1D(0.5)
    x = SpatialDropout1D(dropout)(embedding)
    
    # Layer 3: Convolutional layer
    conv_0 = Conv1D(64, kernel_size=1, padding='valid', kernel_initializer='normal', activation='relu')(x)
    conv_1 = Conv1D(64, kernel_size=2, padding='valid', kernel_initializer='normal', activation='relu')(x)
    conv_2 = Conv1D(64, kernel_size=3, padding='valid', kernel_initializer='normal', activation='relu')(x)
    conv_3 = Conv1D(64, kernel_size=4, padding='valid', kernel_initializer='normal', activation='relu')(x)
    
    # Layer 4: max pooling (change to K-max pooling)
    maxpool_0 = MaxPool1D(pool_size=(maxlen - 1 + 1), strides=1, padding='valid')(conv_0)
    maxpool_1 = MaxPool1D(pool_size=(maxlen - 2 + 1), strides=1, padding='valid')(conv_1)
    maxpool_2 = MaxPool1D(pool_size=(maxlen - 3 + 1), strides=1, padding='valid')(conv_2)
    maxpool_3 = MaxPool1D(pool_size=(maxlen - 4 + 1), strides=1, padding='valid')(conv_3)
    
    # Layer 5: Concatenate
    concatenated_tensor = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2, maxpool_3])
    flatten = Flatten()(concatenated_tensor)
    
    # Layer 6: Dropout
    drop = Dropout(dropout)(flatten)
    
    # Layer 7: Dense layer
    dense_1 = Dense(units=20, activation='relu')(drop)
    
    # Layer 8: Dense layer
    output = Dense(units=6, activation='sigmoid')(dense_1)
    model = Model(inputs=[inp], outputs=output)
    
    # compile
    adam = optimizers.adam(clipvalue=clipvalue)
    model.compile(loss='binary_crossentropy',
                  optimizer=adam,
                  metrics=['accuracy'])
    return model

In [72]:
get_model = get_model()
get_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 500)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 500, 301)     12040000    input_1[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDro (None, 500, 301)     0           embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 500, 64)      19328       spatial_dropout1d_1[0][0]        
__________________________________________________________________________________________________
conv1d_2 (

In [35]:
# plot_model(get_model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)
# Image('model_plot.png')

In [73]:
# NN batch size
NN_batch_size = 512

# Number of NN epochs
NN_epochs = 10
get_model.fit(x_train, y_train,batch_size=NN_batch_size, epochs=NN_epochs, shuffle=True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fc3fb8993c8>

In [74]:
proba = get_model.predict(x_test)

In [75]:
# Create submission file
output=pd.DataFrame(data=proba, index=test["id"])
output.to_csv("./output/test_cnn_2.csv",header=["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
              ,index=True)