In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os

# Show what is available under the read-only input directory.
input_entries = os.listdir("../input")
print(input_entries)

# Any results you write to the current directory are saved as output.

In [None]:
import os

# List the datasets mounted under the input directory.
input_dir_listing = os.listdir("../input")
print(input_dir_listing)


In [None]:
import numpy as np
import pandas as pd
import string
import re

from collections import Counter
import pickle


from sklearn.model_selection import train_test_split

from keras.models import Model
from keras.layers import Input, Dense, Dropout, Conv1D, Embedding, SpatialDropout1D, concatenate
from keras.layers import LSTM, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers import MaxPool1D, Concatenate, Flatten
from keras.preprocessing import text, sequence

from keras.callbacks import Callback
from keras import optimizers
from keras.layers import Lambda

import warnings
warnings.filterwarnings('ignore')

import os

from keras import backend as K
from unidecode import unidecode
import time

In [None]:
import logging
from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback

class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC score on a fixed validation set.

    Args:
        validation_data: ``(X_val, y_val)`` pair. It is unpacked in
            ``__init__``, so a 2-tuple must be supplied; the default empty
            tuple raises ``ValueError`` on unpacking.
        interval: the score is computed on epochs whose zero-based index is
            a multiple of this value.
    """

    def __init__(self, validation_data=(), interval=1):
        # The previous super(Callback, self) call started the MRO lookup
        # *after* Callback and therefore skipped Callback.__init__ entirely.
        super().__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation inputs and print the ROC-AUC.

        ``epoch`` is zero-based; the printed epoch number is ``epoch + 1``.
        ``logs`` is accepted for interface compatibility and is not read.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch+1, score))


In [None]:
# Load the train and test CSVs from the competition input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/jigsaw-toxic-comment-classification-challenge/train.csv',
        '../input/jigsaw-toxic-comment-classification-challenge/test.csv',
    )
)


## Preprocessing
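1. Transliterate non-ASCII characters to ASCII, then strip every character except letters, spaces and `? ! # @ % *`
2. Correct misspellings (not performed by the cleaning code in this notebook)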

In [None]:
# Matches every character that is NOT a letter a-z (case-insensitive), a space,
# or one of the punctuation marks ? ! # @ % *
special_character_removal = re.compile(r'[^a-z\?\!\#\@\%\* ]', re.IGNORECASE)


def clean_text(x):
    """Return ``x`` transliterated with ``unidecode`` and stripped of unwanted characters.

    Characters matched by ``special_character_removal`` are deleted outright
    (replaced with the empty string, not with a space).
    """
    return special_character_removal.sub('', unidecode(x))

# Add a cleaned copy of every comment to both frames. str() coerces each cell
# to a string before it reaches clean_text.
for frame in (train, test):
    frame['clean_text'] = frame['comment_text'].apply(lambda raw: clean_text(str(raw)))

In [None]:
# Inputs are the cleaned comment strings; targets are the six label columns
# taken from the training frame as a NumPy array.
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

X_train = train['clean_text']
X_test = test['clean_text']
y_train = train[label_columns].values

In [None]:
max_features = 50000  # vocabulary cap handed to the tokenizer as num_words
maxlen = 900  # length every sequence is padded to by pad_sequences

tokenizer = text.Tokenizer(num_words=max_features)

# The vocabulary is fitted on the train and test text together.
combined_corpus = list(X_train) + list(X_test)
tokenizer.fit_on_texts(combined_corpus)

# Text -> integer id sequences -> fixed-length arrays.
X_train_sequence = tokenizer.texts_to_sequences(X_train)
x_train = sequence.pad_sequences(X_train_sequence, maxlen=maxlen)

X_test_sequence = tokenizer.texts_to_sequences(X_test)
x_test = sequence.pad_sequences(X_test_sequence, maxlen=maxlen)

# Number of distinct tokens seen while fitting.
print(len(tokenizer.word_index))

## Load the precomputed embedding matrix

In [None]:
# Pre-built embedding weights stored as a .npy array.
embedding_path = "../input/precomputed-embedding-matrix/embedding_matrix_50000_301.npy"
embedding_matrix = np.load(embedding_path)

In [None]:
from keras.callbacks import EarlyStopping, ModelCheckpoint#, LearningRateScheduler
from keras.layers import GRU, BatchNormalization, Conv1D, MaxPooling1D
from keras.layers import Bidirectional

K.clear_session()  # reset Keras backend state before a model is built


def get_model(clipvalue=0.5, dropout=0.3, embed_size=301):
    """Build and compile the two-branch bidirectional-GRU + Conv1D classifier.

    Reads the notebook-level globals ``maxlen``, ``max_features`` and
    ``embedding_matrix``.

    Args:
        clipvalue: gradient clipping value passed to the Adam optimizer.
        dropout: rate for the SpatialDropout1D layer applied to the embeddings.
        embed_size: embedding dimension; must match ``embedding_matrix``.

    Returns:
        A compiled Keras ``Model`` with ``maxlen`` integer inputs and six
        sigmoid outputs, trained with binary cross-entropy.

    Raises:
        ValueError: if ``embedding_matrix`` is not shaped
            ``(max_features, embed_size)``.
    """
    expected_shape = (max_features, embed_size)
    if embedding_matrix.shape != expected_shape:
        raise ValueError(
            "embedding_matrix has shape {} but the Embedding layer needs {}".format(
                embedding_matrix.shape, expected_shape))

    inp = Input(shape=(maxlen, ))

    # Embedding lookup initialised from the precomputed matrix and kept frozen.
    embedding = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(inp)

    # Spatial dropout over the embedded sequence (rate = `dropout`).
    x = SpatialDropout1D(dropout)(embedding)

    # Branch 1: bidirectional GRU (40 units per direction) -> Conv1D (60 filters, width 3).
    gru_1 = Bidirectional(GRU(units=40, return_sequences = True, recurrent_dropout = 0.1))(x)
    conv_1 = Conv1D(60, kernel_size=3, padding='valid', kernel_initializer='normal', activation='relu')(gru_1)

    # Branch 2: runs in parallel on the same tensor `x` (not stacked on branch 1):
    # bidirectional GRU (80 units per direction) -> Conv1D (120 filters, width 2).
    gru_2 = Bidirectional(GRU(units=80, return_sequences = True, recurrent_dropout = 0.1))(x)
    conv_2 = Conv1D(120, kernel_size=2, padding='valid', kernel_initializer='normal', activation='relu')(gru_2)

    # Global average and max pooling over time for each branch.
    avg_pool_1 = GlobalAveragePooling1D()(conv_1)
    max_pool_1 = GlobalMaxPooling1D()(conv_1)

    avg_pool_2 = GlobalAveragePooling1D()(conv_2)
    max_pool_2 = GlobalMaxPooling1D()(conv_2)

    # Merge the four pooled vectors into one feature vector.
    x = concatenate([avg_pool_1, max_pool_1, avg_pool_2, max_pool_2])

    # One independent sigmoid per label.
    x = Dense(6, activation = "sigmoid")(x)
    model = Model(inputs = inp, outputs = x)

    # Compile. `Adam` is the canonical class name; the lowercase `adam` alias
    # is not available in newer Keras releases.
    adam = optimizers.Adam(clipvalue=clipvalue)
    model.compile(loss = "binary_crossentropy", optimizer = adam, metrics = ["accuracy"])
    return model

In [None]:
# `get_model` starts out as the factory function and is rebound here to the
# compiled model, because the training and prediction cells call
# get_model.fit / get_model.predict.
# The guard keeps this cell re-runnable: on a second run the name already holds
# the model, and calling a Model instance with no arguments would raise.
# To get a fresh, untrained model, re-run the cell that defines get_model first.
if not isinstance(get_model, Model):
    get_model = get_model()
get_model.summary()

In [None]:
file_path = "best_model.hdf5"

# Both callbacks watch "val_loss", which Keras only reports when fit() is given
# validation data. Without it the checkpoint is never written and early
# stopping has nothing to monitor.
check_point = ModelCheckpoint(file_path, monitor = "val_loss", verbose = 1,
                              save_best_only = True, mode = "min")
# With patience = 5 and epochs = 4 early stopping cannot trigger; it only
# becomes active if the epoch count is raised.
early_stop = EarlyStopping(monitor = "val_loss", mode = "min", patience = 5)
# RocAucEvaluation(validation_data=(X_valid, Y_valid), interval=1) can be added
# to the callbacks once an explicit hold-out set (X_valid, Y_valid) exists.

# Fraction of the training rows held out by Keras for validation.
validation_fraction = 0.1

get_model.fit(x_train, y_train, batch_size = 512, epochs = 4,
              verbose = 1, validation_split = validation_fraction,
              callbacks = [check_point, early_stop])

In [None]:
# Score the padded test sequences with the trained model.
proba = get_model.predict(
    x_test,
    batch_size=512,
    verbose=1,
)

In [None]:
# Create submission file: one row per test id, one column per label.
label_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

output = pd.DataFrame(data=proba, index=test["id"])
output.to_csv(
    "global_average_pooling.csv",
    header=label_names,
    index=True,
)