In [None]:
import sys
!{sys.executable} -m pip install --upgrade keras

In [1]:
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import *
from keras.layers import concatenate, CuDNNGRU
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

import sys

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [6]:
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints


class Attention(Layer):
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """
        Keras Layer that implements an Attention mechanism for temporal data.
        Supports Masking.
        Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]
        # Input shape
            3D tensor with shape: `(samples, steps, features)`.
        # Output shape
            2D tensor with shape: `(samples, features)`.
        :param kwargs:
        Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
        The dimensions are inferred based on the output shape of the RNN.
        Example:
            model.add(LSTM(64, return_sequences=True))
            model.add(Attention())
        """
        self.supports_masking = True
        #self.init = initializations.get('glorot_uniform')
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        # eij = K.dot(x, self.W) TF backend doesn't support it

        # features_dim = self.W.shape[0]
        # step_dim = x._keras_shape[1]

        features_dim = self.features_dim
        step_dim = self.step_dim

        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
    #print weigthted_input.shape
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        #return input_shape[0], input_shape[-1]
        return input_shape[0],  self.features_dim
        
# path = '..'
# EMBEDDING_FILE=path+'/input/glove.840B.300d.txt'
# EMBEDDING_FILE=path+'/input/crawl-300d-2M.vec'
# TRAIN_DATA_FILE=path+'/input/train.csv'
# TEST_DATA_FILE=path+'/input/test.csv'

EMBEDDING_FILE='/public/models/fasttext/crawl-300d-2M.vec'
# EMBEDDING_FILE='/public/models/glove/glove.840B.300d.txt'
TRAIN_DATA_FILE='/public/toxic_comments/train.csv'
TEST_DATA_FILE='/public/toxic_comments/test.csv'

MAX_SEQUENCE_LENGTH = 150
MAX_NB_WORDS = 100000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.1

num_lstm = 300
num_dense = 256
rate_drop_lstm = 0.2
rate_drop_dense = 0.2

act = 'relu'

In [3]:
def cleanData(text, stemming = False, lemmatize=False):    
    text = text.lower().split()
    text = " ".join(text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+\-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    if stemming:
        st = PorterStemmer()
        txt = " ".join([st.stem(w) for w in text.split()])
    if lemmatize:
        wordnet_lemmatizer = WordNetLemmatizer()
        txt = " ".join([wordnet_lemmatizer.lemmatize(w) for w in text.split()])
    return text


cols_f = ['count_sent', 'count_word', 'count_unique_word', 'count_letters', 'count_punctuations', 
          'count_stopwords', 'mean_word_len', 'word_unique_percent', 'punct_percent', 'num_exclamation_marks'
          , 'num_question_marks', 'you_count']
cols_mm = ['count_sent', 'count_word', 'count_unique_word', 'count_letters', 'count_punctuations', 
           'count_stopwords', 'mean_word_len', 'word_unique_percent', 'punct_percent', 'num_exclamation_marks'
           ,'num_question_marks', 'you_count']

def get_features(df):
    df['count_sent']=df["comment_text"].apply(lambda x: len(re.findall("\n",str(x)))+1)
    df['count_word']=df["comment_text"].apply(lambda x: len(str(x).split()))
    df['count_unique_word']=df["comment_text"].apply(lambda x: len(set(str(x).split())))
    df['count_letters']=df["comment_text"].apply(lambda x: len(str(x)))
    df["count_punctuations"] =df["comment_text"].apply(lambda x: len([c for c in str(x) if c in 
                                                                      string.punctuation]))
    df["count_stopwords"] = df["comment_text"].apply(lambda x: len([w for w in str(x).lower().split() if w in stopwords]))
    df["mean_word_len"] = df["comment_text"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
    df['word_unique_percent']=df['count_unique_word']*100/df['count_word']
    df['punct_percent']=df['count_punctuations']*100/df['count_word']
    df['num_exclamation_marks'] = df['comment_text'].apply(lambda comment: comment.count('!'))
    df['num_question_marks'] = df['comment_text'].apply(lambda comment: comment.count('?'))
    df['you_count'] = df['comment_text'].apply(lambda comment: sum(comment.count(w) for w in ('you', 'You', 'YOU')))
    scaler = MinMaxScaler().fit(df[cols_mm])
    df[cols_mm] = scaler.transform(df[cols_mm])
    return df

train_df = get_features(train)
test_df = get_features(test)
train_df.head()

In [4]:
print('Indexing word vectors')

count = 0
embeddings_index = {}
f = open(EMBEDDING_FILE)
for line in f:
    values = line.split()
    word = ' '.join(values[:-300])
    coefs = np.asarray(values[-300:], dtype='float32')
    embeddings_index[word] = coefs.reshape(-1)
    coef = embeddings_index[word]
f.close()

print('Found %d word vectors of glove.' % len(embeddings_index))
emb_mean,emb_std = coef.mean(), coef.std()
print(emb_mean,emb_std)

print('Total %s word vectors.' % len(embeddings_index))

train_df = pd.read_csv(TRAIN_DATA_FILE)
test_df = pd.read_csv(TEST_DATA_FILE)

print('Processing text dataset')

train_df['comment_text'] = train_df['comment_text'].map(lambda x: cleanData(x,  stemming = False, 
                                                                            lemmatize=False))
test_df['comment_text'] = test_df['comment_text'].map(lambda x: cleanData(x,  stemming = False, 
                                                                          lemmatize=False))

#Regex to remove all Non-Alpha Numeric and space
special_character_removal=re.compile(r'[^a-z\d ]',re.IGNORECASE)
#regex to replace all numerics
replace_numbers=re.compile(r'\d+',re.IGNORECASE)

def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)
    
    #Remove Special Characters
    text=special_character_removal.sub('',text)
    #Replace Numbers
    text=replace_numbers.sub('n',text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    return(text)


list_sentences_train = train_df["comment_text"].fillna("NA").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train_df[list_classes].values
list_sentences_test = test_df["comment_text"].fillna("NA").values


comments = []
for text in list_sentences_train:
    comments.append(text_to_wordlist(text))
    
test_comments=[]
for text in list_sentences_test:
    test_comments.append(text_to_wordlist(text))

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(comments + test_comments)

sequences = tokenizer.texts_to_sequences(comments)
test_sequences = tokenizer.texts_to_sequences(test_comments)

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', y.shape)

test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of test_data tensor:', test_data.shape)

Indexing word vectors
Found 2195895 word vectors of glove.
-0.01444638 0.47249147
Total 2195895 word vectors.
Processing text dataset
Found 292462 unique tokens
Shape of data tensor: (159571, 100)
Shape of label tensor: (159571, 6)
Shape of test_data tensor: (153164, 100)


In [None]:
data_post = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH,padding='post', truncating='post')
print('Shape of data tensor:', data_post.shape)
print('Shape of label tensor:', y.shape)

test_data_post = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
print('Shape of test_data tensor:', test_data_post.shape)

In [5]:
print('Preparing embedding matrix')
nb_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Preparing embedding matrix
Null word embeddings: 21603


In [10]:
from keras.optimizers import Adam

adam_opt = Adam(lr=1e-3, decay=0.001)

def get_model():
    comment_input = Input(shape=(MAX_SEQUENCE_LENGTH,))
    comment_input_post = Input(shape=(MAX_SEQUENCE_LENGTH,))

    x1 = Embedding(nb_words, EMBEDDING_DIM, weights=[embedding_matrix], 
                  input_length=MAX_SEQUENCE_LENGTH)(comment_input)
    x1 = SpatialDropout1D(0.4)(x1)
    x1 = Bidirectional(CuDNNGRU(80, return_sequences=True))(x1)
    avg_pool1 = GlobalAveragePooling1D()(x1)
    max_pool1 = GlobalMaxPooling1D()(x1)
    
    x2 = Embedding(nb_words, EMBEDDING_DIM, weights=[embedding_matrix], 
                  input_length=MAX_SEQUENCE_LENGTH)(comment_input_post)
    x2 = SpatialDropout1D(0.4)(x2)
    x2 = Bidirectional(CuDNNGRU(80, return_sequences=True))(x2)
    avg_pool2 = GlobalAveragePooling1D()(x2)
    max_pool2 = GlobalMaxPooling1D()(x2)

    conc = concatenate([avg_pool1, max_pool1, avg_pool2, max_pool2])
    conc = Dropout(0.2)(conc)
    preds = Dense(6, activation="sigmoid")(conc)
    model = Model(inputs=[comment_input, comment_input_post], outputs=preds)
    model.compile(loss='binary_crossentropy', optimizer=adam_opt, metrics=['accuracy'])
    return model


from sklearn.metrics import log_loss
import numpy as np

test_predicts_list = []

def train_folds(data, data_post, y, fold_count, batch_size):
    print("Starting to train models...")
    fold_size = len(data) // fold_count
    models = []
    for fold_id in range(0, fold_count):
        fold_start = fold_size * fold_id
        fold_end = fold_start + fold_size

        if fold_id == fold_size - 1:
            fold_end = len(data)

        print("Fold {0}".format(fold_id))
        
        train_x = np.concatenate([data[:fold_start], data[fold_end:]])
        train_xp = np.concatenate([data_post[:fold_start], data_post[fold_end:]])
        train_y = np.concatenate([y[:fold_start], y[fold_end:]])

        val_x = data[fold_start:fold_end]
        val_xp = data_post[fold_start:fold_end]
        val_y = y[fold_start:fold_end]
        
        file_path="attngru_pp_fold{0}.h5".format(fold_id)
        model = get_model()
        checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
        early = EarlyStopping(monitor="val_loss", mode="min", patience=1)
        callbacks_list = [checkpoint, early] 

        hist = model.fit([train_x, train_xp], train_y, epochs=10, batch_size=256, shuffle=True, 
                         validation_data=([val_x, val_xp], val_y), callbacks = callbacks_list, verbose=1)
        model.load_weights(file_path)
        best_score = min(hist.history['val_loss'])
        
        print("Fold {0} loss {1}".format(fold_id, best_score))
        print("Predicting results...")
        test_predicts_path = "attngru_pp_test_predicts{0}.npy".format(fold_id)
        test_predicts = model.predict([test_data, test_data_post], batch_size=1024, verbose=1)
        test_predicts_list.append(test_predicts)
        np.save(test_predicts_path, test_predicts)

In [11]:
train_folds(data, data_post, y, 10, 256)

Starting to train models...
Train on 143614 samples, validate on 15957 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Fold 0 loss 0.0416301521112232
Predicting results...
Train on 143614 samples, validate on 15957 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Fold 1 loss 0.04376676583336605
Predicting results...
Train on 143614 samples, validate on 15957 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Fold 2 loss 0.04299884445056204
Predicting results...
Train on 143614 samples, validate on 15957 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Fold 3 loss 0.045493070253055407
Predicting results...
Train on 143614 samples, validate on 15957 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Fold 4 loss 0.04150328691704437
Predicting results...
Train on 143614 samples, validate on 15957 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10


Epoch 4/10
Epoch 5/10
Fold 5 loss 0.042830429837680095
Predicting results...
Train on 143614 samples, validate on 15957 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Fold 6 loss 0.041635003312791866
Predicting results...
Train on 143614 samples, validate on 15957 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Fold 7 loss 0.04449247287668859
Predicting results...
Train on 143614 samples, validate on 15957 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Fold 8 loss 0.04192517572922736
Predicting results...
Train on 143614 samples, validate on 15957 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


In [12]:
CLASSES = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

print(len(test_predicts_list))
test_predicts_am = np.zeros(test_predicts_list[0].shape)

for fold_predict in test_predicts_list:
    test_predicts_am += fold_predict

test_predicts_am = (test_predicts_am / len(test_predicts_list))

test_ids = test_df["id"].values
test_ids = test_ids.reshape((len(test_ids), 1))

test_predicts_am = pd.DataFrame(data=test_predicts_am, columns=CLASSES)
test_predicts_am["id"] = test_ids
test_predicts_am = test_predicts_am[["id"] + CLASSES]
test_predicts_am.to_csv("10fold_attngru_am.csv", index=False)

10


NameError: name 'CLASSES' is not defined

In [None]:
test_predicts = np.ones(test_predicts_list[0].shape)

for fold_predict in test_predicts_list:
    test_predicts *= fold_predict

test_predicts **= (1. / len(test_predicts_list))

test_ids = test_df["id"].values
test_ids = test_ids.reshape((len(test_ids), 1))

test_predicts = pd.DataFrame(data=test_predicts, columns=CLASSES)
test_predicts["id"] = test_ids
test_predicts = test_predicts[["id"] + CLASSES]
test_predicts.to_csv("10fold_attngru_gm.csv", index=False)